In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
import os
import joblib

## Load main DF and create a list of all courses

In [2]:
quota_path = "../backend/data/quota.csv"

In [3]:
df = pd.read_csv(quota_path)

df['date'] = pd.to_datetime(df['date'], format="ISO8601")

courses_list = df['initials'].unique().tolist()
print(f"Amount of total courses: {len(courses_list)}.")
print(f"Sample: {courses_list[:5]}")


  df = pd.read_csv(quota_path)


Amount of total courses: 8449.
Sample: ['IIQ3643', 'IIQ3663', 'IIQ3683', 'IIQ3733', 'IIQ3763']


## Create dataframes for each inscription period

In [4]:
date_ranges = [
    ('2021-1', '2021-01-22', '2021-01-30'),
    ('2021-2', '2021-08-02', '2021-08-13'),
    ('2021-3', '2021-12-28', '2021-12-28'),
    ('2022-1', '2022-01-06', '2022-01-18'),
    ('2022-2', '2022-07-25', '2022-08-05'),
    ('2022-3', '2022-12-20', '2022-12-20'),
    ('2023-1', '2023-01-05', '2023-01-17'),
    ('2023-2', '2023-07-21', '2023-08-04')
]

dfs_dict = {}

# Iterate through the date ranges and extract the corresponding data
for name, start_date, end_date in date_ranges:
    mask = (df['date'] >= start_date) & (df['date'] <= end_date) & (df['category'] == 'Vacantes libres')
    dfs_dict[name] = df[mask].copy()

## For each of the inscription periods, perform a filter of min timesteps

In [5]:
min_timesteps = 15

In [6]:
courses_timesteps = {}

for df_name, df_val in dfs_dict.items():
    print(f"Working on inscription period {df_name} (", end="")
    
    grouped = df_val.groupby("section_id")
    filtered_df = grouped.filter(lambda group: len(group) > min_timesteps)
    print(f"{filtered_df['section_id'].nunique()} elements after filter)")

    for initials, group in filtered_df.groupby(['initials', 'section_id']):
        initials, section_id = initials
        date_quota_list = list(zip(group['date'], group['quota']))
        
        # Check if the 'initials' key already exists in the dictionary
        if initials not in courses_timesteps:
            courses_timesteps[initials] = {section_id: date_quota_list}
        else:
            # Check if the 'section_id' key already exists for the 'initials'
            if section_id not in courses_timesteps[initials]:
                courses_timesteps[initials][section_id] = date_quota_list
            else:
                courses_timesteps[initials][section_id].extend(date_quota_list)


Working on inscription period 2021-1 (5203 elements after filter)
Working on inscription period 2021-2 (5244 elements after filter)
Working on inscription period 2021-3 (0 elements after filter)
Working on inscription period 2022-1 (5036 elements after filter)
Working on inscription period 2022-2 (0 elements after filter)
Working on inscription period 2022-3 (0 elements after filter)
Working on inscription period 2023-1 (5135 elements after filter)
Working on inscription period 2023-2 (0 elements after filter)


In [7]:
courses_timesteps['ACO256E'][19123][0]

(Timestamp('2021-01-22 05:30:01.446136+0000', tz='UTC'), 17.0)

In [8]:
for course in list(courses_timesteps.keys()):
    for nrc in list(courses_timesteps[course].keys()):
        if sum([x for _, x in courses_timesteps[course][nrc]]) == 0:
            del courses_timesteps[course][nrc]
            continue
        
        base_timestamp = pd.Timestamp(courses_timesteps[course][nrc][0][0].date())

        for ts_id in range(len(courses_timesteps[course][nrc])):
            courses_timesteps[course][nrc][ts_id] =((courses_timesteps[course][nrc][ts_id][0].replace(tzinfo=None) - base_timestamp).to_pytimedelta(), courses_timesteps[course][nrc][ts_id][1])
        

In [9]:
type(courses_timesteps['ACO256E'][19123][0][0])

datetime.timedelta

## Perform Regression (2nd grade)

#### Flatten courses to a simple list

In [10]:
flat_courses = {}

for course, course_data in list(courses_timesteps.items()):
    for nrc, timestamp_value in course_data.items():
        if course in flat_courses:
            flat_courses[course].extend(list(timestamp_value))
        else:
            flat_courses[course] = list(timestamp_value)


#### Perform regression

In [11]:
def build_model(x_values_normalized, y_values_normalized):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(7, activation='relu', input_shape=(1,)))
    model.add(tf.keras.layers.Dense(7, activation='relu'))
    model.add(tf.keras.layers.Dense(1))
        
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Define EarlyStopping callback
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='loss',  # Monitor validation loss
        patience=20,          # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
    )

    # Train the model with early stopping
    history = model.fit(
        x_values_normalized, y_values_normalized,
        epochs=1000,
        callbacks=[early_stopping],  # Pass the EarlyStopping callback
        verbose=0
    )
    
    return model, history

In [26]:
save_loc_template = "../backend/models/model_{course}.keras"
save_loc_template_scalerx = "../backend/models/scaler_x_{course}.pkl"
save_loc_template_scalery = "../backend/models/scaler_y_{course}.pkl"

print(save_loc_template.format(course="test"))
print(save_loc_template_scalerx.format(course="test"))
print(save_loc_template_scalery.format(course="test"))

../backend/models/model_test.keras
../backend/models/scaler_x_test.pkl
../backend/models/scaler_y_test.pkl


In [31]:
# data = flat_courses['ACO256E']

for idx, (course_name, data) in enumerate(list(flat_courses.items())):

    print(f"Training {course_name}. ({idx}/{len(flat_courses.keys())})", end="")

    course_name_parsed = course_name.strip()

    save_loc = save_loc_template.format(course=course_name_parsed)
    save_loc_scalerx = save_loc_template_scalerx.format(course=course_name_parsed)
    save_loc_scalery = save_loc_template_scalery.format(course=course_name_parsed)

    print(save_loc)

    #if os.path.exists(save_loc):
    #    print(" Already trained, skipping...")
    #    continue
        
    print("")
    
    x_values = [x.total_seconds() / 86400.0 for x, y in data]  # Convert timedelta to days (86400 seconds per day)
    y_values = [y for x, y in data]
    
    x_values = np.array(x_values)
    y_values = np.array(y_values)
    
    # Normalize x and y
    scaler_x = StandardScaler()
    scaler_y = StandardScaler()
    
    x_values_normalized = scaler_x.fit_transform(x_values.reshape(-1, 1))
    y_values_normalized = scaler_y.fit_transform(y_values.reshape(-1, 1))

    joblib.dump(scaler_x, save_loc_scalerx)
    joblib.dump(scaler_y, save_loc_scalery)
    
    # Create a single plot for all combinations
    
    # model, hist = build_model(x_values_normalized, y_values_normalized)

    #if hist.history['loss'][-1] > 0.1:
    #    print(f"Loss too big {course_name}: {hist.history['loss'][-1]}")
    #    continue
    
    # Inverse transform the normalized predictions to get the actual predictions in the original scale
    # predictions = scaler_y.inverse_transform(predictions_normalized)

    #model.save(save_loc)
    
    # Plot the predictions for the current combination with a different color
    # plt.plot(x_new, predictions, label=f'Neurons: (1x7x7x1)')
    
    # plt.scatter(x_values, y_values, label='Data Points', color='black', marker='o')
    # plt.xlabel('Time (days)')
    # plt.ylabel('Y Values')
    # plt.legend()
    # plt.grid(True)
    # plt.show()

Training ACO256E. (0/2124)../backend/models/model_ACO256E.keras

Training ACT114. (1/2124)../backend/models/model_ACT114.keras

Training ACT115. (2/2124)../backend/models/model_ACT115.keras

Training ACT1307. (3/2124)../backend/models/model_ACT1307.keras

Training ACT1311. (4/2124)../backend/models/model_ACT1311.keras

Training ACT1314. (5/2124)../backend/models/model_ACT1314.keras

Training ACT1315. (6/2124)../backend/models/model_ACT1315.keras

Training ADU4049. (7/2124)../backend/models/model_ADU4049.keras

Training ADU4102. (8/2124)../backend/models/model_ADU4102.keras

Training ADU4108. (9/2124)../backend/models/model_ADU4108.keras

Training ADU4123. (10/2124)../backend/models/model_ADU4123.keras

Training AGC204. (11/2124)../backend/models/model_AGC204.keras

Training AGC206. (12/2124)../backend/models/model_AGC206.keras

Training AGC220. (13/2124)../backend/models/model_AGC220.keras

Training AGC261. (14/2124)../backend/models/model_AGC261.keras

Training AGC3002. (15/2124)../ba

In [14]:
hist.history['loss'][-1]

NameError: name 'hist' is not defined

# Inference

In [38]:
# Load the saved model
inf_course = "ACO256E"

# Values for which you want to make predictions
values_to_predict = [0.2, 0.4, 1.1, 1.5, 1.5, 1.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]


predictions = load_and_predict(inf_course, values_to_predict)


# Print the predictions
for i in range(len(values_to_predict)):
    print(f"Input: {values_to_predict[i]}, Predicted: {predictions[i]}")

Input: 0.2, Predicted: 18.223651885986328
Input: 0.4, Predicted: 17.338207244873047
Input: 1.1, Predicted: 10.50616455078125
Input: 1.5, Predicted: 6.211840629577637
Input: 1.5, Predicted: 6.211840629577637
Input: 1.5, Predicted: 6.211840629577637
Input: 1.5, Predicted: 6.211840629577637
Input: 2.5, Predicted: 0.5795793533325195
Input: 3.5, Predicted: -0.2550211250782013
Input: 4.5, Predicted: 0.08752582967281342
Input: 5.5, Predicted: 0.43007275462150574
Input: 6.5, Predicted: 0.46399471163749695
Input: 7.5, Predicted: 0.36841675639152527
Input: 8.5, Predicted: 0.2728376090526581


In [51]:


save_loc_template = "../backend/models/model_{course}.keras"

for course in courses_to_predict:
    save_loc = save_loc_template.format(course=course)

    if not os.path.exists(save_loc):
        print(f"Course {course} doesn't exist")

In [61]:

sections = {
    '1': (1.1, ),
    '2': (0.3, 1.5, 5.4, 6.7),
    '3': (),
    '4': (),
    '5': (),
    '6': (),
    '7': (),
    '8': (),
    '9': (),
    '10': (),
    '11': (),
    '12': (),
    '13': (),
    '14': (),
    '15': (),
    '16': ()
}


In [66]:
section = '2'

time_data = sections[section]

course_scores = {}

for course in courses_to_predict:
    predictions = load_and_predict(course, time_data)
    course_scores[course] = predictions



In [67]:
course_scores

{'FIS1514': [1.9992809, 2.000297, 1.6604093, 0.8939156],
 'IIC2133': [21.750856, 21.365196, 0.81372815, 0.6081818],
 'IBM2101': [12.403321, 12.403321, 4.725217, 1.8451596],
 'IBM2992': [7.990554, 8.001373, 8.010752, 7.9937167],
 'EYP1025': [5.0001054, 4.999946, 5.000029, 4.990941],
 'IMT2565': [25.305004, 24.912561, 21.020332, 19.995024]}

In [84]:
def calculate_score(course_scores, config):
    score = 0
    for idx, course in enumerate(config):
        if idx < 3:
            score += course_scores[course][0]
        else:
            score += course_scores[course][1]
        # To-Do: Usar los reajustes
        
    return score

def find_best_order(course_scores, current_config = None):

    if current_config is None:
        current_config = []

    if len(course_scores.keys()) == len(current_config):
        sssscore = calculate_score(course_scores, current_config)
        #print(f"Reached max with score {sssscore} and config: {current_config}")
        return current_config, sssscore
    
    best_score = -9999
    best_config = current_config
    
    for course in course_scores.keys():
        if course in current_config:
            continue
            
        my_config = current_config.copy()
        my_config.append(course)
        new_config, new_score = find_best_order(course_scores, my_config)
        if new_score > best_score:
            best_config = new_config
            best_score = new_score

    return best_config, best_score
        

In [87]:
save_loc_template = "../backend/models/model_{course}.keras"

sections = {#(1ra_vuelta, 2da_vuelta, 1er_reajuste, 2do_reajuste)
    '1': ((8/24), (8/24)+5, (18.5/24)+12, (8/24)+14),
    '2': ((9.5/24)+3, (9.5/24)+5, (17/24)+12, (8/24)+14),
    '3': ((11/24)+3, (11/24)+5, (15.5/24)+12, (9.5/24)+14),
    '4': ((12.5/24)+3, (12.5/24)+5, (14/24)+12, (9.5/24)+14),
    '5': ((14/24)+3, (14/24)+5, (12.5/24)+12, (11/24)+14), #14:00
    '6': ((15.5/24)+3,(15.5/24)+5, (11/24)+12, (11/24)+14),
    '7': ((17/24)+3, (17/24)+5, (9.5/24)+12, (12.5/24)+14),
    '8': ((18.5/24)+3, (18.5/24)+5, (8/24)+12, (12.5/24)+14),
    '9': ((8/24)+4, (8/24)+6, (18.5/24)+11, (14/24)+14),
    '10': ((9.5/24)+4, (9.5/24)+6, (17/24)+11, (14/24)+14),
    '11': ((11/24)+4, (11/24)+6, (15.5/24)+11, (15.5/24)+14),
    '12': ((12.5/24)+4, (12.5/24)+6, (14/24)+11, (15.5/24)+14),
    '13': ((14/24)+4, (14/24)+6, (12.5/24)+11, (17/24)+14),
    '14': ((15.5/24)+4, (15.5/24)+6, (11/24)+11, (17/24)+14),
    '15': ((17/24)+4, (17/24)+6, (9.5/24)+11, (18.5/24)+14),
    '16': ((18.5/24)+4, (18.5/24)+6, (8/24)+11, (18.5/24)+14)
}


def get_best_order(courses, section: int):
    for course in courses_to_predict:
        save_loc = save_loc_template.format(course=course)

        if not os.path.exists(save_loc):
            raise ValueError(f"Course {course} doesn't exist in our database yet.")

    if section < 1 or section > 16:
        raise ValueError(f"Invalid section: {section}")

    section = str(section)
    time_data = sections[section]
    
    course_scores = {}
    
    for course in courses_to_predict:
        predictions = load_and_predict(course, time_data)
        course_scores[course] = predictions

    return find_best_order(course_scores)

In [88]:
courses_to_predict = ['IIC2133', 'IBM2101', 'IBM2992', 'EYP1025', 'IMT2565', 'FIS1514']

get_best_order(courses_to_predict, section=2)[0]



['IIC2133', 'IBM2101', 'IMT2565', 'IBM2992', 'EYP1025', 'FIS1514']

In [None]:
# Generate linearly spaced values
values_to_predict = np.linspace(start_value, end_value, num_values)

In [17]:
x_values_normalized

array([[-1.40314654],
       [-1.22514691],
       [-1.17471061],
       [-1.14045686],
       [-1.10505019],
       [-1.07101482],
       [-1.01914686],
       [-0.82391987],
       [-0.74776449],
       [-0.71289386],
       [-0.67846584],
       [-0.6437503 ],
       [-0.60919318],
       [ 0.42464998],
       [ 0.5962136 ],
       [ 0.60620998],
       [ 0.82877541],
       [ 0.85477511],
       [ 0.88092433],
       [ 0.9066775 ],
       [ 0.93284899],
       [ 0.95869645],
       [ 0.98469059],
       [ 1.01899861],
       [ 1.47362463],
       [ 1.88757513]])

In [18]:
x_values

array([0.13659963, 0.56500924, 0.68639934, 0.76884129, 0.85405806,
       0.93597441, 1.06081025, 1.53068266, 1.71397346, 1.7979001 ,
       1.88076147, 1.96431486, 2.04748693, 4.53574113, 4.94866052,
       4.97271982, 5.5083904 , 5.57096648, 5.63390243, 5.69588518,
       5.75887472, 5.8210844 , 5.88364711, 5.96621966, 7.06041377,
       8.05671002])

In [19]:
scaler_x

In [22]:
scaler_x.scale_

array([0.52440442])

In [4]:
directory_path = '../backend/models/'  # Replace with the actual path to your "tmp" directory

# Use glob to find files starting with "model_"
file_list = glob.glob(f'{directory_path}/model_*')

# Print the list of matching files
for file_path in file_list:
    file_name = os.path.basename(file_path)
    print(file_name[6:-6])

EAA326C
IEE3724
SOL166
DER501L
EDU375F
TBI012
QPG3220
EST006
DER513R
AGF337
AST1016
AGR319
EPG3382
DPT5901
IIC3585
TBI013
ODO302A
DER611L
QPG3170
FIM3436
MED819
GEO303
EFS1030
GEO204
DPT5808
GOB1001
IIC3182
LET3502
MPG3325
IMT2112
MAT1224
ICP5703
BIO340D
IMT2115
LET247E
MAT1226
AGR311
SOL130
ADU4108
DER509S
LET002G
DER512F
LET122E
MCS3010
AGZ354
DPT6270
EYP2707
EST233C
AQH0200
AGF323
EYP2417
LET1024
SOL226
MAT2555
TPR306
EAM622
ICE2640
AGR3002
EPG3344
AGC3610
EAE285A
AGZ304
FIL2005
EST3120
EST253E
IBM3103
EAM471
FIM3120
DER506S
EPG4100
IIC3143
MPT3045
TSL587
FAR4005
IBM2992
MPG5000
IMI3700
ANT230
AGZ320
AGF322
ICM2028
ANT404R
EDU3899
ICH2103
ICP0204
EAM446
AGC3779
EDU4033
MDO4507
FIM4545
LET100E
MPT3026
DER514F
AGC3668
MPG3225
MEB174
AQI0106
ICP0202
ASP9301
QIM100
MAT380I
IHI0506
MAT2095
EAM524
AGR3019
ACO4001
AGE3009
AGP4015
DER505C
ARO107E
GEO3086
ICP0720
AGZ3424
MPG4300
IMM2103
DER513L
EDU0277
TTF023
AGZ3418
IIQ3652
AGP3134
IEU2051
DER505R
MUC745
MAT1306
MAT2225
ASP9303
TSL589
LET44