In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
import os

2023-10-22 01:24:59.025414: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-22 01:24:59.047200: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load main DF and create a list of all courses

In [1]:
quota_path = "../backend/data/quota.csv"

In [3]:
df = pd.read_csv(quota_path)

df['date'] = pd.to_datetime(df['date'], format="ISO8601")

courses_list = df['initials'].unique().tolist()
print(f"Amount of total courses: {len(courses_list)}.")
print(f"Sample: {courses_list[:5]}")


  df = pd.read_csv(quota_path)


Amount of total courses: 8449.
Sample: ['IIQ3643', 'IIQ3663', 'IIQ3683', 'IIQ3733', 'IIQ3763']


## Create dataframes for each inscription period

In [4]:
date_ranges = [
    ('2021-1', '2021-01-22', '2021-01-30'),
    ('2021-2', '2021-08-02', '2021-08-13'),
    ('2021-3', '2021-12-28', '2021-12-28'),
    ('2022-1', '2022-01-06', '2022-01-18'),
    ('2022-2', '2022-07-25', '2022-08-05'),
    ('2022-3', '2022-12-20', '2022-12-20'),
    ('2023-1', '2023-01-05', '2023-01-17'),
    ('2023-2', '2023-07-21', '2023-08-04')
]

dfs_dict = {}

# Iterate through the date ranges and extract the corresponding data
for name, start_date, end_date in date_ranges:
    mask = (df['date'] >= start_date) & (df['date'] <= end_date) & (df['category'] == 'Vacantes libres')
    dfs_dict[name] = df[mask].copy()

## For each of the inscription periods, perform a filter of min timesteps

In [5]:
min_timesteps = 15

In [6]:
courses_timesteps = {}

for df_name, df_val in dfs_dict.items():
    print(f"Working on inscription period {df_name} (", end="")
    
    grouped = df_val.groupby("section_id")
    filtered_df = grouped.filter(lambda group: len(group) > min_timesteps)
    print(f"{filtered_df['section_id'].nunique()} elements after filter)")

    for initials, group in filtered_df.groupby(['initials', 'section_id']):
        initials, section_id = initials
        date_quota_list = list(zip(group['date'], group['quota']))
        
        # Check if the 'initials' key already exists in the dictionary
        if initials not in courses_timesteps:
            courses_timesteps[initials] = {section_id: date_quota_list}
        else:
            # Check if the 'section_id' key already exists for the 'initials'
            if section_id not in courses_timesteps[initials]:
                courses_timesteps[initials][section_id] = date_quota_list
            else:
                courses_timesteps[initials][section_id].extend(date_quota_list)


Working on inscription period 2021-1 (5203 elements after filter)
Working on inscription period 2021-2 (5244 elements after filter)
Working on inscription period 2021-3 (0 elements after filter)
Working on inscription period 2022-1 (5036 elements after filter)
Working on inscription period 2022-2 (0 elements after filter)
Working on inscription period 2022-3 (0 elements after filter)
Working on inscription period 2023-1 (5135 elements after filter)
Working on inscription period 2023-2 (0 elements after filter)


In [7]:
courses_timesteps['ACO256E'][19123][0]

(Timestamp('2021-01-22 05:30:01.446136+0000', tz='UTC'), 17.0)

In [8]:
for course in list(courses_timesteps.keys()):
    for nrc in list(courses_timesteps[course].keys()):
        if sum([x for _, x in courses_timesteps[course][nrc]]) == 0:
            del courses_timesteps[course][nrc]
            continue
        
        base_timestamp = pd.Timestamp(courses_timesteps[course][nrc][0][0].date())

        for ts_id in range(len(courses_timesteps[course][nrc])):
            courses_timesteps[course][nrc][ts_id] =((courses_timesteps[course][nrc][ts_id][0].replace(tzinfo=None) - base_timestamp).to_pytimedelta(), courses_timesteps[course][nrc][ts_id][1])
        

In [9]:
type(courses_timesteps['ACO256E'][19123][0][0])

datetime.timedelta

## Perform Regression (2nd grade)

#### Flatten courses to a simple list

In [10]:
flat_courses = {}

for course, course_data in list(courses_timesteps.items()):
    for nrc, timestamp_value in course_data.items():
        if course in flat_courses:
            flat_courses[course].extend(list(timestamp_value))
        else:
            flat_courses[course] = list(timestamp_value)


#### Perform regression

In [17]:
def build_model(x_values_normalized, y_values_normalized):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(7, activation='relu', input_shape=(1,)))
    model.add(tf.keras.layers.Dense(7, activation='relu'))
    model.add(tf.keras.layers.Dense(1))
        
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Define EarlyStopping callback
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='loss',  # Monitor validation loss
        patience=20,          # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
    )

    # Train the model with early stopping
    history = model.fit(
        x_values_normalized, y_values_normalized,
        epochs=10,
        callbacks=[early_stopping],  # Pass the EarlyStopping callback
        verbose=0
    )
    
    return model, history

In [18]:
save_loc_template = "../backend/models/model_{course}.keras"

print(save_loc_template.format(course="test"))

../backend/models/model_test.keras


In [None]:
# data = flat_courses['ACO256E']

for idx, (course_name, data) in enumerate(flat_courses.items()):

    print(f"Training {course_name}. ({idx}/{len(flat_courses.keys())})", end="")

    course_name_parsed = course_name.strip()

    save_loc = save_loc_template.format(course=course_name_parsed)

    print(save_loc)

    if os.path.exists(save_loc):
        print(" Already trained, skipping...")
        continue
        
    print("")
    
    x_values = [x.total_seconds() / 86400.0 for x, y in data]  # Convert timedelta to days (86400 seconds per day)
    y_values = [y for x, y in data]
    
    x_values = np.array(x_values)
    y_values = np.array(y_values)
    
    # Normalize x and y
    scaler_x = StandardScaler()
    scaler_y = StandardScaler()
    
    x_values_normalized = scaler_x.fit_transform(x_values.reshape(-1, 1))
    y_values_normalized = scaler_y.fit_transform(y_values.reshape(-1, 1))
    
    # Create a single plot for all combinations
    # plt.figure(figsize=(10, 6))
    
    model, hist = build_model(x_values_normalized, y_values_normalized)

    if hist.history['loss'][-1] > 0.1:
        print(f"Loss too big {course_name}: {hist.history['loss'][-1]}")
        continue
    
    # Generate equally spaced x values for the plot
    # x_new = np.linspace(min(x_values), max(x_values), 500)
    # x_new_normalized = scaler_x.transform(x_new.reshape(-1, 1))
    
    # Make predictions for the new x values
    #predictions_normalized = model.predict(x_new_normalized)
    
    # Inverse transform the normalized predictions to get the actual predictions in the original scale
    # predictions = scaler_y.inverse_transform(predictions_normalized)

    model.save(save_loc)
    
    # Plot the predictions for the current combination with a different color
    # plt.plot(x_new, predictions, label=f'Neurons: (1x7x7x1)')
    
    # plt.scatter(x_values, y_values, label='Data Points', color='black', marker='o')
    # plt.xlabel('Time (days)')
    # plt.ylabel('Y Values')
    # plt.legend()
    # plt.grid(True)
    # plt.show()

Training ACO256E. (0/2124)../backend/models/model_ACO256E.keras

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 

In [237]:
hist.history['loss'][-1]

0.06424558907747269