In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import json
import numpy as np
import itertools
from scripts.load_data import *
import optuna

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
from pandas import read_csv
import math
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
configuration = load_configuration("../configuration/configuration.json")

paths = configuration["paths"]
data_student_numbers_f = pd.read_excel(paths["path_student_count_first-years"])
data_student_numbers_h = pd.read_excel(paths["path_student_count_higher-years"])
data_student_numbers_v = pd.read_excel(paths["path_student_volume"])

In [4]:
combined_data_student_numbers = data_student_numbers_f.merge(data_student_numbers_h, on=["Croho groepeernaam", "Herkomst", "Collegejaar"])
combined_data_student_numbers = combined_data_student_numbers.rename(columns={"Aantal_studenten_x": "Aantal_studenten_f", "Aantal_studenten_y": "Aantal_studenten_h"})
combined_data_student_numbers

Unnamed: 0,Collegejaar,Croho groepeernaam,Herkomst,Aantal_studenten_f,Aantal_studenten_h
0,2012,M Geneeskunde,NL,180,868
1,2013,M Geneeskunde,NL,173,892
2,2014,M Geneeskunde,NL,175,916
3,2015,M Geneeskunde,NL,21,1086
4,2016,M Geneeskunde,NL,148,984
...,...,...,...,...,...
2176,2016,M Leraar Voorbereidend Hoger Onderwijs in Gods...,NL,1,1
2177,2014,M Leraar Voorbereidend Hoger Onderwijs in Natu...,NL,3,1
2178,2015,M Leraar Voorbereidend Hoger Onderwijs in Natu...,NL,1,1
2179,2017,M Leraar Voorbereidend Hoger Onderwijs in Natu...,NL,2,1


In [5]:
combined_data_student_numbers[combined_data_student_numbers["Croho groepeernaam"] == "B Sociologie"]

Unnamed: 0,Collegejaar,Croho groepeernaam,Herkomst,Aantal_studenten_f,Aantal_studenten_h
1839,2012,B Sociologie,NL,49,76
1840,2013,B Sociologie,NL,51,88
1841,2014,B Sociologie,NL,38,108
1842,2015,B Sociologie,NL,32,105
1843,2016,B Sociologie,NL,44,95
1844,2017,B Sociologie,NL,31,98
1845,2018,B Sociologie,NL,43,89
1846,2019,B Sociologie,NL,48,99
1847,2020,B Sociologie,NL,63,99
1848,2021,B Sociologie,NL,86,111


In [6]:
predict_year = 2023

all_unique_years = np.sort(combined_data_student_numbers["Collegejaar"].unique())

In [7]:
def objective(trial):
    look_back = trial.suggest_int("look_back", 1, 10)
    nr_of_epochs = trial.suggest_int("nr_of_epochs", 3, 40)

    total_mae = 0.0
    count = 0.0

    for programme, origin in itertools.product(np.sort(combined_data_student_numbers["Croho groepeernaam"].unique()), np.sort(combined_data_student_numbers["Herkomst"].unique())):
        data = combined_data_student_numbers
        data = data.sort_values(by=["Collegejaar"])
        train = data[(data["Collegejaar"] < predict_year) & (data['Croho groepeernaam'] == programme) & (data["Herkomst"] == origin)]
        true_label = data[(data["Collegejaar"] == predict_year) & (data['Croho groepeernaam'] == programme) & (data["Herkomst"] == origin)]
        if len(true_label) == 0 or len(train) == 0:
            continue

        train_dataset = []
        for year in all_unique_years:
            year_train = train[train["Collegejaar"] == year]
            if len(year_train) == 0:
                train_dataset.append([0])
            else:
                train_dataset.append([year_train["Aantal_studenten_h"].iloc[0]])

        def create_dataset(dataset, look_back=1):
            dataX, dataY = [], []
            for i in range(len(dataset)-look_back-1):
                a = dataset[i:(i+look_back), 0]
                dataX.append(a)
                dataY.append(dataset[i + look_back, 0])
            return np.array(dataX), np.array(dataY)

        scaler = MinMaxScaler(feature_range=(0, 1))
        train_dataset = scaler.fit_transform(train_dataset)

        X_train, Y_train = create_dataset(train_dataset, look_back)

        trainX = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))

        model = Sequential()
        model.add(Input(shape=(1, look_back)))
        model.add(LSTM(units=4))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(1, activation='linear'))

        model.compile(loss='mean_squared_error', optimizer='adam')
        model.fit(trainX, Y_train, epochs=nr_of_epochs, batch_size=1, verbose=0)

        test = train_dataset[-look_back:]

        reshaped_test = np.reshape(test, (1, 1, look_back))

        prediction = model.predict(reshaped_test, verbose=0)
        prediction = scaler.inverse_transform(prediction)
        
        mae = abs(prediction - true_label["Aantal_studenten_h"].iloc[0])
        print(f"{programme}, {origin}: {prediction} => {mae}")

        total_mae += mae
        count += 1.0

    final_mae = total_mae / count
    print(f"Final MAE: {final_mae}")
    print(final_mae[0][0])

In [8]:
study = optuna.create_study()
study.optimize(objective, n_trials=20)

print(study.best_params)

# MAE: 27.77, look_back: 6 en epochs: 35

[I 2024-05-14 11:10:04,031] A new study created in memory with name: no-name-c45d5571-b3ef-4e18-8f4e-e721222a5e8d


B Algemene Cultuurwetenschappen, EER: [[3.9627576]] => [[32.037243]]
B Algemene Cultuurwetenschappen, NL: [[23.762774]] => [[53.23723]]
B Algemene Cultuurwetenschappen, Niet-EER: [[1.6670116]] => [[7.3329883]]
B Artificial Intelligence, EER: [[11.7804365]] => [[72.21957]]
B Artificial Intelligence, NL: [[49.950603]] => [[225.0494]]
B Artificial Intelligence, Niet-EER: [[2.4955137]] => [[13.504486]]
B Bedrijfskunde, EER: [[9.170171]] => [[70.82983]]
B Bedrijfskunde, NL: [[160.03485]] => [[854.96515]]
B Bedrijfskunde, Niet-EER: [[3.0797994]] => [[14.9202]]
B Bestuurskunde, NL: [[63.527252]] => [[130.47275]]
B Biology, EER: [[21.560995]] => [[102.439]]
B Biology, NL: [[120.33598]] => [[120.66402]]
B Biology, Niet-EER: [[2.2835546]] => [[20.716446]]
B Biomedische Wetenschappen, NL: [[44.56723]] => [[149.43277]]
B Chemistry, EER: [[2.6607108]] => [[22.339289]]
B Chemistry, NL: [[8.780792]] => [[68.21921]]
B Chemistry, Niet-EER: [[0.6869769]] => [[9.313023]]
B Communicatie- en Informatiewete

[W 2024-05-14 11:21:08,130] Trial 0 failed with parameters: {'look_back': 1, 'nr_of_epochs': 7} because of the following error: The value None could not be cast to float..
[W 2024-05-14 11:21:08,132] Trial 0 failed with value None.


M Theologie & Religiewetenschappen, NL: [[20.899323]] => [[120.10068]]
Final MAE: [[76.73262]]
76.73262
B Algemene Cultuurwetenschappen, EER: [[13.335342]] => [[22.664658]]
B Algemene Cultuurwetenschappen, NL: [[58.379265]] => [[18.620735]]
B Algemene Cultuurwetenschappen, Niet-EER: [[6.429933]] => [[2.570067]]
B Artificial Intelligence, EER: [[46.25935]] => [[37.74065]]
B Artificial Intelligence, NL: [[194.89386]] => [[80.10614]]
B Artificial Intelligence, Niet-EER: [[6.9728317]] => [[9.027168]]
B Bedrijfskunde, EER: [[27.798317]] => [[52.201683]]
B Bedrijfskunde, NL: [[530.9948]] => [[484.0052]]
B Bedrijfskunde, Niet-EER: [[13.975779]] => [[4.0242214]]
B Bestuurskunde, NL: [[161.26099]] => [[32.739014]]
B Biology, EER: [[58.17598]] => [[65.82402]]
B Biology, NL: [[229.94653]] => [[11.053467]]
B Biology, Niet-EER: [[12.264069]] => [[10.735931]]
B Biomedische Wetenschappen, NL: [[141.55263]] => [[52.447372]]
B Chemistry, EER: [[9.477796]] => [[15.522204]]
B Chemistry, NL: [[86.35307]] 

[W 2024-05-14 11:36:09,902] Trial 1 failed with parameters: {'look_back': 2, 'nr_of_epochs': 40} because of the following error: The value None could not be cast to float..
[W 2024-05-14 11:36:09,902] Trial 1 failed with value None.


M Theologie & Religiewetenschappen, NL: [[45.034718]] => [[95.96529]]
Final MAE: [[30.024712]]
30.024712
B Algemene Cultuurwetenschappen, EER: [[24.838947]] => [[11.161053]]
B Algemene Cultuurwetenschappen, NL: [[71.13737]] => [[5.8626328]]
B Algemene Cultuurwetenschappen, Niet-EER: [[6.929493]] => [[2.070507]]
B Artificial Intelligence, EER: [[8.863233]] => [[75.136765]]
B Artificial Intelligence, NL: [[286.04248]] => [[11.04248]]
B Artificial Intelligence, Niet-EER: [[11.876474]] => [[4.1235256]]
B Bedrijfskunde, EER: [[41.594852]] => [[38.405148]]
B Bedrijfskunde, NL: [[869.09515]] => [[145.90485]]
B Bedrijfskunde, Niet-EER: [[13.753535]] => [[4.2464647]]
B Bestuurskunde, NL: [[176.67906]] => [[17.320938]]
B Biology, EER: [[25.56623]] => [[98.43377]]
B Biology, NL: [[202.92265]] => [[38.077347]]
B Biology, Niet-EER: [[6.202424]] => [[16.797577]]
B Biomedische Wetenschappen, NL: [[141.70593]] => [[52.294067]]
B Chemistry, EER: [[12.579385]] => [[12.420615]]
B Chemistry, NL: [[55.2253

[W 2024-05-14 11:47:35,663] Trial 2 failed with parameters: {'look_back': 6, 'nr_of_epochs': 25} because of the following error: The value None could not be cast to float..
[W 2024-05-14 11:47:35,664] Trial 2 failed with value None.


M Theologie & Religiewetenschappen, NL: [[26.91066]] => [[114.08934]]
Final MAE: [[32.2958]]
32.2958
B Algemene Cultuurwetenschappen, EER: [[9.050674]] => [[26.949326]]
B Algemene Cultuurwetenschappen, NL: [[52.157158]] => [[24.842842]]
B Algemene Cultuurwetenschappen, Niet-EER: [[2.2157857]] => [[6.784214]]
B Artificial Intelligence, EER: [[67.650154]] => [[16.349846]]
B Artificial Intelligence, NL: [[246.36658]] => [[28.633423]]
B Artificial Intelligence, Niet-EER: [[12.09456]] => [[3.9054403]]
B Bedrijfskunde, EER: [[19.113785]] => [[60.886215]]
B Bedrijfskunde, NL: [[131.47365]] => [[883.52637]]
B Bedrijfskunde, Niet-EER: [[18.949327]] => [[0.94932747]]
B Bestuurskunde, NL: [[260.47]] => [[66.47]]
B Biology, EER: [[40.887333]] => [[83.11267]]
B Biology, NL: [[231.16536]] => [[9.8346405]]
B Biology, Niet-EER: [[7.952628]] => [[15.047372]]
B Biomedische Wetenschappen, NL: [[182.8302]] => [[11.1698]]
B Chemistry, EER: [[3.7017515]] => [[21.298248]]
B Chemistry, NL: [[59.05855]] => [[1

[W 2024-05-14 11:58:30,467] Trial 3 failed with parameters: {'look_back': 8, 'nr_of_epochs': 36} because of the following error: The value None could not be cast to float..
[W 2024-05-14 11:58:30,482] Trial 3 failed with value None.


M Theologie & Religiewetenschappen, NL: [[83.84917]] => [[57.150833]]
Final MAE: [[42.850437]]
42.850437
B Algemene Cultuurwetenschappen, EER: [[5.5107155]] => [[30.489285]]
B Algemene Cultuurwetenschappen, NL: [[25.376623]] => [[51.623375]]
B Algemene Cultuurwetenschappen, Niet-EER: [[3.4976304]] => [[5.50237]]
B Artificial Intelligence, EER: [[5.111751]] => [[78.88825]]
B Artificial Intelligence, NL: [[43.675945]] => [[231.32405]]
B Artificial Intelligence, Niet-EER: [[16.044313]] => [[0.04431343]]
B Bedrijfskunde, EER: [[21.685724]] => [[58.314278]]
B Bedrijfskunde, NL: [[378.88544]] => [[636.11456]]
B Bedrijfskunde, Niet-EER: [[11.046712]] => [[6.953288]]
B Bestuurskunde, NL: [[49.07719]] => [[144.9228]]
B Biology, EER: [[10.796428]] => [[113.203575]]
B Biology, NL: [[227.57605]] => [[13.42395]]
B Biology, Niet-EER: [[8.642786]] => [[14.357214]]
B Biomedische Wetenschappen, NL: [[93.24949]] => [[100.75051]]
B Chemistry, EER: [[3.9249988]] => [[21.075]]
B Chemistry, NL: [[69.84944]]

[W 2024-05-14 12:09:29,324] Trial 4 failed with parameters: {'look_back': 9, 'nr_of_epochs': 38} because of the following error: The value None could not be cast to float..
[W 2024-05-14 12:09:29,324] Trial 4 failed with value None.


M Theologie & Religiewetenschappen, NL: [[113.7574]] => [[27.2426]]
Final MAE: [[52.24965]]
52.24965
B Algemene Cultuurwetenschappen, EER: [[5.2837467]] => [[30.716253]]
B Algemene Cultuurwetenschappen, NL: [[60.936905]] => [[16.063095]]
B Algemene Cultuurwetenschappen, Niet-EER: [[1.4723067]] => [[7.5276933]]
B Artificial Intelligence, EER: [[13.869466]] => [[70.13053]]
B Artificial Intelligence, NL: [[70.912285]] => [[204.0877]]
B Artificial Intelligence, Niet-EER: [[3.6902182]] => [[12.309782]]
B Bedrijfskunde, EER: [[11.915547]] => [[68.08445]]
B Bedrijfskunde, NL: [[771.3584]] => [[243.6416]]
B Bedrijfskunde, Niet-EER: [[4.273185]] => [[13.726815]]
B Bestuurskunde, NL: [[57.156948]] => [[136.84305]]
B Biology, EER: [[19.25552]] => [[104.74448]]
B Biology, NL: [[104.11204]] => [[136.88797]]
B Biology, Niet-EER: [[4.934241]] => [[18.06576]]
B Biomedische Wetenschappen, NL: [[128.99745]] => [[65.00255]]
B Chemistry, EER: [[5.9191356]] => [[19.080864]]
B Chemistry, NL: [[74.16663]] =>

[W 2024-05-14 12:18:28,915] Trial 5 failed with parameters: {'look_back': 8, 'nr_of_epochs': 17} because of the following error: The value None could not be cast to float..
[W 2024-05-14 12:18:28,915] Trial 5 failed with value None.


M Theologie & Religiewetenschappen, NL: [[59.260098]] => [[81.7399]]
Final MAE: [[66.92527]]
66.92527
B Algemene Cultuurwetenschappen, EER: [[27.545233]] => [[8.454767]]
B Algemene Cultuurwetenschappen, NL: [[63.60535]] => [[13.3946495]]
B Algemene Cultuurwetenschappen, Niet-EER: [[7.1843705]] => [[1.8156295]]
B Artificial Intelligence, EER: [[56.555767]] => [[27.444233]]
B Artificial Intelligence, NL: [[307.75327]] => [[32.753265]]
B Artificial Intelligence, Niet-EER: [[17.847946]] => [[1.8479462]]
B Bedrijfskunde, EER: [[40.135483]] => [[39.864517]]
B Bedrijfskunde, NL: [[778.4985]] => [[236.50153]]
B Bedrijfskunde, Niet-EER: [[20.056814]] => [[2.0568142]]
B Bestuurskunde, NL: [[131.6701]] => [[62.329895]]
B Biology, EER: [[14.131929]] => [[109.86807]]
B Biology, NL: [[304.88852]] => [[63.88852]]
B Biology, Niet-EER: [[5.058627]] => [[17.941372]]
B Biomedische Wetenschappen, NL: [[185.3428]] => [[8.657196]]
B Chemistry, EER: [[16.329363]] => [[8.670637]]
B Chemistry, NL: [[70.7186]] 

[W 2024-05-14 12:29:38,555] Trial 6 failed with parameters: {'look_back': 6, 'nr_of_epochs': 35} because of the following error: The value None could not be cast to float..
[W 2024-05-14 12:29:38,555] Trial 6 failed with value None.


M Theologie & Religiewetenschappen, NL: [[123.83129]] => [[17.168709]]
Final MAE: [[27.767878]]
27.767878
B Algemene Cultuurwetenschappen, EER: [[8.559854]] => [[27.440147]]
B Algemene Cultuurwetenschappen, NL: [[12.608907]] => [[64.39109]]
B Algemene Cultuurwetenschappen, Niet-EER: [[0.8501573]] => [[8.149842]]
B Artificial Intelligence, EER: [[9.214594]] => [[74.78541]]
B Artificial Intelligence, NL: [[64.07275]] => [[210.92725]]
B Artificial Intelligence, Niet-EER: [[6.1387296]] => [[9.861271]]
B Bedrijfskunde, EER: [[3.533026]] => [[76.46697]]
B Bedrijfskunde, NL: [[48.542076]] => [[966.45795]]
B Bedrijfskunde, Niet-EER: [[2.7734835]] => [[15.226517]]
B Bestuurskunde, NL: [[36.574196]] => [[157.42581]]
B Biology, EER: [[4.4928875]] => [[119.50711]]
B Biology, NL: [[36.483932]] => [[204.51607]]
B Biology, Niet-EER: [[1.7614069]] => [[21.238594]]
B Biomedische Wetenschappen, NL: [[4.116093]] => [[189.88391]]
B Chemistry, EER: [[-1.2829472]] => [[26.282948]]
B Chemistry, NL: [[35.9614

[W 2024-05-14 12:38:36,899] Trial 7 failed with parameters: {'look_back': 4, 'nr_of_epochs': 6} because of the following error: The value None could not be cast to float..
[W 2024-05-14 12:38:36,899] Trial 7 failed with value None.


M Theologie & Religiewetenschappen, NL: [[17.240732]] => [[123.75927]]
Final MAE: [[74.349884]]
74.349884
B Algemene Cultuurwetenschappen, EER: [[0.9020811]] => [[35.09792]]
B Algemene Cultuurwetenschappen, NL: [[34.83618]] => [[42.16382]]
B Algemene Cultuurwetenschappen, Niet-EER: [[5.24397]] => [[3.75603]]
B Artificial Intelligence, EER: [[47.54152]] => [[36.45848]]
B Artificial Intelligence, NL: [[84.81932]] => [[190.18068]]
B Artificial Intelligence, Niet-EER: [[4.2974544]] => [[11.702545]]
B Bedrijfskunde, EER: [[10.413194]] => [[69.58681]]
B Bedrijfskunde, NL: [[191.2193]] => [[823.7807]]
B Bedrijfskunde, Niet-EER: [[2.406589]] => [[15.5934105]]
B Bestuurskunde, NL: [[60.72]] => [[133.28]]
B Biology, EER: [[17.544827]] => [[106.45517]]
B Biology, NL: [[169.74658]] => [[71.25342]]
B Biology, Niet-EER: [[7.1703577]] => [[15.829642]]
B Biomedische Wetenschappen, NL: [[53.838707]] => [[140.16129]]
B Chemistry, EER: [[4.20773]] => [[20.79227]]
B Chemistry, NL: [[53.034466]] => [[23.96

[W 2024-05-14 12:43:30,055] Trial 8 failed with parameters: {'look_back': 9, 'nr_of_epochs': 31} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\jjble\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\jjble\AppData\Local\Temp\ipykernel_15516\3314709244.py", line 46, in objective
    model.fit(trainX, Y_train, epochs=nr_of_epochs, batch_size=1, verbose=0)
  File "c:\Users\jjble\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jjble\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 325, in fit
    logs = self.train_function(iterator)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jjble\anaconda3\Lib\site-packages\tensorflow\python\util\traceback_utils.py", line 15

KeyboardInterrupt: 