In [1]:
import os
import sys
# Put the parent directory on the import path (once) so that packages located
# next to this notebook's folder can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import json
import numpy as np
from scripts.load_data import *

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

In [3]:
# Read the project configuration, then load the three spreadsheets whose
# locations are listed under its "paths" section.
configuration = load_configuration("../configuration/configuration.json")
paths = configuration["paths"]

data_student_numbers_f, data_student_numbers_h, data_student_numbers_v = (
    pd.read_excel(paths[path_key])
    for path_key in (
        "path_student_count_first-years",
        "path_student_count_higher-years",
        "path_student_volume",
    )
)

In [4]:
# Join first-year and higher-year counts on programme, origin and academic year,
# then give the two count columns explicit names instead of the _x/_y merge suffixes.
combined_data_student_numbers = (
    data_student_numbers_f
    .merge(data_student_numbers_h, on=["Croho groepeernaam", "Herkomst", "Collegejaar"])
    .rename(columns={"Aantal_studenten_x": "Aantal_studenten_f", "Aantal_studenten_y": "Aantal_studenten_h"})
)
combined_data_student_numbers

Unnamed: 0,Collegejaar,Croho groepeernaam,Herkomst,Aantal_studenten_f,Aantal_studenten_h
0,2012,M Geneeskunde,NL,180,868
1,2013,M Geneeskunde,NL,173,892
2,2014,M Geneeskunde,NL,175,916
3,2015,M Geneeskunde,NL,21,1086
4,2016,M Geneeskunde,NL,148,984
...,...,...,...,...,...
2176,2016,M Leraar Voorbereidend Hoger Onderwijs in Gods...,NL,1,1
2177,2014,M Leraar Voorbereidend Hoger Onderwijs in Natu...,NL,3,1
2178,2015,M Leraar Voorbereidend Hoger Onderwijs in Natu...,NL,1,1
2179,2017,M Leraar Voorbereidend Hoger Onderwijs in Natu...,NL,2,1


In [5]:
# Spot check: show the merged rows of a single programme.
combined_data_student_numbers.loc[combined_data_student_numbers["Croho groepeernaam"].eq("B Sociologie")]

Unnamed: 0,Collegejaar,Croho groepeernaam,Herkomst,Aantal_studenten_f,Aantal_studenten_h
1839,2012,B Sociologie,NL,49,76
1840,2013,B Sociologie,NL,51,88
1841,2014,B Sociologie,NL,38,108
1842,2015,B Sociologie,NL,32,105
1843,2016,B Sociologie,NL,44,95
1844,2017,B Sociologie,NL,31,98
1845,2018,B Sociologie,NL,43,89
1846,2019,B Sociologie,NL,48,99
1847,2020,B Sociologie,NL,63,99
1848,2021,B Sociologie,NL,86,111


In [6]:
# Notebook parameters: the academic year to predict, plus a programme / origin /
# exam-type selection.
predict_year, programme, origin, examtype = 2023, "B Sociologie", "NL", "Bac"

In [7]:
data = combined_data_student_numbers

# Training set: every row before the prediction year, with exact duplicate rows removed.
train = data.loc[data["Collegejaar"] < predict_year].drop_duplicates()

# Evaluation set: all rows of the prediction year (every programme and origin,
# not restricted to a single programme/origin selection).
test = data.loc[data["Collegejaar"] == predict_year]

In [8]:
# Separate features and target WITHOUT mutating `train`.
# The previous `train.pop(...)` removed the target column from `train` in place,
# so running this cell a second time raised a KeyError and left `train` without
# its target for any later cell. Plain column selection keeps the cell idempotent.
X_train = train.drop(columns=['Aantal_studenten_h'])
y_train = train['Aantal_studenten_h']

In [9]:
# Give the evaluation rows a fixed order (programme, then origin); the predictions
# made from `test` are later matched back to its rows by position.
test = test.sort_values(["Croho groepeernaam", "Herkomst"], ascending=True)

In [10]:
# Feature encoding: numeric columns are passed through untouched, categorical
# columns are one-hot encoded. Columns not listed here (e.g. the target that is
# still present in `test`) are dropped by the ColumnTransformer default.
numeric_cols = ['Collegejaar', 'Aantal_studenten_f']
categorical_cols = ['Croho groepeernaam', 'Herkomst']

numeric_transformer = "passthrough"  # No transformation for numeric columns
# handle_unknown='ignore': a category not seen during fit is encoded as all zeros
# instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_cols),
        ('categorical', categorical_transformer, categorical_cols)
    ])

# Fit on the feature columns of `train` instead of on `X_train` itself.
# Previously this cell overwrote `X_train` with its own encoded output, so a
# second run handed a matrix (no column names) to a transformer that selects
# columns by name and failed. `train` holds the same rows and feature columns,
# so the encoded result is unchanged while the cell becomes re-runnable.
X_train = preprocessor.fit_transform(train[numeric_cols + categorical_cols])
preprocessed_test = preprocessor.transform(test)

In [11]:
# Sanity check: the encoded matrix must have one row per evaluation row.
print(preprocessed_test.shape, len(test), sep="\n")

(184, 123)
184


In [12]:
# Train a gradient-boosted regressor on the encoded training data.
# NOTE(review): the learning rate looks like the outcome of a tuning run - confirm its origin.
model = XGBRegressor(learning_rate=0.5301857237491415).fit(X_train, y_train)

# One prediction per row of the (sorted) evaluation set.
predictie = model.predict(preprocessed_test)

print(predictie, len(predictie), sep="\n")

[ 1.74556808e+01  7.87479248e+01  7.54550648e+00  8.53702850e+01
  4.24321869e+02  3.79704666e+01  4.66843529e+01  1.03795898e+03
  2.31126308e+01  2.19896851e+02  1.22567451e+00  8.92513885e+01
  3.84758942e+02  9.57721233e+00  1.97820023e+02  1.47732992e+01
  7.52086563e+01  5.37330532e+00  4.44352074e+01  2.57732422e+02
  9.15403843e+00  2.60088104e+02  9.17752075e+01  1.98235825e+02
  3.23124466e+01  1.42888580e+02  2.83129768e+01  3.33025551e+01
  3.23153137e+02  1.59814043e+01  1.22074680e+01  1.39136719e+02
  3.44570160e+00  4.31895065e+00  1.21288658e+02  4.34033699e+01
  2.41289551e+02  9.24123096e+00  8.65229034e+00  8.27553833e+02
  8.37690163e+00  3.84616364e+02  5.65618706e+00  1.78440208e+01
  3.12039001e+02  7.20487547e+00  4.56097794e+01  8.08649731e+00
  5.49731522e+01  2.39084663e+01  2.00622604e+02  1.06193275e+01
  6.52622795e+00  2.46823990e+02  5.85099525e+01  8.54417877e+01
  2.13084045e+02  6.63019043e+02  4.44488106e+01  2.68779278e+01
  2.75257416e+02  2.89272

In [18]:
# Score the predictions for `predict_year` against the observed higher-year
# counts: print the absolute and relative error per programme/origin row,
# followed by the aggregate MAE and MAPE.
#
# `predictie` is matched to `test` purely by position, so both must have the
# same length (and `test` must not have been re-ordered since predicting).
if len(predictie) != len(test):
    raise ValueError(
        f"Expected one prediction per test row, got {len(predictie)} predictions for {len(test)} rows"
    )

# Loop-invariant: only rows of the prediction year are ever looked up.
observed_higher_years = data_student_numbers_h[data_student_numbers_h["Collegejaar"] == predict_year]

# Predictions below this value are left out of the MAPE average.
# NOTE(review): the cut-off is applied to the *predicted* value, not the actual
# one - confirm that this is intended.
mape_min_prediction = 10

total_mae = 0.0
total_mape = 0.0
i = 0  # number of evaluated rows
count_mape = 0.0  # number of rows contributing to the MAPE average
for (_, row), predicted in zip(test.iterrows(), predictie):
    # Row-local names: the notebook-level `programme` / `origin` parameters
    # must not be overwritten by this loop.
    row_programme = row["Croho groepeernaam"]
    row_origin = row["Herkomst"]
    higher_year_row = observed_higher_years[(observed_higher_years["Croho groepeernaam"] == row_programme) &
                                            (observed_higher_years["Herkomst"] == row_origin)]
    if higher_year_row.empty:
        raise LookupError(
            f"No higher-year count found for {row_programme}, {row_origin} in {predict_year}"
        )
    actual = higher_year_row["Aantal_studenten"].iloc[0]

    mae = abs(actual - predicted)
    # The relative error is undefined for an actual count of zero; report NaN
    # and keep such rows out of the MAPE average instead of adding inf to it.
    mape = abs((actual - predicted) / actual) if actual != 0 else float("nan")

    print(f"{row_programme}, {row_origin}: {mae} & {mape} => {predicted} & {actual}")

    if predicted >= mape_min_prediction and actual != 0:
        total_mape += mape
        count_mape += 1.0

    total_mae += mae
    i += 1

# Guard the averages so an empty evaluation set (or no MAPE-eligible rows)
# reports NaN instead of raising ZeroDivisionError.
final_mae = total_mae / i if i else float("nan")
final_mape = total_mape / count_mape if count_mape else float("nan")

print(f"Final MAE: {final_mae}")
print(f"Final MAPE: {final_mape}")

print(f"Total MAE: {total_mae}")
print(f"Total MAPE: {total_mape}")
print(f"Count: {i} & {count_mape}")

B Algemene Cultuurwetenschappen, EER: 18.54431915283203 & 0.5151199764675565 => 17.45568084716797 & 36
B Algemene Cultuurwetenschappen, NL: 1.7479248046875 & 0.0227003221387987 => 78.7479248046875 & 77
B Algemene Cultuurwetenschappen, Niet-EER: 1.454493522644043 & 0.16161039140489367 => 7.545506477355957 & 9
B Artificial Intelligence, EER: 1.3702850341796875 & 0.016312917073567707 => 85.37028503417969 & 84
B Artificial Intelligence, NL: 149.32186889648438 & 0.5429886141690341 => 424.3218688964844 & 275
B Artificial Intelligence, Niet-EER: 21.97046661376953 & 1.3731541633605957 => 37.97046661376953 & 16
B Bedrijfskunde, EER: 33.31564712524414 & 0.41644558906555174 => 46.68435287475586 & 80
B Bedrijfskunde, NL: 22.958984375 & 0.022619689039408867 => 1037.958984375 & 1015
B Bedrijfskunde, Niet-EER: 5.112630844116211 & 0.28403504689534503 => 23.11263084411621 & 18
B Bestuurskunde, NL: 25.8968505859375 & 0.1334889205460696 => 219.8968505859375 & 194
B Bestuurskunde, Niet-EER: 0.225674510002

In [14]:
# Display the evaluation rows of the prediction year (sorted by programme and origin).
test

Unnamed: 0,Collegejaar,Croho groepeernaam,Herkomst,Aantal_studenten_f,Aantal_studenten_h
1257,2023,B Algemene Cultuurwetenschappen,EER,14,36
1245,2023,B Algemene Cultuurwetenschappen,NL,29,77
1265,2023,B Algemene Cultuurwetenschappen,Niet-EER,4,9
586,2023,B Artificial Intelligence,EER,47,84
574,2023,B Artificial Intelligence,NL,153,275
...,...,...,...,...,...
422,2023,M Taalwetenschappen,Niet-EER,2,1
347,2023,M Tandheelkunde,EER,1,6
336,2023,M Tandheelkunde,NL,63,156
1864,2023,M Theologie,NL,3,5
