In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from scripts.load_data import *
from scripts.availabledata import *
from scripts.individual import *
from scripts.cumulative import *
from scripts.bothdatasets import *
from scripts.dataoption import *
from scripts.higher_years import *

In [3]:
configuration = load_configuration("../configuration/configuration.json")

data_individual, data_cumulative, data_student_numbers_first_years, data_student_numbers_higher_years, data_student_numbers_volume, data_latest, data_distances, ensemble_weights = load_data(configuration, StudentYearPrediction.FIRST_YEARS)

dataholder = Cumulative(data_cumulative, data_student_numbers_first_years, configuration)

data_cumulative = dataholder.preprocess()

numerus_fixus_list = configuration["numerus_fixus"]

  data_cumulative = pd.read_csv(paths["path_cumulative"], sep=';', skiprows=[1]) if (paths["path_cumulative"] != "" and os.path.exists(paths["path_cumulative"])) else None
  data[key] = data[key].str.replace('.', '')


In [4]:
predict_year = 2023
skip_years = 1
predict_week = 15

In [5]:
data_to_predict = data_cumulative[(data_cumulative["Collegejaar"] == predict_year - skip_years) &
                                               (data_cumulative["Weeknummer"] == predict_week)]

dataholder.prepare_data()

In [6]:
def predict_with_xgboost(data, programme, examtype, herkomst, predict_year, skip_years=0):
    try:
        # Train/test split
        if programme not in numerus_fixus_list:
            train = data[(data["Collegejaar"] < predict_year - skip_years) & (data["Examentype"] == examtype) & (
                ~data['Croho groepeernaam'].isin(numerus_fixus_list))]
        elif programme in numerus_fixus_list:
            train = data[(data["Collegejaar"] < predict_year - skip_years) & (data['Croho groepeernaam'] == programme)]

        test = data[(data["Collegejaar"] == predict_year) & (data['Croho groepeernaam'] == programme) & (data["Herkomst"] == herkomst)]

        if data_student_numbers_first_years is not None:
            train = train.merge(data_student_numbers_first_years[['Croho groepeernaam', 'Collegejaar', 'Herkomst', 'Aantal_studenten']],
                                on=['Croho groepeernaam', 'Collegejaar', 'Herkomst'])
        else:
            # Student count is required
            return np.nan

        train = train.drop_duplicates()

        X_train = train.drop(['Aantal_studenten'], axis=1)
        y_train = train.pop('Aantal_studenten')

        # Encode
        # Specify the numeric and categorical column names
        numeric_cols = ['Collegejaar'] + [str(x) for x in get_weeks_list(38)]
        categorical_cols = ['Examentype', 'Faculteit', 'Croho groepeernaam', 'Herkomst']

        # Create transformers for numeric and categorical columns
        numeric_transformer = "passthrough"  # No transformation for numeric columns
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        # Create the column transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', numeric_transformer, numeric_cols),
                ('categorical', categorical_transformer, categorical_cols)
            ])

        # Apply the preprocessing to the training and test data
        X_train = preprocessor.fit_transform(X_train)
        test = preprocessor.transform(test)
        # Model
        model = XGBRegressor(learning_rate=0.25)

        model.fit(X_train, y_train)

        predictie = model.predict(test)

        return int(round(predictie[0], 0))
    except ValueError:
        print(f"Cumulative xgboost error on: {programme}, {herkomst}")
        return np.nan

In [7]:
data_count = data_student_numbers_first_years.copy()

In [8]:
data = dataholder.data_cumulative.copy()

data = data.drop_duplicates()

data = data[data["Collegejaar"] >= 2016]

full_data = transform_data(data, 'ts')

total_mae = 0.0
total_mape = 0.0
count = 0.0

for _, row in data_to_predict.iterrows():
    programme = row['Croho groepeernaam']
    herkomst = row["Herkomst"]
    examtype = row["Examentype"]

    gc.collect()

    data = full_data.copy()

    data = data[data["Herkomst"] == herkomst]

    data = data[data["Collegejaar"] <= predict_year - skip_years]

    data = data[data['Croho groepeernaam'] == programme]

    if int(predict_week) > 38:
        pred_len = 38 + 52 - int(predict_week)
    else:
        pred_len = 38 - int(predict_week)

    previous_pred_len = pred_len
    pred_len += 52

    # Week 39 to 0
    data['39'] = 0
    full_data['39'] = 0

    def create_time_series(data: pd.DataFrame, pred_len: int) -> np.array:
        """
        Create a time series data array from a DataFrame for a given prediction length.

        Args:
            data (pd.DataFrame): The input DataFrame containing time series data.
            pred_len (int): The length of the time series to be created.

        Returns:
            np.ndarray: A NumPy array containing the time series data.
        """

        ts_data = data.loc[:, get_all_weeks_valid(data.columns)].values.flatten()
        ts_data = ts_data[:-pred_len]

        return np.array(ts_data)

    ts_data = create_time_series(data, pred_len)
    # if predict_week == 38:
    #     prediction = _predict_with_xgboost(full_data, programme, examtype, herkomst)
    #     return prediction

    try:
        model = sm.tsa.statespace.SARIMAX(ts_data, order=(1, 0, 1), seasonal_order=(1, 1, 1, 52))
        results = model.fit(disp=0)

        pred = results.forecast(steps=pred_len)

        index = str(increment_week(predict_week))

        full_data.loc[
        (full_data["Collegejaar"] == predict_year - skip_years) & (full_data['Croho groepeernaam'] == programme) & (full_data["Herkomst"] == herkomst),
        index:'38'] = pred[:previous_pred_len]

        full_data.loc[
        (full_data["Collegejaar"] == predict_year) & (full_data['Croho groepeernaam'] == programme) & (full_data["Herkomst"] == herkomst),
        '39':'38'] = pred[-52:]

        # prediction = _predict_with_xgboost(full_data, programme, examtype, herkomst)
        # return prediction, pred

        prediction_first_year = predict_with_xgboost(full_data, programme, examtype, herkomst, predict_year - skip_years)

        data_count.loc[(data_count["Collegejaar"] == predict_year - skip_years) &
                       (data_count["Croho groepeernaam"] == programme) &
                       (data_count["Herkomst"] == herkomst), "Aantal_studenten"] = prediction_first_year

        final_prediction = predict_with_xgboost(full_data, programme, examtype, herkomst, predict_year, skip_years)

        nr_of_students = data_student_numbers_first_years[(data_student_numbers_first_years["Collegejaar"] == predict_year) & (data_student_numbers_first_years["Croho groepeernaam"] == programme) & 
                                                          (data_student_numbers_first_years["Herkomst"] == herkomst)]["Aantal_studenten"]

        mae = "???"
        mape = "???"
        if len(nr_of_students) > 0:
            mae = abs(nr_of_students.iloc[0] - final_prediction)
            mape = abs((nr_of_students.iloc[0] - final_prediction) / nr_of_students.iloc[0])

            total_mae += mae
            total_mape += mape
            count += 1.0

        print(f"{programme}, {herkomst}, {final_prediction}, MAE: {mae}, MAPE: {mape}")

    except (LA.LinAlgError, IndexError, ValueError) as error:
        print(f"Cumulative sarima error on: {programme}, {herkomst}")
        print(error)

print(f"Final MAE: {total_mae / count}")
print(f"Final MAPE: {total_mape / count}")

full_data.to_excel("full_data.xlsx", index=False)
data_count.to_excel("data_count.xlsx", index=False)

B Algemene Cultuurwetenschappen, EER, 16, MAE: 2, MAPE: 0.14285714285714285
B Algemene Cultuurwetenschappen, NL, 30, MAE: 1, MAPE: 0.034482758620689655
B Algemene Cultuurwetenschappen, Niet-EER, 5, MAE: 1, MAPE: 0.25
B Artificial Intelligence, EER, 30, MAE: 17, MAPE: 0.3617021276595745
B Artificial Intelligence, NL, 120, MAE: 33, MAPE: 0.21568627450980393
B Artificial Intelligence, Niet-EER, 9, MAE: 6, MAPE: 0.4
B Artificial Intelligence, onbekend, 4, MAE: ???, MAPE: ???
B Bedrijfskunde, EER, 19, MAE: 10, MAPE: 0.3448275862068966
B Bedrijfskunde, NL, 366, MAE: 15, MAPE: 0.03937007874015748
B Bedrijfskunde, Niet-EER, 18, MAE: 7, MAPE: 0.6363636363636364
B Bestuurskunde, NL, 80, MAE: 3, MAPE: 0.03614457831325301
B Biology, EER, 160, MAE: 112, MAPE: 2.3333333333333335
B Biology, NL, 64, MAE: 26, MAPE: 0.28888888888888886
B Biology, Niet-EER, 29, MAE: 24, MAPE: 4.8
B Biomedische Wetenschappen, EER, 1, MAE: ???, MAPE: ???
B Biomedische Wetenschappen, NL, 98, MAE: 3, MAPE: 0.0315789473684210

In [9]:
total_mae = 0.0
total_mape = 0.0
count = 0.0

for _, row in data_to_predict.iterrows():
    programme = row['Croho groepeernaam']
    herkomst = row["Herkomst"]
    examtype = row["Examentype"]

    prediction = predict_with_xgboost(full_data, programme, examtype, herkomst, predict_year)

    nr_of_students = data_student_numbers_first_years[(data_student_numbers_first_years["Collegejaar"] == predict_year) & (data_student_numbers_first_years["Croho groepeernaam"] == programme) & 
                                                          (data_student_numbers_first_years["Herkomst"] == herkomst)]["Aantal_studenten"]

    mae = "???"
    mape = "???"
    if len(nr_of_students) > 0:
        mae = abs(nr_of_students.iloc[0] - final_prediction)
        mape = abs((nr_of_students.iloc[0] - final_prediction) / nr_of_students.iloc[0])

        total_mae += mae
        total_mape += mape
        count += 1.0

    print(f"{programme}, {herkomst}, {final_prediction}, MAE: {mae}, MAPE: {mape}")

print(f"Final MAE: {total_mae / count}")
print(f"Final MAPE: {total_mape / count}")

B Algemene Cultuurwetenschappen, EER, 2, MAE: 12, MAPE: 0.8571428571428571
B Algemene Cultuurwetenschappen, NL, 2, MAE: 27, MAPE: 0.9310344827586207
B Algemene Cultuurwetenschappen, Niet-EER, 2, MAE: 2, MAPE: 0.5
B Artificial Intelligence, EER, 2, MAE: 45, MAPE: 0.9574468085106383
B Artificial Intelligence, NL, 2, MAE: 151, MAPE: 0.9869281045751634
B Artificial Intelligence, Niet-EER, 2, MAE: 13, MAPE: 0.8666666666666667
B Artificial Intelligence, onbekend, 2, MAE: ???, MAPE: ???
B Bedrijfskunde, EER, 2, MAE: 27, MAPE: 0.9310344827586207
B Bedrijfskunde, NL, 2, MAE: 379, MAPE: 0.994750656167979
B Bedrijfskunde, Niet-EER, 2, MAE: 9, MAPE: 0.8181818181818182
B Bestuurskunde, NL, 2, MAE: 81, MAPE: 0.9759036144578314
B Biology, EER, 2, MAE: 46, MAPE: 0.9583333333333334
B Biology, NL, 2, MAE: 88, MAPE: 0.9777777777777777
B Biology, Niet-EER, 2, MAE: 3, MAPE: 0.6
B Biomedische Wetenschappen, EER, 2, MAE: ???, MAPE: ???
B Biomedische Wetenschappen, NL, 2, MAE: 93, MAPE: 0.9789473684210527
B B