In [None]:
# Import packages 
import pandas as pd
import os
import yaml

# Local modules
from module.data_loader import OsirisData
from module.dataset import *
from module.modeling import randomforestregressormodel_train, lassoregressionmodel_train, supportvectormachinemodel_train
from module.modeling import randomforestregressormodel_pred, lassoregressionmodel_pred, supportvectormachinemodel_pred
from module.features import *
from module.settings import load_settings

In [None]:
# Run configuration for this notebook.
#   settings_type: 'default' or 'custom' - which settings profile to load
#   dataloader:    'db' or 'file'        - 'file' switches to file input
settings_type, dataloader = 'custom', "db"

In [None]:
# Load the settings for the profile selected by `settings_type` from the config file
# and print them so the active configuration is visible in the notebook output.
config_file = 'config.yaml'
settings = load_settings(config_file, settings_type)
print(settings)

In [None]:
# Load the prediction dataset into `pred_df`.
# (The training dataset is loaded in a later cell, and only when the models are retrained.)
#
# File mode: use the user's train/pred files when both exist, otherwise fall back to the
# pre-uploaded synthetic datasets. `train_path` is kept for the later training-data cell.
# If an error occurs while reading user data, check that the separator configured in
# config.yaml matches the separator used in your .csv files (e.g. '\t', ',' or ';').
#
# DB mode: fetch the prediction cohort from the database through OsirisData.
if dataloader == 'file':
    if os.path.exists(settings.user_data_dir_train) and os.path.exists(settings.user_data_dir_pred):
        # The paths are read from `settings`, the same object used for the existence check above.
        train_path, pred_path = settings.user_data_dir_train, settings.user_data_dir_pred
        print ('User datasets found')
    else:
        train_path, pred_path = settings.synth_data_dir_train, settings.synth_data_dir_pred
        print ('Pre-uploaded synthetic datasets found')

    # Use the separator from the loaded settings, consistent with the rest of the notebook.
    pred_df = pd.read_csv(pred_path, sep=settings.separator, engine='python')
elif dataloader == 'db':
    pred_cohort = 2024
    train_cohort_min = 2023
    db = OsirisData(env_path="./")
    # min_cohort == max_cohort: only the prediction cohort itself is requested.
    pred_df = db.get_dataset(team="BB", min_cohort=pred_cohort, max_cohort=pred_cohort)
else:
    # Fail here with a clear message instead of a NameError on `pred_df` further down.
    raise ValueError(f"Unknown dataloader {dataloader!r}; expected 'file' or 'db'")

KeyboardInterrupt: 

In [None]:
# Load the training dataset into `train_df`, only when the models are going to be retrained.
# File mode relies on `train_path` set by the prediction-data loading cell.
if settings.retrain_models == True:
    if dataloader == 'file':
        # Use the separator from the loaded settings, consistent with the rest of the notebook.
        train_df = pd.read_csv(train_path, sep=settings.separator, engine='python')
    elif dataloader == 'db':
        pred_cohort = 2024
        train_cohort_min = 2023
        db = OsirisData(env_path="./")
        # Training data covers cohorts from train_cohort_min up to the one before the prediction cohort.
        train_df = db.get_dataset(team="BB", min_cohort=train_cohort_min, max_cohort=pred_cohort - 1)

In [None]:
# Basic cleaning step. Per the original note, basic_cleaning drops duplicate rows and
# replaces NA values with the average of their column.
# The training set is only cleaned when it was loaded, i.e. when retraining is enabled.
if settings.retrain_models == True:
    train_basic_cl = basic_cleaning(train_df)
pred_basic_cl = basic_cleaning(pred_df)

In [None]:
# Drop columns in which every row holds the same value, from both the train and predict sets.
# Without retraining there is no training frame, so the cleaned prediction frame is passed
# in its place and `train_cleaned` is then derived from the prediction data.
reference_basic_cl = train_basic_cl if settings.retrain_models == True else pred_basic_cl
train_cleaned, pred_cleaned = remove_single_value_columns(reference_basic_cl, pred_basic_cl)

In [None]:
# Cast the student number column to int in both the train and predict frames.
id_column = settings.studentnumber_column
for cleaned_frame in (train_cleaned, pred_cleaned):
    cleaned_frame[id_column] = cleaned_frame[id_column].astype("int")

In [None]:
# Turn categorical columns into numerical (dummy) columns so the models can use them as input.
train_processed, pred_processed = convert_categorical_to_dummies(
    train_cleaned,
    pred_cleaned,
    settings.dropout_column,
    settings.separator,
)

In [None]:
# Standardize the train and predict datasets. Per the original note, a min-max scaler is used
# and the results are saved as .csv files in data/interim.
# The standardized sets feed the lasso and SVM models, because regression is sensitive to scaling.
train_df_sdd, pred_df_sdd = standardize_dataset(
    train_processed,
    pred_processed,
    settings.dropout_column,
    settings.separator,
)

In [None]:
# Train the three models when retrain_models is True in config.yaml; otherwise this cell only
# reports that the pre-trained models are used.
# When using your own datasets, set retrain_models to True in config.yaml so the models are
# trained on your own data.
# Warning: training can take a long time depending on the size and contents of your data.
# Terminology: folds = number of train/test splits of the dataset, candidates = models with
# different parameters, fits = folds * candidates.
if settings.retrain_models == True:
    print ('Training models on the data...')
    # The random forest is trained on the unscaled processed data.
    best_rf_model = randomforestregressormodel_train(
        train_processed,
        settings.random_seed,
        settings.dropout_column,
        settings.rf_parameters,
    )
    # Lasso and SVM are trained on the standardized data.
    best_lasso_model = lassoregressionmodel_train(
        train_df_sdd,
        settings.random_seed,
        settings.dropout_column,
        settings.lasso_parameters,
    )
    best_svm_model = supportvectormachinemodel_train(
        train_df_sdd,
        settings.random_seed,
        settings.dropout_column,
        settings.svm_parameters,
    )
else:
    print('retrain_models is False in the config.yaml file, loading the pre-trained models')

In [None]:
# Display the column index of the processed prediction set (inspection only; nothing is modified).
pred_processed.columns

In [None]:
# Use the models to predict on the datasets and produce one ranking per model.
# The lasso and SVM models predict on the standardized dataset, but take the student
# numbers from the regular (unscaled) predict dataset.
# All column names come from `settings`, so the selected settings profile is honoured by
# every model; the random forest call previously used bare names.
ranked_students_rf = randomforestregressormodel_pred(pred_processed, settings.dropout_column, settings.studentnumber_column)
ranked_students_lasso = lassoregressionmodel_pred(pred_df_sdd, pred_processed, settings.dropout_column, settings.studentnumber_column)
ranked_students_svm = supportvectormachinemodel_pred(pred_df_sdd, pred_processed, settings.dropout_column, settings.studentnumber_column)

In [None]:
# Save the rankings either as one .xlsx workbook (one sheet per model) or as three .csv files,
# depending on save_method in config.yaml. Any other value only prints a hint.
if settings.save_method == 'xlsx':
    # Make sure the output folder exists so the write does not fail on a fresh checkout.
    os.makedirs('models/predictions', exist_ok=True)
    # The context manager closes the workbook even if one of the to_excel calls raises.
    with pd.ExcelWriter('models/predictions/ranked_students.xlsx', engine='xlsxwriter') as writer:
        ranked_students_rf.to_excel(writer, sheet_name='Random Forest', startrow=0, startcol=0, index=False)
        ranked_students_lasso.to_excel(writer, sheet_name='Lasso', startrow=0, startcol=0, index=False)
        ranked_students_svm.to_excel(writer, sheet_name='Support Vector Machine', startrow=0, startcol=0, index=False)
    print ('Output file saved as .xlsx in the /models/predictions folder')
elif settings.save_method == 'csv':
    # Make sure the output folder exists so the writes do not fail on a fresh checkout.
    os.makedirs('models/predictions/csv_output', exist_ok=True)
    ranked_students_rf.to_csv('models/predictions/csv_output/ranked_students_rf.csv', sep='\t', index=False)
    ranked_students_lasso.to_csv('models/predictions/csv_output/ranked_students_lasso.csv', sep='\t', index=False)
    ranked_students_svm.to_csv('models/predictions/csv_output/ranked_students_svm.csv', sep='\t', index=False)
    print ('Output files saved as .csv in the /models/predictions/csv_output folder')
else:
    print('Invalid save method. For save_method in the config.yaml file, please fill in "xlsx" or "csv"')