In [54]:
# Import packages 
import pandas as pd
import os

In [56]:
# Import .py scripts from subdirectories 
from module.config import *
from module.dataset import *
from module.modeling.train import *
from module.features import *

In [58]:
# Use the user's own train.csv / pred.csv from the user_data folder when BOTH
# files exist; if either one is missing, load the synthetic datasets instead.
has_user_data = os.path.exists(user_data_dir_train) and os.path.exists(user_data_dir_pred)
if has_user_data:
    train_source, pred_source = user_data_dir_train, user_data_dir_pred
else:
    train_source, pred_source = synth_data_dir_train, synth_data_dir_pred

# Both inputs are read as tab-separated text.
train_df = pd.read_csv(train_source, sep='\t')
pred_df = pd.read_csv(pred_source, sep='\t')

In [60]:
# Data cleaning: apply the same basic_cleaning step to the training and the
# prediction data. Per the project notes, basic_cleaning drops duplicate rows and
# replaces NA values with the mean of their column (defined in the imported
# modules - confirm there if the exact behaviour matters).
cleaned_train = basic_cleaning(train_df)
cleaned_pred = basic_cleaning(pred_df)

In [62]:
# Standardize the cleaned train and pred datasets with standardize_dataset.
# Per the project notes it applies a min-max scaler and saves the results as .csv
# files in the folder data/interim (confirm in the module if the details matter).
# The scaled data is used for the lasso regression model, because regression is
# sensitive to feature scaling.
standardize_dataset(cleaned_train, cleaned_pred)

# Read the standardized datasets back in (tab-separated).
train_df_sdd = pd.read_csv(standardized_data_train, sep='\t')
pred_df_sdd = pd.read_csv(standardized_data_pred, sep='\t')

In [8]:
# Train the three models. Per the project notes, each training function tunes its
# model with GridSearchCV and saves the fitted model in the folder '/models/'.
# Random forest and SVM are trained on the cleaned data; lasso is trained on the
# standardized data.
best_rf_model = randomforestregressormodel_train(cleaned_train)
best_lasso_model = lassoregressionmodel_train(train_df_sdd)
best_svm_model = supportvectormachinemodel_train(cleaned_train)

# How to read the log below: folds = dataset split variations, candidates = candidate
# models built from different parameter combinations, fits = folds * candidates.

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Fitting 5 folds for each of 199 candidates, totalling 995 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [10]:
# Import code that loads the trained models and that can predict on the dataset
from module.modeling.predict import *

In [15]:
# Predict on the prediction dataset with each of the loaded models.
# Random forest and SVM receive the cleaned data. The lasso function receives the
# standardized data plus the cleaned data (presumably so its results can be
# reported against the unscaled values - confirm in module.modeling.predict).
ranked_students_rf = randomforestregressormodel_pred(cleaned_pred)
ranked_students_lasso = lassoregressionmodel_pred(pred_df_sdd, cleaned_pred)
ranked_students_svm = supportvectormachinemodel_pred(cleaned_pred)

In [28]:
# Save the results as a single Excel workbook with one sheet per model.
output_path = 'models/predictions/ranked_students.xlsx'

# Make sure the target folder exists so the writer does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Sheet name -> ranked predictions to write on that sheet.
sheets = {
    'Random Forest': ranked_students_rf,
    'Lasso': ranked_students_lasso,
    'Support Vector Machine': ranked_students_svm,
}

# The context manager closes (and thereby saves) the workbook even if one of the
# to_excel calls raises, so no file handle is left open.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, ranked_df in sheets.items():
        ranked_df.to_excel(writer, sheet_name=sheet_name, startrow=0, startcol=0, index=False)