In [4]:
# Import packages 
import pandas as pd
import os

In [5]:
## Import .py scripts from subdirectories 

# Import config file that contains settings that can be adjusted
from module.config import *
# Import script with basic data cleaning: drop duplicate rows and change NA values of numerical columns to the column mean
from module.dataset import *
# Import script that contains 3 models that are trained on the train dataset. Tuning is done with GridSearchCV. The models are Random Forest (RF),
# Lasso Regression (lasso) and Support Vector Machines (SVM)
from module.modeling.train import *
# Import python script that contains feature engineering. The first function checks a dataset for categorical columns and changes them
# using dummy variables. The second function standardizes the data using a minmax scaler. This is needed for the lasso and SVM models
from module.features import *

In [6]:
# Load the train and predict datasets. The user's own train.csv and pred.csv (user_data folder) are
# used only when BOTH files exist; otherwise the pre-uploaded synthetic datasets are loaded instead.
# If an error occurs here while loading your own data, check whether sep='\t' has to be changed to
# the separator that is actually used in your .csv files (for example ',' or ';').
if not os.path.exists(user_data_dir_train) or not os.path.exists(user_data_dir_pred):
    # At least one user file is missing: fall back to the synthetic data
    train_df = pd.read_csv(synth_data_dir_train, sep='\t')
    pred_df = pd.read_csv(synth_data_dir_pred, sep='\t')
    print('Pre-uploaded synthetic datasets found and loaded')
else:
    train_df = pd.read_csv(user_data_dir_train, sep='\t')  # Change separator if needed
    pred_df = pd.read_csv(user_data_dir_pred, sep='\t')  # Change separator if needed
    print('User datasets found and loaded')

Pre-uploaded synthetic datasets found and loaded


In [8]:
# Basic data cleaning, applied to both datasets: duplicate rows are dropped and
# NA values are replaced by the average value of the column they are in.
train_cleaned = basic_cleaning(train_df)
pred_cleaned = basic_cleaning(pred_df)

In [9]:
# Convert categorical columns into numerical (dummy) columns so the data can be used as model input.
# Both datasets are passed to the same call.
train_processed, pred_processed = convert_categorical_to_dummies(train_cleaned, pred_cleaned)

In [10]:
# Use the function standardize_dataset to standardize the train and pred datasets using a min max scaler
# and save them as .csv files in the folder data/interim.
# The standardized datasets are the input for the lasso regression and SVM models further down,
# because these models are sensitive to the scaling of the features.
train_df_sdd, pred_df_sdd = standardize_dataset(train_processed, pred_processed)

In [11]:
# Train the models only when retrain_models is True in the config.py file. If you are using your own
# datasets, change retrain_models in config.py to True so the models are trained on your own data.
# Warning: training the models can take a long time depending on the size and contents of your data.
# The random forest is trained on the unscaled data; lasso and SVM are trained on the standardized data.
if retrain_models == True:
    print('Training models on the data...')
    best_rf_model = randomforestregressormodel_train(train_processed)
    best_lasso_model = lassoregressionmodel_train(train_df_sdd)
    best_svm_model = supportvectormachinemodel_train(train_df_sdd)
else:
    # Nothing is trained in this branch, so the best_*_model variables are not defined afterwards
    print('retrain_models is False in the config.py file, loading the pre-trained models')

# Reading the training log: folds = number of train/test splits of the dataset,
# candidates = models with different parameters, and fits = folds * candidates

Training models on the data...
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Fitting 5 folds for each of 199 candidates, totalling 995 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [12]:
# Import code that loads the trained models and that can predict on the dataset
from module.modeling.predict import *

In [13]:
# Use the loaded models to predict on the datasets. The random forest predicts on the regular
# (unscaled) predict dataset. The lasso and SVM models predict on the standardized dataset, but
# take the student numbers from the regular predict dataset, which is why both are passed in.
ranked_students_rf = randomforestregressormodel_pred(pred_processed)
ranked_students_lasso = lassoregressionmodel_pred(pred_df_sdd, pred_processed)
ranked_students_svm = supportvectormachinemodel_pred(pred_df_sdd, pred_processed)

In [14]:
# Save the three rankings in the format selected by save_method
# (presumably set in config.py - verify there). Supported values: 'xlsx' and 'csv'.
if save_method == 'xlsx':
    # Save results as one excel file in the folder predictions (inside the models folder),
    # with one sheet per model. The with-block closes the workbook even when writing a sheet fails.
    with pd.ExcelWriter('models/predictions/ranked_students.xlsx', engine='xlsxwriter') as writer:
        ranked_students_rf.to_excel(writer, sheet_name='Random Forest', startrow=0, startcol=0, index=False)
        ranked_students_lasso.to_excel(writer, sheet_name='Lasso', startrow=0, startcol=0, index=False)
        ranked_students_svm.to_excel(writer, sheet_name='Support Vector Machine', startrow=0, startcol=0, index=False)
    print('Output file saved as .xlsx in the /models/predictions folder')
elif save_method == 'csv':
    # Save results as tab-separated CSV files, one file per model
    ranked_students_rf.to_csv('models/predictions/csv_output/ranked_students_rf.csv', sep='\t', index=False)
    ranked_students_lasso.to_csv('models/predictions/csv_output/ranked_students_lasso.csv', sep='\t', index=False)
    ranked_students_svm.to_csv('models/predictions/csv_output/ranked_students_svm.csv', sep='\t', index=False)
    print('Output file saved as .csv in the /models/predictions/csv_output folder')
else:
    # Unknown setting: nothing is written
    print('Invalid save method. Please choose "xlsx" or "csv".')

Output file saved as .xlsx
