The input file of this notebook is the transformed data produced by the previous notebook.

In [1]:
# Pickled dataset that is loaded with pd.read_pickle further down.
in_path = "../data/processed/data_model_noss.pkl"
# Output directories for fitted models and evaluation data. File names are
# appended by plain string concatenation, so the trailing "/" is required.
out_model_path = "../model/"
out_eval_path = "../data/evaluation/"

In [2]:
from os.path import dirname
import os, sys, inspect

# Put the parent of the notebook's working directory on the import path
# (presumably so that the project-local `src` package can be imported).
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Prepend rather than append so this directory is searched first.
sys.path.insert(0, parentdir)

# Libs
Importing libraries.

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from src.utils import dump_to_pickle
from src.evaluate import create_eval_df, plot_feature_importance

In [5]:
# Show up to 100 rows and 100 columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

# Load Data

In [6]:
# Load the pickled dataset from `in_path`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data = pd.read_pickle(in_path)

In [7]:
# Preview the first rows as a sanity check of the loaded columns.
data.head()

Unnamed: 0,hs_eng,hs_math,hs_bio,hs_chem,hs_phy,hs_econ,hs_geo,hs_soc,hs_final,major_name,school_prop,school_geo_unit,school_state,faculty,fail
0,73.25,70.75,0.0,0.0,0.0,86.5,73.75,79.25,30.8,14,4,6,33,0,0
1,77.75,64.75,0.0,0.0,0.0,79.25,80.0,76.25,25.95,14,4,0,7,0,1
2,70.25,66.75,0.0,0.0,0.0,79.5,77.5,82.25,27.4,14,4,7,21,0,0
3,82.25,85.0,0.0,0.0,0.0,71.75,77.75,72.75,28.4,14,4,0,9,0,0
4,85.25,78.0,80.25,75.5,78.5,0.0,0.0,0.0,33.9,14,5,5,27,0,0


For simplicity in the front-end, we minimize the amount of data that the user needs to input, so the school features and `faculty` are dropped.

In [8]:
# Features that are removed before modelling.
# NOTE: `data` is overwritten, so re-running this cell without reloading the
# data raises a KeyError (the columns are already gone).
dropped_features = ['school_prop', 'school_geo_unit', 'school_state', 'faculty']
data = data.drop(dropped_features, axis=1)

# Train-Test Split
The data is split into 67% for the train set and the remaining 33% for the test set.

In [9]:
# Separate the target column from the predictors.
target_col = 'fail'
y = data[target_col]
X = data.drop(target_col, axis=1)

In [10]:
# Hold out 33% of the rows for testing; the seed is fixed for reproducibility.
# NOTE(review): no `stratify` argument is passed, so the class proportions in
# the two splits are not guaranteed to match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=187,
)

In [11]:
# Report the number of rows in each split.
for label, subset in (("Train Size: ", X_train), ("Test Size: ", X_test)):
    print(label, len(subset))

Train Size:  1407
Test Size:  694


# Imbalance Resampling

The classes are very imbalanced (label 0 : label 1 = 90:10). We use an over-sampling technique to deal with this problem: `SMOTE` is used to upsample the train set so that the proportion between the negative class and the positive class is roughly 50:50 (balanced).

In [12]:
# Class proportions in the training split, before any resampling.
y_train.value_counts(normalize=True)

0    0.900498
1    0.099502
Name: fail, dtype: float64

In [13]:
# Desired share of the positive class (label 1) after resampling.
target_ratio = 0.5

# Keep the label-0 count as it is and derive how many label-1 samples are
# needed to reach `target_ratio`.
n_negative = y_train.value_counts()[0]
n_positive = int(n_negative * (target_ratio / (1 - target_ratio)))

# NOTE(review): newer imbalanced-learn releases appear to have renamed `ratio`
# to `sampling_strategy` -- confirm against the installed version.
sm = SMOTE(random_state=187, ratio={0: n_negative, 1: n_positive})

In [14]:
# Resample the training split only; X_test / y_test are not passed to SMOTE.
# NOTE(review): newer imbalanced-learn releases appear to call this method
# `fit_resample` -- confirm against the installed version.
X_train_upsampled, y_train_upsampled = sm.fit_sample(X_train, y_train)

In [15]:
# Share of label-1 samples after resampling (the labels are 0/1, so the sum
# counts the positives).
positive_share = sum(y_train_upsampled) / len(y_train_upsampled)
positive_share

0.5

After everything is ready, three different classifiers are trained on the upsampled train set. Each trained model is then used to predict both the original train set and the test set, and the results are saved to be evaluated later.
# Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# Fit on the SMOTE-balanced training data (fit() returns the estimator itself).
log_model = LogisticRegression(random_state=100)
log_model.fit(X_train_upsampled, y_train_upsampled)

# Persist the fitted model.
log_model_file = out_model_path + 'log_model_noss.pkl'
dump_to_pickle(log_model, log_model_file)

In [18]:
# Class probabilities on the original (not upsampled) train split and on the
# test split.
log_train_pred, log_test_pred = [
    log_model.predict_proba(features) for features in (X_train, X_test)
]

In [19]:
# Build the evaluation objects from the predictions and the true labels.
log_eval_train, log_eval_test, log_eval_data = create_eval_df(
    log_train_pred,
    log_test_pred,
    y_train,
    y_test,
)

# Persist the evaluation objects. The tuple is stored as (data, train, test),
# which differs from the order in which create_eval_df returns them.
log_save_objs = (log_eval_data, log_eval_train, log_eval_test)
log_eval_file = out_eval_path + 'log_eval_noss.pkl'
dump_to_pickle(log_save_objs, log_eval_file)

# Random Forest 

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Depth-limited random forest, fitted on the SMOTE-balanced training data
# (fit() returns the estimator itself).
RF_model = RandomForestClassifier(max_depth=3, random_state=100)
RF_model.fit(X_train_upsampled, y_train_upsampled)

# Persist the fitted model.
RF_model_file = out_model_path + 'RF_model_noss.pkl'
dump_to_pickle(RF_model, RF_model_file)

In [22]:
# Class probabilities on the original (not upsampled) train split and on the
# test split.
RF_train_pred, RF_test_pred = [
    RF_model.predict_proba(features) for features in (X_train, X_test)
]

In [23]:
# Build the evaluation objects from the predictions and the true labels.
RF_eval_train, RF_eval_test, RF_eval_data = create_eval_df(
    RF_train_pred,
    RF_test_pred,
    y_train,
    y_test,
)

# Persist the evaluation objects. The tuple is stored as (data, train, test),
# which differs from the order in which create_eval_df returns them.
RF_save_objs = (RF_eval_data, RF_eval_train, RF_eval_test)
RF_eval_file = out_eval_path + 'RF_eval_noss.pkl'
dump_to_pickle(RF_save_objs, RF_eval_file)

# Support Vector Machines

In [24]:
from sklearn.svm import SVC

In [25]:
# probability=True is what makes predict_proba available on the fitted SVC.
# Fitted on the SMOTE-balanced training data (fit() returns the estimator).
SVC_model = SVC(probability=True, random_state=100)
SVC_model.fit(X_train_upsampled, y_train_upsampled)

# Persist the fitted model.
SVC_model_file = out_model_path + 'SVC_model_noss.pkl'
dump_to_pickle(SVC_model, SVC_model_file)

In [26]:
# Class probabilities on the original (not upsampled) train split and on the
# test split.
SVC_train_pred, SVC_test_pred = [
    SVC_model.predict_proba(features) for features in (X_train, X_test)
]

In [27]:
# Build the evaluation objects from the predictions and the true labels.
SVC_eval_train, SVC_eval_test, SVC_eval_data = create_eval_df(
    SVC_train_pred,
    SVC_test_pred,
    y_train,
    y_test,
)

# Persist the evaluation objects. The tuple is stored as (data, train, test),
# which differs from the order in which create_eval_df returns them.
SVC_save_objs = (SVC_eval_data, SVC_eval_train, SVC_eval_test)
SVC_eval_file = out_eval_path + 'SVC_eval_noss.pkl'
dump_to_pickle(SVC_save_objs, SVC_eval_file)