# 2019 SMTO Uni/College Choice RF

Load the full 2019 SMTO dataframe:

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import statistics as stats

df = pd.read_csv('2019_SMTO_Training_Set.csv').rename(columns = {'Unnamed: 0': 'Index'})
test_df = pd.read_csv('2019_SMTO_Testing_Set.csv').rename(columns = {'Unnamed: 0': 'Index'})
df.head()

Unnamed: 0,Index,School_Type,PD,Age,Closest_Type,Income_High,Income_Low,Family_True,Family_False,Cars2+,Cars1,Cars0,Licence_True,Licence_False
0,16062,University,31,18.0,1,0,1,1,0,1,0,0,1,0
1,12761,University,8,19.0,0,0,0,1,0,0,0,0,0,1
2,7609,University,33,18.0,1,0,0,0,0,0,0,0,1,0
3,10027,University,1,18.0,1,0,0,0,1,0,0,1,1,0
4,9417,College,35,28.0,0,0,1,0,1,0,0,1,0,1


In [40]:
# Define X and Y for the training set
y = df['School_Type']
x = df.drop(labels = ['School_Type', 'Index'], axis = 1)

f1s = []
CMs = []

for i in range(10):

    # --- RUN Random Forest Model --- #
    rf = RandomForestClassifier(n_estimators=100)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # --- Feature Importances ---
    if i == 0:
        features = pd.DataFrame(index = X_test.columns)
        features['FeatImportance.0'] = rf.feature_importances_
        #features.sort_values(by='FeatImportance' , inplace=True, ascending = False)
    else:
        temp_features = pd.DataFrame(index = X_test.columns)
        temp_features['FeatImportance.' + str(i)] = rf.feature_importances_
        features = pd.concat((features, temp_features), axis = 1)


    # --- Confusion Matrix ---
    CM = pd.DataFrame(confusion_matrix(y_test, y_pred))
    CM.rename(columns = {0: 'Pred_Col', 1: 'Pred_Uni'}, index = {0: 'Obs_Col', 1: 'Obs_Uni'}, inplace = True)
    CMs.append(CM)

    # --- College F-1 Score ---
    f1s.append(f1_score(y_test, y_pred, average=None)[0])

In [41]:
print("Average College F-1 Score: \t" + str(stats.mean(f1s)))
print("MAX College F-1 Score: \t\t" + str(max(f1s)))

Average College F-1 Score: 	0.42509765518495934
MAX College F-1 Score: 		0.45526127415891193


In [42]:
index_max = f1s.index(max(f1s))
print("\nCONFUSION MATRIX for best F-1 Score:")
pd.DataFrame(CMs[index_max])


CONFUSION MATRIX for best F-1 Score:


Unnamed: 0,Pred_Col,Pred_Uni
Obs_Col,318,495
Obs_Uni,266,2724


In [43]:
print('\nFeature Imporance Rank for best F-1 Score:')
col_name = 'FeatImportance.' + str(index_max)
best_feat = pd.DataFrame(features['FeatImportance.' + str(index_max)]).rename(columns = {col_name:'Feature Importance'})
best_feat.sort_values(by = 'Feature Importance', inplace=True, ascending = False)
best_feat


Feature Imporance Rank for best F-1 Score:


Unnamed: 0,Feature Importance
PD,0.440434
Age,0.349482
Closest_Type,0.053848
Income_Low,0.028604
Family_False,0.018091
Family_True,0.018024
Licence_False,0.017399
Cars2+,0.015843
Income_High,0.015817
Cars1,0.014608


In [47]:
# Run Testing Data

# Define X and Y for the training set
test_y = test_df['School_Type']
test_x = test_df.drop(labels = ['School_Type', 'Index'], axis = 1)

# Predict School Type for testing data
y_pred_test = rf.predict(test_x)

# --- Confusion Matrix ---
test_CM = pd.DataFrame(confusion_matrix(test_y, y_pred_test))
test_CM.rename(columns = {0: 'Pred_Col', 1: 'Pred_Uni'}, index = {0: 'Obs_Col', 1: 'Obs_Uni'}, inplace = True)
test_CM

Unnamed: 0,Pred_Col,Pred_Uni
Obs_Col,329,554
Obs_Uni,303,3040


In [48]:
# --- College F-1 Score ---
print(f1_score(test_y, y_pred_test, average=None)[0])

0.43432343234323434
