# 2019 SMTO Uni/College Choice RF

Load the full 2019 SMTO dataframe:

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import statistics as stats

full_df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')
full_df.head()

Unnamed: 0,Liv_Arr,Children,Cars,Income,Home_Zone,School_Name,Campus,Work,Licence,Mode,...,Dist.SHH,Dist.SHT,Dist.MI,Dist.SC,Dist.SG,Dist.YK,Dist.YG,Dist.RY,Dist.OC,PD
0,Live with family/parents,0.0,0.0,,3851.0,Centennial College,Progress Campus,NW,False,Transit,...,4.821538,16.18253,19.03615,54.90925,32.74982,40.58064,43.9828,33.22763,31.52583,36
1,Live with family/parents,0.0,1.0,,181.0,Centennial College,Morningside Campus,NW,False,Transit,...,43.01722,51.23784,22.93553,22.04248,8.990107,9.21471,8.279897,10.30155,10.30233,4
2,Live with family/parents,0.0,2.0,I don't know,1039.0,Centennial College,Progress Campus,NW,False,Transit,...,71.35209,81.92638,54.53982,13.64408,35.73098,40.61661,29.41865,35.05555,36.44957,20
3,,,,,191.0,Centennial College,Progress Campus,NW,False,,...,40.2008,48.42141,20.66308,21.85741,5.659206,11.77978,8.35746,6.970653,6.971434,4
4,Live with family/parents,1.0,1.0,,600.0,Centennial College,Progress Campus,NW,False,,...,61.53779,69.75841,41.28113,11.49956,25.83839,18.81172,15.84128,25.16296,26.82406,16


Transform the complex income variable into a High/Low/Unknown variable:

In [19]:
def income_var(x):
    if x == 'Less than $ 14,999':
        return 'Low'
    elif x == '$ 15,000 - 29,999':
        return 'Low'
    elif x == '$ 30,000 - 39,999':
        return 'Low'
    elif x == '$ 40,000 - 49,999':
        return 'Low'
    elif x == '$ 50,000 - 59,999':
        return 'Low'
    elif x == '$ 60,000 - 69,999':
        return 'Low'
    elif x == '$ 70,000 - 79,999':
        return 'Low'
    elif x == '$ 80,000 - 89,999':
        return 'Low'
    elif x == '$ 90,000 - 99,999':
        return 'High'
    elif x == '$ 90,000 - 99,999':
        return 'High'
    elif x == '$ 100,000 - 124,999':
        return 'High'
    elif x == '$ 125,000 - 149,999':
        return 'High'
    elif x == '$ 150,000 - 199,999':
        return 'High'
    elif x == '$ 200,000 +':
        return 'High'
    else:
        return 'Unknown'
    
full_df['Income'] = full_df['Income'].apply(lambda x: income_var(x))

Split our dataframe into a training set (75% of the original dataframe) and a testing set. For now we will just work with our new training set called `df`.

In [16]:
# Split 2019 data into initial training and testing
df = full_df.sample(frac=0.75, random_state=0)
test_df = full_df.drop(df.index)
print(df.columns)

Index(['Liv_Arr', 'Children', 'Cars', 'Income', 'Home_Zone', 'School_Name',
       'Campus', 'Work', 'Licence', 'Mode', 'Age', 'Faculty', 'School_Type',
       'Family', 'Level', 'Status', 'Campus_Zone', 'School', 'Dist.CST',
       'Dist.CAS', 'Dist.CPR', 'Dist.CMO', 'Dist.CDV', 'Dist.CEG', 'Dist.CPI',
       'Dist.CDS', 'Dist.DOS', 'Dist.DWH', 'Dist.MCM', 'Dist.MCB', 'Dist.MOF',
       'Dist.MOS', 'Dist.MOI', 'Dist.OTD', 'Dist.OTN', 'Dist.SHD', 'Dist.SHH',
       'Dist.SHT', 'Dist.MI', 'Dist.SC', 'Dist.SG', 'Dist.YK', 'Dist.YG',
       'Dist.RY', 'Dist.OC', 'PD'],
      dtype='object')


In [17]:
school_codes = df['School'].unique().tolist()
uni_codes = df[df['School_Type'] == 'University']['School'].unique().tolist()

#### Variables:

In [18]:
x = 0
y = 0

for (df,x,y) in [(df, x, y), (test_df, test_x, test_y)]:

    df['Closest'] = df[['Dist.' + code for code in school_codes]].idxmin(axis = 1)
    df['Closest_Type'] = df['Closest'].str.split('.').apply(lambda x: 1 if x[1] in uni_codes else 0)
    df['Closest_College'] = (df['Closest_Type'] == 0) & (df['Family'] == False)
    df['Closest_Uni'] = (df['Closest_Type'] == 1) & (df['Family'] == False)

    # Filter df with our needed variables
    variable_list = ['PD', 'Age', 'Cars', 'Family', 'Income', 'Closest_Type', 'Licence']
    df = pd.concat((df['School_Type'], df[variable_list],), axis = 1)

    # Define X and Y
    y = df['School_Type']
    x = df[variable_list]

    # Make boolean variables
    x = pd.concat((x, pd.get_dummies(x['Income'])), axis = 1)
    del x['Income']
    del x['Unknown']
    x['Family'].fillna('Unknown', inplace = True)
    x = pd.concat((x, pd.get_dummies(x['Family'])), axis = 1)
    del x['Family']
    del x['Unknown']
    x['Cars2+'] = (x['Cars'] >= 2)*1
    x['Cars1'] = (x['Cars'] == 1)*1
    x['Cars0'] = (x['Cars'] == 0)*1
    del x['Cars']
    x['Licence_True']  = (x['Licence'] == 1)
    x['Licence_False']  = (x['Licence'] == 0)
    del x['Licence']

    # Make X & Y the same length
    x = x.dropna()
    y = y.loc[x.index]

NameError: name 'x' is not defined

In [None]:
f1s = []
CMs = []

for i in range(10):

    # --- RUN Random Forest Model --- #
    rf = RandomForestClassifier(n_estimators=100)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # --- Feature Importances ---
    if i == 0:
        features = pd.DataFrame(index = X_test.columns)
        features['FeatImportance.0'] = rf.feature_importances_
        #features.sort_values(by='FeatImportance' , inplace=True, ascending = False)
    else:
        temp_features = pd.DataFrame(index = X_test.columns)
        temp_features['FeatImportance.' + str(i)] = rf.feature_importances_
        features = pd.concat((features, temp_features), axis = 1)


    # --- Confusion Matrix ---
    CM = pd.DataFrame(confusion_matrix(y_test, y_pred))
    CM.rename(columns = {0: 'Pred_Col', 1: 'Pred_Uni'}, index = {0: 'Obs_Col', 1: 'Obs_Uni'}, inplace = True)
    CMs.append(CM)

    # --- College F-1 Score ---
    f1s.append(f1_score(y_test, y_pred, average=None)[0])

In [None]:
print("Average College F-1 Score: \t" + str(stats.mean(f1s)))
print("MAX College F-1 Score: \t\t" + str(max(f1s)))

In [None]:
index_max = f1s.index(max(f1s))
print("\nCONFUSION MATRIX for best F-1 Score:")
pd.DataFrame(CMs[index_max])

In [None]:
print('\nFeature Imporance Rank for best F-1 Score:')
col_name = 'FeatImportance.' + str(index_max)
best_feat = pd.DataFrame(features['FeatImportance.' + str(index_max)]).rename(columns = {col_name:'Feature Importance'})
best_feat.sort_values(by = 'Feature Importance', inplace=True, ascending = False)
best_feat

In [None]:
pd.DataFrame(rf.predict_proba(X_test))

Testing with separate tesing set:

In [None]:
'''
test_df['Closest'] = test_df[['Dist.' + code for code in school_codes]].idxmin(axis = 1)
test_df['Closest_Type'] = test_df['Closest'].str.split('.').apply(lambda x: 1 if x[1] in uni_codes else 0)
test_df['Closest_College'] = (test_df['Closest_Type'] == 0) & (test_df['Family'] == False)
test_df['Closest_Uni'] = (test_df['Closest_Type'] == 1) & (test_df['Family'] == False)

# Filter df with our needed variables
variable_list = ['PD', 'Age', 'Cars', 'Family', 'Income', 'Closest_Type', 'Licence']
test_df = pd.concat((test_df['School_Type'], test_df[variable_list],), axis = 1)

# Define X and Y
test_y = test_df['School_Type']
test_x = test_df[variable_list]

# Make boolean variables
test_x = pd.concat((test_x, pd.get_dummies(test_x['Income'])), axis = 1)
del test_x['Income']
del test_x['Unknown']
test_x['Family'].fillna('Unknown', inplace = True)
test_x = pd.concat((test_x, pd.get_dummies(test_x['Family'])), axis = 1)
del test_x['Family']
del test_x['Unknown']
test_x['Cars2+'] = (test_x['Cars'] >= 2)*1
test_x['Cars1'] = (test_x['Cars'] == 1)*1
test_x['Cars0'] = (test_x['Cars'] == 0)*1
del test_x['Cars']
test_x['Licence_True']  = (test_x['Licence'] == 1)
test_x['Licence_False']  = (test_x['Licence'] == 0)
del test_x['Licence']

# Make X & Y the same length
test_x = test_x.dropna()
test_y = test_y.loc[test_x.index]
'''
# Run Model on Testing Data
y_pred_new = rf.predict(test_x)

In [None]:
# --- Confusion Matrix ---
test_CM = pd.DataFrame(confusion_matrix(test_y, y_pred_new))
test_CM.rename(columns = {0: 'Pred_Col', 1: 'Pred_Uni'}, index = {0: 'Obs_Col', 1: 'Obs_Uni'}, inplace = True)
test_CM

In [None]:
# --- College F-1 Score ---
print(f1_score(test_y, y_pred_new, average=None)[0])