In [1]:
import pandas as pd

# Load data
df = pd.read_csv('../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other']
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Exp_Segment,Exp_Level,Time.Active,Time.Auto,Time.Transit
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.83,0.817,0.817,0.84,0.824,0.949782,0.949782,223.2147,17.47422,75.468478
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.83,0.817,0.817,0.84,0.824,0.98952,0.98952,16.985265,2.924953,24.128386
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.83,0.817,0.817,0.84,0.824,0.919307,0.919307,349.78845,50.17188,155.55117
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.83,0.817,0.817,0.84,0.824,0.919307,0.919307,10.49121,1.049121,16.675728
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.83,0.817,0.817,0.84,0.824,0.98952,0.98952,16.985265,2.924953,24.128386


In [2]:
# Load zone coordinates
zones = pd.read_csv('../Data/Zones.csv')
zones.set_index('Zone#', inplace=True)

# Add zone information to df
temp = pd.DataFrame([[a[i] for a in (zones['PD'], zones['X'], zones['Y'])] for i in df['HomeZone']], columns=['PD', 'X', 'Y'], index=df.index)
df = pd.concat((df, temp), axis=1)

# Normalize from 0 to 1
df['X'] = (df['X'] - df['X'].min()) / (df['X'].max() - df['X'].min())
df['Y'] = (df['Y'] - df['Y'].min()) / (df['Y'].max() - df['Y'].min())

In [3]:
# Machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef

In [4]:
# Prepare results dataframe
metric_names = ['Acc', 'Prec', 'Rec', 'F1', 'MCC', 'APO']
results = pd.DataFrame(columns=['Model'] + metric_names)

In [7]:
def run_trial(X, name):
    """
    Run Random Forest model on passed X
    and add row to results.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    rf.fit(X_train, y_train, sample_weight=df['Exp_Segment'].loc[X_train.index])
    schools = list(rf.classes_)
    probs = pd.concat((y_test.reset_index(drop=True), pd.DataFrame(rf.predict_proba(X_test))), axis=1)
    y_pred = rf.predict(X_test)
    metrics_list = [rf.score(X_test, y_test)]

    metrics_list.extend(precision_recall_fscore_support(y_test, y_pred, average = 'macro')[:3])
    metrics_list.append(matthews_corrcoef(y_test, y_pred))
    metrics_list.append(probs.apply(lambda z: z[schools.index(z.School_Codes)], axis=1).mean())
    results.loc[len(results)] = [name] + metrics_list

In [8]:
temp = df[df['Family'] == 1]
std_dists = temp.iloc[:, 17:24]
three_dists = temp.iloc[:,18:21] # YK, SC, MI
coords = temp[['X', 'Y']]

# Prepare classifier
rf = RandomForestClassifier(n_estimators=60, min_samples_split=14, min_samples_leaf=12, max_depth=8)
y = temp['School_Codes']
num_trials = 10

# Run model trials
for x_temp, name in ((std_dists, 'FSeven'), (three_dists, 'FThree'), (coords, 'FCoord')):
    for i in range(num_trials):     
        run_trial(x_temp, name)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
temp = df[df['Family'] == 0]
std_dists = temp.iloc[:, 17:24]
three_dists = temp.iloc[:,18:21] # YK, SC, MI
coords = temp[['X', 'Y']]
y = temp['School_Codes']

# Run model trials
for x_temp, name in ((std_dists, 'NSeven'), (three_dists, 'NThree'), (coords, 'NCoord')):
    for i in range(num_trials):     
        run_trial(x_temp, name)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# results.groupby('Model').mean().to_csv('WeightModelResults.csv')
results.groupby('Model').mean().sort_values('APO', ascending=False)

Unnamed: 0_level_0,Acc,Prec,Rec,F1,MCC,APO
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NCoord,0.674814,0.621611,0.449217,0.482429,0.443636,0.548184
NSeven,0.676358,0.58821,0.447608,0.477144,0.446866,0.543445
NThree,0.67311,0.606185,0.439763,0.466236,0.439103,0.537364
FCoord,0.342782,0.223404,0.25106,0.219928,0.155046,0.264216
FSeven,0.340633,0.223306,0.248129,0.219119,0.151582,0.262377
FThree,0.338321,0.224132,0.245602,0.216004,0.148322,0.260989
