In [2]:
import pandas as pd

# Read the SMTO 2015 input table and keep every row whose Level is not 'Other'
df = (
    pd.read_csv('../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
    .loc[lambda frame: frame['Level'] != 'Other']
)
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Exp_Segment,Exp_Level
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.944738,0.944738
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.986085,0.986085
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.91927,0.91927
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.91927,0.91927
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.986085,0.986085


In [3]:
# Load the zone table, indexed by zone number so it can be looked up by HomeZone
zones = pd.read_csv('../Data/Zones.csv').set_index('Zone#')

# Add the home zone's 'PD', 'X' and 'Y' values to df as new trailing columns.
# The guard keeps this cell idempotent: re-running it would otherwise append a
# second set of PD/X/Y columns and break the column selections below.
zone_cols = ['PD', 'X', 'Y']
if not set(zone_cols).issubset(df.columns):
    # One label-based lookup for all rows; every HomeZone is expected to be
    # present in the zones index.
    zone_info = zones.loc[df['HomeZone'].values, zone_cols]
    # Re-label the looked-up rows with df's own index so concat aligns row-for-row
    zone_info.index = df.index
    df = pd.concat((df, zone_info), axis=1)

# Normalize from 0 to 1 (min-max scaling; re-applying it to scaled data is a no-op)
for coord_col in ('X', 'Y'):
    lowest, highest = df[coord_col].min(), df[coord_col].max()
    df[coord_col] = (df[coord_col] - lowest) / (highest - lowest)

In [4]:
# Machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef

In [5]:
# Candidate feature sets for the classifiers.
# The first two are picked by column position, so they depend on df's column order
# (NOTE(review): presumably the per-campus distance columns - confirm against the CSV header).
std_dists = df.iloc[:, slice(17, 24)]
three_dists = df.iloc[:, slice(18, 21)]  # YK, SC, MI
# Normalised home-zone coordinates
coords = df.loc[:, ['X', 'Y']]

In [5]:
# Class weight per school code:
#   (school's share of the enrolment 'Total') / (school's share of the rows in df)
enrol_df = pd.read_csv('../Data/Enrolment/Joven_Enrollment.csv').set_index('School')

# Loop-invariant quantities, computed once instead of once per school code
enrol_total = enrol_df['Total'].sum()
sample_share = df['School_Codes'].value_counts(normalize=True)

codes = df['School_Codes'].unique()
class_weights = {
    code: enrol_df.loc[code, 'Total'] / enrol_total / sample_share[code]
    for code in codes
}

class_weights

{'SC': 1.0219476558677114,
 'SG': 0.8554355398311676,
 'MI': 1.3954352295859767,
 'OC': 0.7172963503635428,
 'RY': 0.9714540357019726,
 'YK': 1.2477727180170286,
 'YG': 0.7377180964673961}

In [9]:
# Prepare results dataframe
metric_names = ['Acc', 'Prec', 'Rec', 'F1', 'MCC', 'APO']
results = pd.DataFrame(columns=['Model'] + metric_names)

In [7]:
def run_trial(X, name, weighting = None, random_state = None):
    """
    Run one Random Forest trial on the passed X and add a row to results.

    X: feature DataFrame, split 70/30 into train and test together with the
       module-level target y
    name: model label for the results row; 'CW' or 'SW' is appended for
          class-weighted or sample-weighted trials
    weighting: None (no weighting), 'class' (weights by class), 'sample' (weights by sample)
    random_state: optional seed forwarded to train_test_split for a repeatable split

    Raises ValueError if weighting is not one of the three supported values.

    Uses the module-level names y, rf, rf_weights, df and results.
    """
    suffixes = {None: '', 'sample': 'SW', 'class': 'CW'}
    # Fail loudly on a typo instead of silently running an unweighted trial
    if weighting not in suffixes:
        raise ValueError("weighting must be None, 'class' or 'sample', got %r" % (weighting,))
    name += suffixes[weighting]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

    # 'class' uses the forest built with class_weight; the other modes share the plain forest
    clf = rf_weights if weighting == 'class' else rf
    # Per-row weights come from df's 'Exp_Segment' column, aligned to the training rows
    sample_weight = df['Exp_Segment'].loc[X_train.index] if weighting == 'sample' else None
    clf.fit(X_train, y_train, sample_weight=sample_weight)

    y_pred = clf.predict(X_test)
    metrics_list = [clf.score(X_test, y_test)]
    metrics_list.extend(precision_recall_fscore_support(y_test, y_pred, average = 'macro')[:3])
    metrics_list.append(matthews_corrcoef(y_test, y_pred))

    # APO column: mean predicted probability assigned to each test row's true class.
    # Columns of predict_proba follow clf.classes_, so map each label to its column once.
    proba = clf.predict_proba(X_test)
    class_pos = {label: pos for pos, label in enumerate(clf.classes_)}
    apo = sum(proba[row, class_pos[label]] for row, label in enumerate(y_test)) / len(y_test)
    metrics_list.append(apo)

    results.loc[len(results)] = [name] + metrics_list

# Shared classifiers and target; run_trial reads rf, rf_weights and y as module-level names
y = df['School_Codes']
rf = RandomForestClassifier(n_estimators=100)
rf_weights = RandomForestClassifier(class_weight=class_weights, n_estimators=100)
num_trials = 10

# For every feature set, repeat the unweighted / sample-weighted / class-weighted
# trio num_trials times (same call order as writing the three calls out by hand)
for x_temp, name in ((std_dists, 'Seven'), (three_dists, 'Three'), (coords, 'Coord')):
    for _ in range(num_trials):
        for scheme in (None, 'sample', 'class'):
            run_trial(x_temp, name, scheme)

In [8]:
# Optional export of the per-model means:
# results.groupby('Model').mean().to_csv('WeightModelResults.csv')

# Mean of every metric per model label, best APO first
(
    results
    .groupby('Model')
    .mean()
    .sort_values(by='APO', ascending=False)
)

Unnamed: 0_level_0,Acc,Prec,Rec,F1,MCC,APO
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ThreeSW,0.463487,0.37836,0.311262,0.310993,0.252523,0.392087
CoordSW,0.462001,0.365458,0.306595,0.305377,0.24748,0.390317
SevenSW,0.46467,0.369415,0.312383,0.310979,0.253734,0.390314
Coord,0.463951,0.345841,0.303543,0.308997,0.246096,0.388624
Seven,0.460051,0.341842,0.300624,0.306181,0.24091,0.386571
ThreeCW,0.460678,0.353807,0.31552,0.311513,0.254365,0.385705
Three,0.458705,0.336919,0.304254,0.306588,0.242909,0.385594
CoordCW,0.46026,0.352774,0.315556,0.309677,0.256858,0.384856
SevenCW,0.4565,0.355403,0.312153,0.307202,0.249607,0.384239


In [9]:
# Pairwise correlation between the metric columns across all trials.
# The string 'Model' column is dropped explicitly and the rest cast to float:
# pandas >= 2.0 no longer skips non-numeric columns in corr() by default and
# raises on the labels instead.
results.drop(columns='Model').astype(float).corr()

Unnamed: 0,Acc,Prec,Rec,F1,MCC,APO
Acc,1.0,0.325753,0.460535,0.586661,0.77183,0.789868
Prec,0.325753,1.0,0.504477,0.56441,0.397884,0.372199
Rec,0.460535,0.504477,1.0,0.790316,0.824445,0.364297
F1,0.586661,0.56441,0.790316,1.0,0.656351,0.472689
MCC,0.77183,0.397884,0.824445,0.656351,1.0,0.541774
APO,0.789868,0.372199,0.364297,0.472689,0.541774,1.0


In [8]:
# Balanced class weights: inverse of each school's share of the rows in df,
# divided by the number of classes.
# NOTE: this rebinds class_weights, replacing the enrolment-based weights defined earlier.
codes = df['School_Codes'].unique()
# Computed once rather than once per school code
sample_share = df['School_Codes'].value_counts(normalize=True)
# len(codes) replaces the hard-coded 7 so the weights stay correct if the set of schools changes
class_weights = {code: 1 / sample_share[code] / len(codes) for code in codes}
class_weights

{'SC': 1.9132462686567167,
 'SG': 0.3495228357191547,
 'MI': 2.3122886133032696,
 'OC': 4.527593818984548,
 'RY': 0.7601927353595255,
 'YK': 0.6682958618442489,
 'YG': 6.616129032258065}

In [15]:
# Prepare classifier (uses the balanced class_weights computed just above)
rf_weights = RandomForestClassifier(n_estimators=100, class_weight=class_weights)
y = df['School_Codes']
num_trials = 10

# Start from an empty results table. These trials reuse the labels
# 'Seven' / 'Three' / 'Coord', which the earlier unweighted trials also use;
# appending to the existing table on a top-to-bottom run would mix both sets
# of rows in the per-model means below.
results = pd.DataFrame(columns=['Model'] + metric_names)

# Run model trials: same steps as run_trial's 'class' branch, but the model
# label is stored without the 'CW' suffix
for x_temp, name in ((std_dists, 'Seven'), (three_dists, 'Three'), (coords, 'Coord')):
    for _ in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(x_temp, y, test_size=0.3)
        rf_weights.fit(X_train, y_train)
        y_pred = rf_weights.predict(X_test)
        metrics_list = [rf_weights.score(X_test, y_test)]
        schools = list(rf_weights.classes_)
        # True label in the first column, then one probability column per class (labelled 0..k-1)
        probs = pd.concat((y_test.reset_index(drop=True), pd.DataFrame(rf_weights.predict_proba(X_test))), axis=1)
        metrics_list.extend(precision_recall_fscore_support(y_test, y_pred, average = 'macro')[:3])
        metrics_list.append(matthews_corrcoef(y_test, y_pred))
        # APO column: mean predicted probability assigned to each row's true class
        metrics_list.append(probs.apply(lambda z: z[schools.index(z.School_Codes)], axis=1).mean())
        results.loc[len(results)] = [name] + metrics_list
results.head()

Unnamed: 0,Model,Acc,Prec,Rec,F1,MCC,APO
0,Seven,0.348189,0.293105,0.348324,0.285012,0.215212,0.311873
1,Seven,0.358403,0.29728,0.357177,0.289505,0.225609,0.322645
2,Seven,0.345636,0.29378,0.353648,0.284168,0.213017,0.312543
3,Seven,0.345636,0.290573,0.33356,0.281964,0.209173,0.31088
4,Seven,0.361885,0.30542,0.349614,0.29808,0.229176,0.313482


In [16]:
# Optional export of the per-model means:
# results.groupby('Model').mean().to_csv('WeightModelResults.csv')

# Average each metric over the trials of every model label, then rank by APO (highest first)
per_model_mean = results.groupby('Model').mean()
per_model_mean.sort_values(by='APO', ascending=False)

Unnamed: 0_level_0,Acc,Prec,Rec,F1,MCC,APO
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Three,0.357892,0.300731,0.353358,0.293995,0.224,0.31788
Coord,0.353621,0.296388,0.348863,0.289593,0.219442,0.317162
Seven,0.355919,0.298271,0.351192,0.291161,0.222429,0.315696
