In [1]:
import pandas as pd

# Read the SMTO 2015 input table from CSV and preview its first rows.
smto_input_path = '../Data/SMTO_2015/SMTO_2015_Complete_Input.csv'
df = pd.read_csv(smto_input_path)
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Exp_Segment,Exp_Level
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.383705,0.383705
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.986085,0.986085
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.91927,0.91927
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.91927,0.91927
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.986085,0.986085


In [2]:
# Feature matrix: the 'HomeZone' column alone, kept two-dimensional with shape
# (n_rows, 1) because it is later passed to train_test_split / rf.fit.
x = df[['HomeZone']].values
# Target labels: the 'School_Codes' column.
y = df['School_Codes']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef

# Fixed seed for both the train/test split and the forest, so the metrics
# printed below are the same on every Restart & Run All.
SEED = 42

# Fit a 100-tree random forest on a 70/30 hold-out split of (x, y).
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out metrics: accuracy, macro-averaged precision/recall/F1, and MCC.
print("Accuracy\t", rf.score(X_test, y_test)) # Acc
print("PRF1 Mac\t", precision_recall_fscore_support(y_test, y_pred, average = 'macro')[:3]) # Rec = Bal Acc
print("Matthews\t", matthews_corrcoef(y_test, y_pred))

Accuracy	 0.4618008185538881
PRF1 Mac	 (0.3298179218601658, 0.31203719837539395, 0.31125276119032685)
Matthews	 0.24631802368475555


In [4]:
from statistics import stdev

def average(l):
    """Return the arithmetic mean of the values in ``l``.

    ``l`` must be a sized collection of numbers. An empty collection raises
    ``ZeroDivisionError`` because its length is zero.
    """
    total = sum(l)
    count = len(l)
    return total / count

# Repeat the 70/30 hold-out evaluation and collect one value of each metric per
# repetition: accuracy, macro precision/recall/F1, Matthews correlation, and
# the average probability assigned to the true class ("apo").
N_REPEATS = 5

acc, prec, rec, f_1, mcc, apo = [], [], [], [], [], []
for i in range(N_REPEATS):
    # Seed the split and the forest with the repetition index so every run of
    # this cell reproduces the same splits and the same fitted models.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=i)
    rf.set_params(random_state=i)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    acc.append(rf.score(X_test, y_test))
    p, r, f = precision_recall_fscore_support(y_test, y_pred, average = 'macro')[:3]
    prec.append(p)
    rec.append(r)
    f_1.append(f)
    mcc.append(matthews_corrcoef(y_test, y_pred))

    # predict_proba columns are ordered like rf.classes_ of the fit that was
    # just performed, so read the class order from this fit rather than from
    # an earlier one.
    schools = list(rf.classes_)
    probs = rf.predict_proba(X_test)
    class_probs = pd.DataFrame(probs, columns=schools)
    # One-hot mask of each test row's true class, aligned to the same columns.
    # A test label the forest never saw in training has no column and thus
    # contributes probability 0 instead of raising.
    true_class = (
        pd.get_dummies(y_test.reset_index(drop=True))
        .reindex(columns=schools, fill_value=0)
        .astype(float)
    )
    # Exactly one non-zero term per row, so the row sum is the probability
    # given to the true class; average it over the test set.
    apo.append((class_probs * true_class).sum(axis=1).mean())

# Summarise each metric across repetitions as (mean, sample standard deviation),
# one row per metric.
metric_runs = {
    "Accuracy": acc,
    "Prec Mac": prec,
    "Rec  Mac": rec,
    "F-1  Mac": f_1,
    "Matthews": mcc,
    "Ave Prob": apo,
}
results_df = pd.DataFrame.from_dict(
    {label: (average(scores), stdev(scores)) for label, scores in metric_runs.items()},
    orient = 'index',
    columns = ['Stat', 'St_Dev'],
)
results_df

Unnamed: 0,Stat,St_Dev
Accuracy,0.456253,0.002731
Prec Mac,0.328069,0.015447
Rec Mac,0.298449,0.002481
F-1 Mac,0.302476,0.005184
Matthews,0.238984,0.003843
Ave Prob,0.38388,0.002886


In [5]:
# Load the zone table and attach the forest's predicted class for every
# 'Origin' value, then show how often each class is predicted.
# NOTE(review): `rf` here is whichever fit ran most recently in the kernel
# (the last hold-out split of the loop above when run top to bottom), not a
# model trained on all rows -- confirm this is intended.
full_zones_df = pd.read_csv('../Data/Full_Zones_Dists.csv')
origin_zones = full_zones_df['Origin'].values.reshape(-1, 1)
full_zones_df['RF_Prediction'] = rf.predict(origin_zones)
print(full_zones_df['RF_Prediction'].value_counts())
full_zones_df.head()

SG    873
YK    613
RY    498
MI    163
SC    138
OC     67
YG     40
Name: RF_Prediction, dtype: int64


Unnamed: 0,Origin,Dist.SG,Dist.SC,Dist.MI,Dist.YK,Dist.YG,Dist.RY,Dist.OC,RF_Prediction
0,1,8.233101,17.76699,27.43587,24.33012,11.45088,6.40715,7.172228,RY
1,2,7.629588,18.88647,26.70272,23.72661,11.6074,5.773918,6.420753,RY
2,3,6.983658,18.75429,26.05679,23.08068,10.7994,5.127988,5.774823,RY
3,4,6.303398,19.60028,25.34593,22.45217,11.28469,4.417128,5.063963,RY
4,5,5.788057,20.27234,24.83059,21.93682,11.30364,3.901787,4.548622,RY


In [8]:
# Write the zone table (with its RF_Prediction column) to CSV, omitting the
# DataFrame index.
predictions_csv_path = '../Data/SMTO_2015/RF_Predictions.csv'
full_zones_df.to_csv(predictions_csv_path, index=False)