Load data, including normalized zone coordinates.

In [1]:
import pandas as pd

df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other']

zones = pd.read_csv('../../Data/Zones.csv').set_index('Zone#')
temp = pd.DataFrame([[a[i] for a in (zones['PD'], zones['X'], zones['Y'])] for i in df['HomeZone']], columns=['PD', 'X', 'Y'], index=df.index)
df = pd.concat((df, temp), axis=1)

df['X'] = (df['X'] - df['X'].min()) / (df['X'].max() - df['X'].min())
df['Y'] = (df['Y'] - df['Y'].min()) / (df['Y'].max() - df['Y'].min())

Prepare training and testing set.

In [2]:
from sklearn.model_selection import train_test_split
std_dists = df.iloc[:, 17:24]
three_dists = df.iloc[:,18:21] # YK, SC, MI
coords = df[['X', 'Y']]

X = pd.concat((std_dists, df.Segment), axis=1)
y = df['School_Codes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Performance metrics calculations.

In [3]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef

def evaluate_model(model):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = pd.concat((y_test.reset_index(drop=True), pd.DataFrame(model.predict_proba(X_test))), axis=1)
    schools = list(model.classes_)
    metrics_list = [model.score(X_test, y_test)]      
    metrics_list.extend(precision_recall_fscore_support(y_test, preds, average = 'macro')[:3])
    metrics_list.append(matthews_corrcoef(y_test, preds))
    metrics_list.append(probs.apply(lambda z: z[schools.index(z.School_Codes)], axis=1).mean())
    return metrics_list

Evaluate base model with default parameters.

In [4]:
from sklearn.ensemble import RandomForestClassifier
metric_names = ['Acc', 'Prec', 'Rec', 'F1', 'MCC', 'APO']
results = pd.DataFrame(columns=['Model'] + metric_names)

base_rf = RandomForestClassifier()
results.loc[len(results)] = ['Base'] + evaluate_model(base_rf)

Hyperparameter tuning.

Code adapted from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74.

In [5]:
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'n_estimators': [x for x in range(1, 101)],
               'max_features': ['auto', 'log2', 0.3],
               'max_depth': [x for x in range(1, 16)] + [None],
               'min_samples_split': [x for x in range(2, 22, 2)],
               'min_samples_leaf': [x for x in range(1, 21)],
               'bootstrap': [True, False]}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, n_jobs = -1)
rf_random.fit(pd.concat((std_dists, df.Segment), axis=1), df['School_Codes'])
rf_random.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.4min finished


{'n_estimators': 62,
 'min_samples_split': 14,
 'min_samples_leaf': 11,
 'max_features': 0.3,
 'max_depth': 9,
 'bootstrap': False}

Evaluate model suggested by randomized search.

In [6]:
best_rf = rf_random.best_estimator_
results.loc[len(results)] = ['BestRand'] + evaluate_model(best_rf)
results

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,Acc,Prec,Rec,F1,MCC,APO
0,Base,0.476555,0.318752,0.311349,0.308958,0.271625,0.413734
1,BestRand,0.512535,0.419399,0.341073,0.326994,0.315704,0.398808


Refined grid search near recommended parameters.

In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 60, 70],
               'max_depth': [7, 8, 9, 10],
               'min_samples_split': [13, 14, 15, 16, 17, 18],
               'min_samples_leaf': [10, 11, 12, 13]}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(pd.concat((std_dists, df.Segment), axis=1), df['School_Codes'])
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.3min finished


{'max_depth': 7,
 'min_samples_leaf': 13,
 'min_samples_split': 14,
 'n_estimators': 60}

Results from recommended model from grid search.

In [8]:
best_rf = grid_search.best_estimator_
results.loc[len(results)] = ['BestGrid'] + evaluate_model(best_rf)
results

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,Acc,Prec,Rec,F1,MCC,APO
0,Base,0.476555,0.318752,0.311349,0.308958,0.271625,0.413734
1,BestRand,0.512535,0.419399,0.341073,0.326994,0.315704,0.398808
2,BestGrid,0.506732,0.458346,0.328039,0.310877,0.302909,0.388046
