In [1]:
num_closests = 4
num_work = 2

In [2]:
import pandas as pd
df = pd.read_csv('Training.csv')
codes = df['School'].unique().tolist()
y_train = df['School']
df = df[['Closest.' + c for c in codes] + ['PD', 'Level', 'Status', 'Work', 'Income', 'Family']].copy()

if num_closests == 4:
    df['Closest.DT'] = (df['Closest.SG']) | (df['Closest.SG']) | (df['Closest.OC'])
    
df['LevelGrad'] = (df['Level'] == 'Grad').astype(int)
df['StatusPT'] = (df['Status'] == 'PT').astype(int)

if num_work == 3:
    df['WorkNW'] = df['Work'].str.startswith('No').astype(int)
    df['WorkPT'] = df['Work'].str.startswith('Yes, I work part time').astype(int)
    df['WorkFT'] = (df['Work'].str.startswith('Yes') & (~ df['WorkPT'])).astype(int)
elif num_work == 2:
    df['WorkNo'] = df['Work'].apply(lambda x: x.startswith('No')).astype(int)
    df['WorkYes'] = df['Work'].apply(lambda x: x.startswith('Yes')).astype(int)

df['IncomeHigh'] = (df['Income'] == 'High').astype(int)
df['IncomeLow'] = (df['Income'] == 'Low').astype(int)

df = df.select_dtypes(['number'])
if num_closests == 4:
    df = df.drop(columns = ['Closest.SG', 'Closest.RY', 'Closest.OC', 'Closest.YG'])
x_train = df.values

In [3]:
test_df = pd.read_csv('Testing.csv')
y_test = test_df['School']
test_df = test_df[['Closest.' + c for c in codes] + ['PD', 'Level', 'Status', 'Work', 'Income', 'Family']].copy()

if num_closests == 4:
    test_df['Closest.DT'] = (test_df['Closest.SG']) | (test_df['Closest.SG']) | (test_df['Closest.OC'])

test_df['LevelGrad'] = (test_df['Level'] == 'Grad').astype(int)
test_df['StatusPT'] = (test_df['Status'] == 'PT').astype(int)

if num_work == 3:
    test_df['WorkNW'] = test_df['Work'].str.startswith('No').astype(int)
    test_df['WorkPT'] = test_df['Work'].str.startswith('Yes, I work part time').astype(int)
    test_df['WorkFT'] = (test_df['Work'].str.startswith('Yes') & (~ test_df['WorkPT'])).astype(int)
elif num_work == 2:
    test_df['WorkNo'] = test_df['Work'].str.startswith('No').astype(int)
    test_df['WorkYes'] = test_df['Work'].str.startswith('Yes').astype(int)

test_df['IncomeHigh'] = (test_df['Income'] == 'High').astype(int)
test_df['IncomeLow'] = (test_df['Income'] == 'Low').astype(int)

test_df = test_df.select_dtypes(['number'])
if num_closests == 4:
    test_df = test_df.drop(columns = ['Closest.SG', 'Closest.RY', 'Closest.OC', 'Closest.YG'])
x_test = test_df.values

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()

acc, apo, in_acc, in_apo = [], [], [], []
feat_importances = pd.DataFrame(index=df.columns)

for i in range(10):
    rf.fit(x_train, y_train)
    acc.append(rf.score(x_test, y_test))
    
    schools = list(rf.classes_)
    y_pred = rf.predict_proba(x_test)
    probs = pd.concat((y_test.reset_index(drop=True), pd.DataFrame(y_pred)), axis=1)
    apo.append(probs.apply(lambda z: z[schools.index(z.School)], axis=1).mean())
    
    in_acc.append(accuracy_score(y_train, rf.predict(x_train)))
    y_train_pred = rf.predict_proba(x_train)
    probs = pd.concat((y_train.reset_index(drop=True), pd.DataFrame(y_train_pred)), axis=1)
    in_apo.append(probs.apply(lambda z: z[schools.index(z.School)], axis=1).mean())

    feat_importances[i] = rf.feature_importances_

In [5]:
def average(l):
    return sum(l) / len(l)

print("OOS Acc: {:.2f}%".format(average(acc) * 100))
print("OOS APO: {:.2f}%".format(average(apo) * 100))
print("I-S Acc: {:.2f}%".format(average(in_acc) * 100))
print("I-S APO: {:.2f}%".format(average(in_apo) * 100))
feat_importances.transpose().describe().transpose()[['mean', 'std']].sort_values('mean', ascending=False)

OOS Acc: 48.28%
OOS APO: 39.97%
I-S Acc: 58.07%
I-S APO: 46.70%


Unnamed: 0,mean,std
PD,0.423892,0.004202
LevelGrad,0.096611,0.001413
Closest.DT,0.081531,0.007551
Family,0.080593,0.001377
Closest.YK,0.059911,0.004067
StatusPT,0.039679,0.001057
WorkYes,0.038429,0.001244
IncomeLow,0.038117,0.000707
IncomeHigh,0.038053,0.000814
Closest.MI,0.037595,0.002598


In [6]:
from sklearn.metrics import make_scorer

def get_apo(y, probs):
    probs = pd.DataFrame(probs)
    probs['School'] = pd.Series(y.values)
    return probs.apply(lambda z: z[schools.index(z.School)], axis=1).mean()
   
apo_scorer = make_scorer(get_apo, needs_proba=True)
acc_scorer = make_scorer(accuracy_score)

In [7]:
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'n_estimators': [x for x in range(1, 101)],
               'max_features': ['auto', 'log2', 0.3],
               'max_depth': [x for x in range(1, 16)] + [None],
               'min_samples_split': [x for x in range(2, 22, 2)],
               'min_samples_leaf': [x for x in range(1, 21)],
               'bootstrap': [True, False]}

rf_random = RandomizedSearchCV(rf, random_grid, n_iter=100, scoring={'APO': apo_scorer, 'Acc': acc_scorer}, n_jobs=-1, verbose=2, refit='APO')
rf_random.fit(x_train, y_train)
rf_random.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.4min finished


{'n_estimators': 80,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 13,
 'bootstrap': False}

In [8]:
from sklearn.metrics import confusion_matrix
best_rf = rf_random.best_estimator_
probs = pd.concat((y_test.reset_index(drop=True), pd.DataFrame(best_rf.predict_proba(x_test))), axis=1)
probs.groupby('School').sum()

Unnamed: 0_level_0,0,1,2,3,4,5,6
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MI,80.52201,5.8365,48.379726,4.288244,54.343679,5.486008,51.143833
OC,7.263246,5.182295,26.77756,7.828081,44.784613,3.532918,19.631287
RY,45.066466,22.619692,162.737678,46.693939,222.063727,16.442547,121.375951
SC,4.776785,8.288745,57.452509,84.652368,58.269184,7.442962,55.117447
SG,52.168037,43.544369,241.737281,67.53154,830.445264,26.834526,238.738982
YG,3.930376,1.812199,14.813333,7.088326,19.387612,5.782899,17.185255
YK,43.950219,19.235811,138.158955,49.18271,227.453977,20.666634,272.351693


In [9]:
probs['HardPred'] = probs[range(7)].idxmax(axis=1)
pd.DataFrame(confusion_matrix(probs['School'], probs['HardPred'].apply(lambda x: schools[x])))

Unnamed: 0,0,1,2,3,4,5,6
0,130,0,33,0,44,0,43
1,14,0,27,7,51,3,13
2,64,1,165,49,243,7,108
3,4,1,68,108,48,0,47
4,63,2,176,43,1032,16,169
5,3,0,13,9,24,6,15
6,52,1,115,42,251,1,309
