In [212]:
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import  Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler#,  MinMaxScaler
from sklearn.metrics import f1_score,  precision_score, recall_score

from skmultilearn.problem_transform import LabelPowerset

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [2]:
features = pd.read_csv('../data/features_plus_descriptions.csv')
features.set_index('Feature Type and Number', inplace=True)
features.drop(['S5', 'D21'], axis=0, inplace=True)
print('number of features:', len(features.index))

number of features: 51


In [3]:
target = ['H0_Best','H1_Best', 'H2_Best', 'H3_Best', 'H4_Best', 'H5_Best']

df = pd.read_csv('../data/multilabel_with_tol_raw_data.csv')
X = df.loc[:, features.index]
y = df[target]
y[:14] # Display the target columns

Unnamed: 0,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best
0,1,0,0,0,0,0
1,0,1,1,0,1,1
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
5,1,0,0,0,0,0
6,1,0,0,0,0,0
7,1,0,0,0,0,0
8,1,0,0,0,0,0
9,1,0,0,0,0,0


#### Use standard scaler for all features

In [182]:
def run_pipeline(X, y, param_grid):
    test_scores = []
    best_models = []
    for i in range(10):
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
        kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
        pipe = Pipeline([
            ("preprocessor", StandardScaler()), 
            ("clf", LabelPowerset())
        ])
        grid = GridSearchCV(pipe, param_grid, scoring="f1_macro", cv=kf, return_train_score=True, verbose=5, n_jobs=-1)   
        grid.fit(X_other, y_other)
        y_pred = grid.predict(X_test)
        prec = precision_score(y_test, y_pred, average="macro")
        rec = recall_score(y_test, y_pred, average="macro")
        f1 = f1_score(y_test, y_pred, average="macro")
        print("Best Params: {} \nBest CV Score: {}".format(grid.best_params_, np.round(grid.best_score_, decimals=6)))
        print("Precision:     {}\nRecall:        {}\nf1_macro:      {} \n".format(np.round(prec, decimals=6), np.round(rec, decimals=6),np.round(f1, decimals=6)))
        test_scores.append(f1)
        best_models.append(grid)
    return test_scores, best_models

## Dummy Classifier (Baseline Test Score)

In [231]:
param_grid = { 'clf': [DummyClassifier()],
               'clf__strategy': ["stratified", "most_frequent", "prior", "uniform"]}
test_scores_dummy, best_models_dummy = run_pipeline(X, y, param_grid)
amax = np.argmax(test_scores_dummy)
print('best test score for DummyClassifier:', test_scores_dummy[amax])
print('best params for DummyClassifier:', best_models_dummy[amax].best_params_)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.334622
Precision:     0.259592
Recall:        0.493342
f1_macro:      0.332589 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.344314
Precision:     0.265085
Recall:        0.492021
f1_macro:      0.339907 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.341416
Precision:     0.261375
Recall:        0.502809
f1_macro:      0.336687 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.338613
Precision:     0.258885
Recall:        0.480728
f1_macro:      0.32873 

Fitting 4 folds for e

In [232]:
print('mean test score for Dummy Classifier:', np.mean(test_scores_dummy) )
print('stdev of test score for Dummy Classifier:', np.std(test_scores_dummy))

mean test score for Dummy Classifier: 0.3387659627865264
stdev of test score for Dummy Classifier: 0.005772852122808203


## Random Forest Classifier

In [184]:
param_grid = {  'clf': [RandomForestClassifier()],
                'clf__max_depth': [15, 20, 25, 30, 35, 40],
                'clf__max_features': [15, 20, 25, 30, 35, 40, 45, 51]} 

test_scores_RF, best_models_RF = run_pipeline(X, y, param_grid)


Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=25, max_features=51), 'clf__max_depth': 25, 'clf__max_features': 51} 
Best CV Score: 0.628332
Precision:     0.727805
Recall:        0.587221
f1_macro:      0.648404 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=20, max_features=40), 'clf__max_depth': 20, 'clf__max_features': 40} 
Best CV Score: 0.616639
Precision:     0.769184
Recall:        0.596332
f1_macro:      0.669665 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=25, max_features=40), 'clf__max_depth': 25, 'clf__max_features': 40} 
Best CV Score: 0.632131
Precision:     0.75608
Recall:        0.593899
f1_macro:      0.664078 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=20, max_features=35), 'clf__max_d

In [187]:
amax = np.argmax(test_scores_RF)
print(amax)
print('best test score for Random Forest:', test_scores_RF[amax])
print('best estimator for Random Forest:', best_models_RF[amax].best_estimator_)

6
best test score for Random Forest: 0.6699192253680097
best estimator for Random Forest: Pipeline(steps=[('preprocessor', StandardScaler()),
                ('clf', RandomForestClassifier(max_depth=20, max_features=30))])


In [190]:
print('mean test score for Random Forest:', np.mean(test_scores_RF) )
print('stdev of test score for Random Forest:', np.std(test_scores_RF))

mean test score for Random Forest: 0.6589939589620999
stdev of test score for Random Forest: 0.008006302029955137


In [188]:
output = open('../results/RF_models.pkl', 'wb')
pickle.dump(best_models_RF, output)
output.close()

## K Nearest Neighbors Classifier

In [199]:
param_grid = { 'clf': [KNeighborsClassifier()], 
              'clf__n_neighbors': [5,10,15,20,30,40,50,60],
              'clf__weights': ['distance', 'uniform'] }

test_scores_KNN, best_models_KNN = run_pipeline(X, y, param_grid)
print(np.mean(test_scores_KNN), '+/-', np.std(test_scores_KNN))

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best CV Score: 0.610185
Precision:     0.687288
Recall:        0.598151
f1_macro:      0.638792 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best CV Score: 0.594188
Precision:     0.709327
Recall:        0.616575
f1_macro:      0.658185 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best CV Score: 0.613553
Precision:     0.692465
Recall:        0.625319
f1_macro:      0.656351 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Be

In [196]:
param_grid = { 'clf': [KNeighborsClassifier(n_neighbors=10)], 
              'clf__weights': ['distance', 'uniform'] }
test_scores_KNN10, best_models_KNN10 = run_pipeline(X, y, param_grid)
print('10 neighbors:', np.mean(test_scores_KNN10), '+/-', np.std(test_scores_KNN10))

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=10, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.591343
Precision:     0.695219
Recall:        0.580763
f1_macro:      0.631302 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=10, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.58705
Precision:     0.719589
Recall:        0.583805
f1_macro:      0.641638 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=10, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.599535
Precision:     0.704278
Recall:        0.585656
f1_macro:      0.638129 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=10, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.599016
Precision:     

In [197]:
param_grid = { 'clf': [KNeighborsClassifier(n_neighbors=15)], 
              'clf__weights': ['distance', 'uniform'] }
test_scores_KNN15, best_models_KNN15 = run_pipeline(X, y, param_grid)
print(np.mean(test_scores_KNN15), '+/-', np.std(test_scores_KNN15))

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=15, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.578681
Precision:     0.708524
Recall:        0.564354
f1_macro:      0.626444 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=15, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.572108
Precision:     0.741642
Recall:        0.557458
f1_macro:      0.632977 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=15, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.582457
Precision:     0.715097
Recall:        0.560442
f1_macro:      0.626273 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=15, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.589762
Precision:    

In [198]:
param_grid = { 'clf': [KNeighborsClassifier(n_neighbors=20)], 
              'clf__weights': ['distance', 'uniform'] }
test_scores_KNN20, best_models_KNN20 = run_pipeline(X, y, param_grid)
print(np.mean(test_scores_KNN20), '+/-', np.std(test_scores_KNN20))

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=20, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.570874
Precision:     0.70647
Recall:        0.545462
f1_macro:      0.613173 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=20, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.560396
Precision:     0.740963
Recall:        0.527689
f1_macro:      0.612081 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=20, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.574151
Precision:     0.719137
Recall:        0.539372
f1_macro:      0.614305 

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Best Params: {'clf': KNeighborsClassifier(n_neighbors=20, weights='distance'), 'clf__weights': 'distance'} 
Best CV Score: 0.5745
Precision:     0.

In [200]:
amax = np.argmax(test_scores_KNN)
print(amax)
print('best test score for K Nearest Neighbors:', test_scores_KNN[amax])
print('best params for K Nearest Neighbors:', best_models_KNN[amax].best_params_)

1
best test score for K Nearest Neighbors: 0.6581848364381645
best params for K Nearest Neighbors: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'}


In [201]:
print('mean test score for K Nearest Neighbors:', np.mean(test_scores_KNN) )
print('stdev of test score for K Nearest Neighbors:', np.std(test_scores_KNN))

mean test score for K Nearest Neighbors: 0.644529201979756
stdev of test score for K Nearest Neighbors: 0.010817215496306126


In [202]:
output = open('../results/KNN_models.pkl', 'wb')
pickle.dump(best_models_KNN, output)
output.close()

## SVC

In [218]:
param_grid = { 'clf': [OneVsRestClassifier(SVC(max_iter=100000000, cache_size=2500))],
               'clf__estimator__C': np.logspace(1,4,num=4),
               'clf__estimator__tol': np.logspace(-5, -2,num=4) } 

test_scores_SVC2, best_models_SVC2 = run_pipeline(X, y, param_grid)

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=2500,
                                  max_iter=100000000, tol=1e-05)), 'clf__estimator__C': 10000.0, 'clf__estimator__tol': 1e-05} 
Best CV Score: 0.572574
Precision:     0.609571
Recall:        0.588136
f1_macro:      0.597934 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=2500,
                                  max_iter=100000000, tol=0.01)), 'clf__estimator__C': 10000.0, 'clf__estimator__tol': 0.01} 
Best CV Score: 0.556578
Precision:     0.624243
Recall:        0.575447
f1_macro:      0.598169 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=2500,
                                  max_iter=100000000, tol=1e-05)), 'clf__estimator__C': 10000.0, 'clf__estimator__tol'

In [233]:
amax = np.argmax(test_scores_SVC2)
print(amax)
print('best test score for SVC OneVsRest:', test_scores_SVC2[amax])
print('best estimator for SVC OneVsRest:', best_models_SVC2[amax].best_estimator_)

6
best test score for SVC OneVsRest: 0.6264945831565095
best estimator for SVC OneVsRest: Pipeline(steps=[('preprocessor', StandardScaler()),
                ('clf',
                 OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=2500,
                                                   max_iter=100000000,
                                                   tol=0.01)))])


In [220]:
print('mean test score for SVC OneVsRest:', np.mean(test_scores_SVC2) )
print('stdev of test score for SVC OneVsRest:', np.std(test_scores_SVC2))

mean test score for SVC OneVsRest: 0.6051097369774612
stdev of test score for SVC OneVsRest: 0.009637405421532371


In [221]:
output = open('../results/SVC_models.pkl', 'wb')
pickle.dump(best_models_SVC2, output)
output.close()

## Logistic Regression

In [226]:
param_grid = { 'clf': [OneVsRestClassifier(LogisticRegression(max_iter=100000, 
                                                              warm_start=True, 
                                                              multi_class='ovr'))],
               'clf__estimator__penalty': ['l2'],
               'clf__estimator__C': np.logspace(0, 4, 5),
               'clf__estimator__solver': ['sag', 'lbfgs'] }

test_scores_LR, best_models_LR = run_pipeline(X, y, param_grid)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best Params: {'clf': OneVsRestClassifier(estimator=LogisticRegression(C=100.0, max_iter=100000,
                                                 multi_class='ovr',
                                                 solver='sag',
                                                 warm_start=True)), 'clf__estimator__C': 100.0, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'sag'} 
Best CV Score: 0.332316
Precision:     0.623545
Recall:        0.23896
f1_macro:      0.327559 

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best Params: {'clf': OneVsRestClassifier(estimator=LogisticRegression(C=10000.0, max_iter=100000,
                                                 multi_class='ovr',
                                                 warm_start=True)), 'clf__estimator__C': 10000.0, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'lbfgs'} 
Best CV Score: 0.341218
Precision:     0.601185
Recall:   

In [227]:
amax = np.argmax(test_scores_LR)
print(amax)
print('best test score for SVC OneVsRest:', test_scores_LR[amax])
print('best params for SVC OneVsRest:', best_models_LR[amax].best_params_)

6
best test score for SVC OneVsRest: 0.35368633020207857
best params for SVC OneVsRest: {'clf': OneVsRestClassifier(estimator=LogisticRegression(C=10000.0, max_iter=100000,
                                                 multi_class='ovr',
                                                 warm_start=True)), 'clf__estimator__C': 10000.0, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'lbfgs'}


In [228]:
print('mean test score for Log Reg:', np.mean(test_scores_LR) )
print('stdev of test score for Log Reg:', np.std(test_scores_LR))

mean test score for Log Reg: 0.3327902727423884
stdev of test score for Log Reg: 0.016916962676296738


In [229]:
output = open('../results/LR_models.pkl', 'wb')
pickle.dump(best_models_LR, output)
output.close()