In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import  Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler#,  MinMaxScaler
from sklearn.metrics import f1_score,  precision_score, recall_score

from skmultilearn.problem_transform import LabelPowerset

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the feature catalogue, key it by feature id, and leave out S5 and D21.
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
    .drop(['S5', 'D21'], axis=0)
)
print('number of features:', features.shape[0])

number of features: 51


In [3]:
# Label columns of the multilabel problem (one column per H0..H5).
target = [
    'H0_Best', 'H1_Best', 'H2_Best',
    'H3_Best', 'H4_Best', 'H5_Best',
]

df = pd.read_csv('../data/multilabel_with_tol_raw_data.csv')

# Model inputs are the columns named by the feature catalogue's index.
X = df.loc[:, features.index]
y = df.loc[:, target]
y.head(14)  # Display the target columns

Unnamed: 0,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best
0,1,0,0,0,0,0
1,0,1,1,0,1,1
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
5,1,0,0,0,0,0
6,1,0,0,0,0,0
7,1,0,0,0,0,0
8,1,0,0,0,0,0
9,1,0,0,0,0,0


#### Use standard scaler for all features

In [4]:
def run_pipeline(X, y, param_grid, n_runs=10, test_size=0.2, n_splits=4):
    """Repeat a hold-out split plus grid search and collect micro-F1 test scores.

    For repeat ``i`` (``0 <= i < n_runs``) the data are split with
    ``random_state = 431 * i``. The training part is grid-searched with a
    shuffled K-fold (same seed) using ``scoring="f1_micro"``, and the refit
    search object predicts the held-out part, on which micro-averaged
    precision, recall and F1 are computed and printed.

    Parameters
    ----------
    X, y
        Features and multilabel targets, passed to ``train_test_split``.
    param_grid : dict or list of dict
        Forwarded to ``GridSearchCV``. Pipeline steps are named
        ``"preprocessor"`` (a ``StandardScaler``) and ``"clf"``.
    n_runs : int, default 10
        Number of independent train/test repeats.
    test_size : float, default 0.2
        Fraction of the data held out in each repeat.
    n_splits : int, default 4
        Number of cross-validation folds inside the grid search.

    Returns
    -------
    test_scores : list of float
        Held-out micro-F1, one entry per repeat.
    best_models : list of GridSearchCV
        The fitted search objects, in the same order as ``test_scores``.

    Notes
    -----
    NOTE(review): ``LabelPowerset()`` is created without a base classifier;
    it presumably only serves as a placeholder that each grid's ``'clf'``
    entry replaces -- confirm before calling with a grid that has no ``'clf'``.

    NOTE(review): the estimator object placed in ``param_grid['clf']`` is the
    same instance for every repeat, so the ``'clf'`` entry of a stored
    ``best_params_`` may not reflect that repeat's own settings; the
    ``clf__*`` entries look like the ones to trust -- verify.
    """
    test_scores = []
    best_models = []
    for i in range(n_runs):
        # One seed per repeat drives both the hold-out split and the CV shuffling.
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        pipe = Pipeline([
            ("preprocessor", StandardScaler()),
            ("clf", LabelPowerset())
        ])
        grid = GridSearchCV(
            pipe,
            param_grid,
            scoring="f1_micro",
            cv=kf,
            return_train_score=True,
            verbose=5,
            n_jobs=-1,
        )
        grid.fit(X_other, y_other)

        # Evaluate on the part of the data the search never saw.
        y_pred = grid.predict(X_test)
        prec = precision_score(y_test, y_pred, average="micro")
        rec = recall_score(y_test, y_pred, average="micro")
        f1 = f1_score(y_test, y_pred, average="micro")
        print("Best Params: {} \nBest CV Score: {}".format(grid.best_params_, np.round(grid.best_score_, decimals=6)))
        print("Precision:     {}\nRecall:        {}\nf1_micro:      {} \n".format(np.round(prec, decimals=6), np.round(rec, decimals=6),np.round(f1, decimals=6)))
        test_scores.append(f1)
        best_models.append(grid)
    return test_scores, best_models

## Dummy Classifier (Baseline Test Score)

In [5]:
# Baseline: search over the DummyClassifier prediction strategies.
# NOTE(review): DummyClassifier() is created without random_state, so the
# 'stratified' and 'uniform' strategies may give different numbers on re-run.
param_grid = dict(
    clf=[DummyClassifier()],
    clf__strategy=["stratified", "most_frequent", "prior", "uniform"],
)
test_scores_dummy, best_models_dummy = run_pipeline(X, y, param_grid)

# Report the repeat with the highest held-out score.
amax = np.asarray(test_scores_dummy).argmax()
best_run_dummy = best_models_dummy[amax]
print('best test score for DummyClassifier:', test_scores_dummy[amax])
print('best params for DummyClassifier:', best_run_dummy.best_params_)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.347165
Precision:     0.263344
Recall:        0.498454
f1_micro:      0.344619 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.339811
Precision:     0.268977
Recall:        0.493939
f1_micro:      0.348291 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.348484
Precision:     0.258274
Recall:        0.475521
f1_micro:      0.334739 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf': DummyClassifier(strategy='uniform'), 'clf__strategy': 'uniform'} 
Best CV Score: 0.341585
Precision:     0.267697
Recall:        0.504086
f1_micro:      0.34969 

Fitting 4 folds for e

In [6]:
# Spread of the held-out scores across the run_pipeline repeats.
dummy_scores = np.asarray(test_scores_dummy)
print('mean test score for Dummy Classifier:', dummy_scores.mean())
print('stdev of test score for Dummy Classifier:', dummy_scores.std())

mean test score for Dummy Classifier: 0.3459407160833628
stdev of test score for Dummy Classifier: 0.007281207461362988


## Random Forest Classifier

In [7]:
# Random forest grid: tree depth x number of features tried per split.
# NOTE(review): RandomForestClassifier() is created without random_state,
# so the fitted forests may differ between re-runs of this cell.
param_grid = dict(
    clf=[RandomForestClassifier()],
    clf__max_depth=list(range(15, 45, 5)),            # 15, 20, ..., 40
    clf__max_features=[*range(15, 50, 5), 51],        # 15, 20, ..., 45, 51
)

test_scores_RF, best_models_RF = run_pipeline(X, y, param_grid)


Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=20, max_features=30), 'clf__max_depth': 20, 'clf__max_features': 30} 
Best CV Score: 0.65392
Precision:     0.755472
Recall:        0.62268
f1_micro:      0.682679 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=20, max_features=45), 'clf__max_depth': 20, 'clf__max_features': 45} 
Best CV Score: 0.643509
Precision:     0.776081
Recall:        0.616162
f1_micro:      0.686937 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=25, max_features=40), 'clf__max_depth': 25, 'clf__max_features': 40} 
Best CV Score: 0.654363
Precision:     0.759695
Recall:        0.622396
f1_micro:      0.684226 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf': RandomForestClassifier(max_depth=20, max_features=40), 'clf__max_de

In [8]:
# Pick the repeat with the highest held-out score and show its settings.
amax = np.asarray(test_scores_RF).argmax()
best_run_RF = best_models_RF[amax]
print(amax)
print('best test score for Random Forest:', test_scores_RF[amax])
print('best estimator for Random Forest:', best_run_RF.best_params_)

6
best test score for Random Forest: 0.6958538706871558
best estimator for Random Forest: {'clf': RandomForestClassifier(max_depth=35, max_features=30), 'clf__max_depth': 40, 'clf__max_features': 35}


In [9]:
# Spread of the held-out scores across the run_pipeline repeats.
rf_scores = np.asarray(test_scores_RF)
print('mean test score for Random Forest:', rf_scores.mean())
print('stdev of test score for Random Forest:', rf_scores.std())

mean test score for Random Forest: 0.6824581364808062
stdev of test score for Random Forest: 0.00672101219853508


In [10]:
# Persist the list of fitted grid-search objects returned by run_pipeline.
# The context manager closes the file even if pickling raises.
with open('../results/RF_models_micro.pkl', 'wb') as output:
    pickle.dump(best_models_RF, output)

## K Nearest Neighbors Classifier

In [11]:
# k-nearest-neighbours grid: neighbourhood size x vote weighting.
param_grid = dict(
    clf=[KNeighborsClassifier()],
    clf__n_neighbors=[5, 10, 15, 20, 30, 40, 50, 60],
    clf__weights=['distance', 'uniform'],
)

test_scores_KNN, best_models_KNN = run_pipeline(X, y, param_grid)

# Quick mean +/- standard deviation of the held-out scores.
knn_mean = np.mean(test_scores_KNN)
knn_std = np.std(test_scores_KNN)
print(knn_mean, '+/-', knn_std)

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best CV Score: 0.63347
Precision:     0.712023
Recall:        0.625773
f1_micro:      0.666118 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best CV Score: 0.620691
Precision:     0.723368
Recall:        0.637879
f1_micro:      0.677939 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best CV Score: 0.63561
Precision:     0.709937
Recall:        0.64375
f1_micro:      0.675225 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'} 
Best 

In [12]:
# Pick the repeat with the highest held-out score and show its settings.
amax = np.asarray(test_scores_KNN).argmax()
best_run_KNN = best_models_KNN[amax]
print(amax)
print('best test score for K Nearest Neighbors:', test_scores_KNN[amax])
print('best params for K Nearest Neighbors:', best_run_KNN.best_params_)

6
best test score for K Nearest Neighbors: 0.682196339434276
best params for K Nearest Neighbors: {'clf': KNeighborsClassifier(weights='distance'), 'clf__n_neighbors': 5, 'clf__weights': 'distance'}


In [13]:
# Spread of the held-out scores across the run_pipeline repeats.
knn_scores = np.asarray(test_scores_KNN)
print('mean test score for K Nearest Neighbors:', knn_scores.mean())
print('stdev of test score for K Nearest Neighbors:', knn_scores.std())

mean test score for K Nearest Neighbors: 0.6665431208875722
stdev of test score for K Nearest Neighbors: 0.010111567762256213


In [14]:
# Persist the list of fitted grid-search objects returned by run_pipeline.
# The context manager closes the file even if pickling raises.
with open('../results/KNN_models_micro.pkl', 'wb') as output:
    pickle.dump(best_models_KNN, output)

## SVC

In [15]:
# One-vs-rest SVC: one binary SVC per label column.
svc_base = SVC(max_iter=100000000, cache_size=3000)
param_grid = {
    'clf': [OneVsRestClassifier(svc_base)],
    'clf__estimator__C': np.logspace(1, 4, num=4),       # 1e1 ... 1e4
    'clf__estimator__tol': np.logspace(-4, -2, num=3),   # 1e-4 ... 1e-2
}

test_scores_SVC, best_models_SVC = run_pipeline(X, y, param_grid)

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Params: {'clf': OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=3000,
                                  max_iter=100000000, tol=0.0001)), 'clf__estimator__C': 10000.0, 'clf__estimator__tol': 0.0001} 
Best CV Score: 0.595619
Precision:     0.638696
Recall:        0.615979
f1_micro:      0.627132 

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Params: {'clf': OneVsRestClassifier(estimator=SVC(C=1000.0, cache_size=3000, max_iter=100000000,
                                  tol=0.0001)), 'clf__estimator__C': 1000.0, 'clf__estimator__tol': 0.0001} 
Best CV Score: 0.585681
Precision:     0.65916
Recall:        0.54697
f1_micro:      0.597847 

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Params: {'clf': OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=3000,
                                  max_iter=100000000, tol=0.0001)), 'clf__estimator__C': 10000.0, 'clf__estimator__t

In [16]:
# Pick the repeat with the highest held-out score and show its settings.
amax = np.asarray(test_scores_SVC).argmax()
best_run_SVC = best_models_SVC[amax]
print(amax)
print('best test score for SVC OneVsRest:', test_scores_SVC[amax])
print('best estimator for SVC OneVsRest:', best_run_SVC.best_params_)

6
best test score for SVC OneVsRest: 0.651063829787234
best estimator for SVC OneVsRest: {'clf': OneVsRestClassifier(estimator=SVC(C=10000.0, cache_size=3000,
                                  max_iter=100000000, tol=0.0001)), 'clf__estimator__C': 10000.0, 'clf__estimator__tol': 0.01}


In [17]:
# Spread of the held-out scores across the run_pipeline repeats.
svc_scores = np.asarray(test_scores_SVC)
print('mean test score for SVC OneVsRest:', svc_scores.mean())
print('stdev of test score for SVC OneVsRest:', svc_scores.std())

mean test score for SVC OneVsRest: 0.6261541090946127
stdev of test score for SVC OneVsRest: 0.013580985225053556


In [18]:
# Persist the list of fitted grid-search objects returned by run_pipeline.
# The context manager closes the file even if pickling raises.
with open('../results/SVC_models_micro.pkl', 'wb') as output:
    pickle.dump(best_models_SVC, output)

## Logistic Regression

In [19]:
# One-vs-rest logistic regression: OneVsRestClassifier fits one binary
# LogisticRegression per label column, so the deprecated `multi_class`
# argument is not needed and has been dropped.
param_grid = { 'clf': [OneVsRestClassifier(LogisticRegression(max_iter=100000,
                                                              warm_start=True))],
               'clf__estimator__penalty': ['l2'],
               'clf__estimator__C': np.logspace(0, 4, 5),   # 1e0 ... 1e4
               'clf__estimator__solver': ['sag', 'lbfgs'] }

test_scores_LR, best_models_LR = run_pipeline(X, y, param_grid)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best Params: {'clf': OneVsRestClassifier(estimator=LogisticRegression(C=100.0, max_iter=100000,
                                                 multi_class='ovr',
                                                 solver='sag',
                                                 warm_start=True)), 'clf__estimator__C': 100.0, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'sag'} 
Best CV Score: 0.391132
Precision:     0.655172
Recall:        0.284021
f1_micro:      0.39626 

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best Params: {'clf': OneVsRestClassifier(estimator=LogisticRegression(C=10000.0, max_iter=100000,
                                                 multi_class='ovr',
                                                 warm_start=True)), 'clf__estimator__C': 10000.0, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'lbfgs'} 
Best CV Score: 0.406434
Precision:     0.616484
Recall:   

In [20]:
# Pick the repeat with the highest held-out score and show its settings.
amax = np.asarray(test_scores_LR).argmax()
best_run_LR = best_models_LR[amax]
print(amax)
print('best test score for Log Reg:', test_scores_LR[amax])
print('best params for Log Reg:', best_run_LR.best_params_)

6
best test score for Log Reg: 0.41894353369763204
best params for Log Reg: {'clf': OneVsRestClassifier(estimator=LogisticRegression(C=10000.0, max_iter=100000,
                                                 multi_class='ovr',
                                                 warm_start=True)), 'clf__estimator__C': 10000.0, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'lbfgs'}


In [21]:
# Spread of the held-out scores across the run_pipeline repeats.
lr_scores = np.asarray(test_scores_LR)
print('mean test score for Log Reg:', lr_scores.mean())
print('stdev of test score for Log Reg:', lr_scores.std())

mean test score for Log Reg: 0.394490805768927
stdev of test score for Log Reg: 0.016411141321304855


In [22]:
# Persist the list of fitted grid-search objects returned by run_pipeline.
# The context manager closes the file even if pickling raises.
with open('../results/LR_models_micro.pkl', 'wb') as output:
    pickle.dump(best_models_LR, output)