# Part 3b: Machine Learning - Original Multi-class (Single Label) Problem

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Feature catalogue: one row per feature, keyed by its type-and-number label.
# The labels 'S5' and 'D21' are removed so they are not selected as model inputs.
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
    .drop(index=['S5', 'D21'])
)

In [3]:
df = pd.read_csv('../data/multiclass_raw_data.csv')

# Target: the single-label class stored in the 'Best Heuristic' column.
y = df['Best Heuristic']
# Inputs: the columns of df named by the labels remaining in features.index.
X = df.loc[:, features.index]

In [4]:
def run_pipeline(X, y, ML_algo, param_grid, n_runs=10, test_size=0.2, n_splits=4, verbose=5):
    """
    Repeatedly split, tune and evaluate a classifier.

    For each run i in 0 .. n_runs - 1 the seed 431 * i is used to
    (1) split the data into an "other" set and a held-out test set,
    (2) build a shuffled KFold splitter over the other set, and
    (3) seed every still-unseeded `random_state` of the estimator, so the
        whole run is reproducible after a kernel restart.
    A pipeline of StandardScaler followed by a clone of `ML_algo` is tuned
    with GridSearchCV (scoring "f1_micro") on the other set, and the fitted
    search is then used to predict the test set.

    Parameters
    ----------
    X : feature table accepted by train_test_split / scikit-learn estimators.
    y : target labels aligned with X.
    ML_algo : unfitted scikit-learn classifier; it is cloned, never mutated.
    param_grid : dict (or list of dicts) passed to GridSearchCV; keys must be
        prefixed with ``clf__`` to address the classifier step.
    n_runs : number of random states to evaluate (default 10).
    test_size : fraction of the data held out for testing (default 0.2).
    n_splits : number of KFold folds used on the other set (default 4).
    verbose : verbosity forwarded to GridSearchCV (default 5).

    Returns
    -------
    test_scores : list with the micro-averaged F1 test score of each run.
    best_models : list with the fitted GridSearchCV object of each run.
    """
    # Local import: the notebook's import cell does not bring `clone` into scope.
    from sklearn.base import clone

    test_scores = []
    best_models = []
    for i in range(n_runs):
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        pipe = Pipeline([
            ("preprocessor", StandardScaler()),
            # Work on a clone so seeding below never alters the caller's object.
            ("clf", clone(ML_algo)),
        ])

        # Seed any random_state that was left as None (e.g. RandomForest, Dummy,
        # or an estimator wrapped in OneVsRestClassifier). Values the caller set
        # explicitly are respected.
        unseeded = {
            name: random_state
            for name, value in pipe.get_params(deep=True).items()
            if name.endswith("random_state") and value is None
        }
        if unseeded:
            pipe.set_params(**unseeded)

        grid = GridSearchCV(
            pipe,
            param_grid,
            scoring="f1_micro",
            cv=kf,
            return_train_score=True,
            verbose=verbose,
            n_jobs=-1,
        )
        grid.fit(X_other, y_other)

        # For a single-label problem the three micro-averaged metrics coincide;
        # all are still reported to keep the printed output format unchanged.
        y_pred = grid.predict(X_test)
        prec = precision_score(y_test, y_pred, average="micro")
        rec = recall_score(y_test, y_pred, average="micro")
        f1 = f1_score(y_test, y_pred, average="micro")
        print("Best Params: {} \nBest CV Score: {}".format(grid.best_params_, np.round(grid.best_score_, decimals=6)))
        print("Precision:     {}\nRecall:        {}\nf1_micro:      {} \n".format(np.round(prec, decimals=6), np.round(rec, decimals=6),np.round(f1, decimals=6)))
        test_scores.append(f1)
        best_models.append(grid)
    return test_scores, best_models

In [5]:
# Baseline: the only thing tuned is the DummyClassifier prediction strategy.
ML_algo = DummyClassifier()
param_grid = {
    'clf__strategy': ["stratified", "most_frequent", "prior", "uniform"],
}

test_scores_dummy, best_models_dummy = run_pipeline(X, y, ML_algo, param_grid)

# Report the best single run, then the spread over all runs.
amax = np.argmax(test_scores_dummy)
dummy_scores = np.asarray(test_scores_dummy)
dummy_best_params = best_models_dummy[amax].best_params_

print('best random state index:', amax)
print('max test score for Dummy Classifier:', test_scores_dummy[amax])
print('best params for Dummy Classifier:', dummy_best_params)

print('mean test score for Dummy Classifier:', dummy_scores.mean())
print('stdev of test score for Dummy Classifier:', dummy_scores.std())

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf__strategy': 'most_frequent'} 
Best CV Score: 0.411319
Precision:     0.441993
Recall:        0.441993
f1_micro:      0.441993 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf__strategy': 'most_frequent'} 
Best CV Score: 0.424806
Precision:     0.388072
Recall:        0.388072
f1_micro:      0.388072 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf__strategy': 'most_frequent'} 
Best CV Score: 0.41479
Precision:     0.428105
Recall:        0.428105
f1_micro:      0.428105 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf__strategy': 'most_frequent'} 
Best CV Score: 0.414179
Precision:     0.430556
Recall:        0.430556
f1_micro:      0.430556 

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best Params: {'clf__strategy': 'most_frequent'} 
Best CV Score: 0.413978
Precision:     0.431373
Recall:        0

## Random Forest Classifier

In [6]:
# Random forest: search over tree depth and the number of features tried per split.
ML_algo = RandomForestClassifier()
param_grid = {
    'clf__max_depth': list(range(15, 45, 5)),          # 15, 20, ..., 40
    'clf__max_features': [*range(15, 50, 5), 51],      # 15, 20, ..., 45, then 51
}

test_scores_RF, best_models_RF = run_pipeline(X, y, ML_algo, param_grid)

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf__max_depth': 15, 'clf__max_features': 15} 
Best CV Score: 0.61177
Precision:     0.633987
Recall:        0.633987
f1_micro:      0.633987 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf__max_depth': 15, 'clf__max_features': 20} 
Best CV Score: 0.608501
Precision:     0.627451
Recall:        0.627451
f1_micro:      0.627451 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf__max_depth': 30, 'clf__max_features': 35} 
Best CV Score: 0.613609
Precision:     0.638889
Recall:        0.638889
f1_micro:      0.638889 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf__max_depth': 20, 'clf__max_features': 25} 
Best CV Score: 0.611358
Precision:     0.621732
Recall:        0.621732
f1_micro:      0.621732 

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Params: {'clf__max_depth': 25, 'clf__max_featu

In [7]:
# Index of the run (random state) whose test score is highest.
amax = np.argmax(test_scores_RF)
rf_top_score = test_scores_RF[amax]
rf_top_params = best_models_RF[amax].best_params_

print('best random state index:', amax)
print('max test score for Random Forest:', rf_top_score)
print('best params for Random Forest:', rf_top_params)

best random state index: 6
max test score for Random Forest: 0.6478758169934641
best params for Random Forest: {'clf__max_depth': 35, 'clf__max_features': 25}


In [8]:
# Mean and (population) standard deviation of the test scores across runs.
rf_scores = np.asarray(test_scores_RF)
print('mean test score for Random Forest:', rf_scores.mean())
print('stdev of test score for Random Forest:', rf_scores.std())

mean test score for Random Forest: 0.626062091503268
stdev of test score for Random Forest: 0.01097967483262293


## K Neighbors Classifier

In [9]:
# k-nearest neighbours: search over neighbourhood size and vote weighting.
ML_algo = KNeighborsClassifier()
param_grid = {
    'clf__n_neighbors': [5, 10, 15, 20, 30, 40, 50, 60],
    'clf__weights': ['distance', 'uniform'],
}

# NOTE: despite the name, best_scores_KNN holds the per-run *test* scores
# returned by run_pipeline (the other models call this test_scores_*).
best_scores_KNN, best_models_KNN = run_pipeline(X, y, ML_algo, param_grid)

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf__n_neighbors': 30, 'clf__weights': 'distance'} 
Best CV Score: 0.576216
Precision:     0.594771
Recall:        0.594771
f1_micro:      0.594771 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf__n_neighbors': 10, 'clf__weights': 'distance'} 
Best CV Score: 0.579077
Precision:     0.589052
Recall:        0.589052
f1_micro:      0.589052 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf__n_neighbors': 20, 'clf__weights': 'distance'} 
Best CV Score: 0.578871
Precision:     0.595588
Recall:        0.595588
f1_micro:      0.595588 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf__n_neighbors': 20, 'clf__weights': 'distance'} 
Best CV Score: 0.583161
Precision:     0.588235
Recall:        0.588235
f1_micro:      0.588235 

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Params: {'clf__n_neighbors': 1

In [10]:
# Index of the run (random state) whose test score is highest.
amax = np.argmax(best_scores_KNN)
knn_top_score = best_scores_KNN[amax]
knn_top_params = best_models_KNN[amax].best_params_

print('best random state index:', amax)
print('max test score for K Nearest Neighbors:', knn_top_score)
print('best params for K Nearest Neighbors:', knn_top_params)

best random state index: 6
max test score for K Nearest Neighbors: 0.6176470588235294
best params for K Nearest Neighbors: {'clf__n_neighbors': 5, 'clf__weights': 'distance'}


In [11]:
# Mean and (population) standard deviation of the test scores across runs.
knn_scores = np.asarray(best_scores_KNN)
print('mean test score for K Nearest Neighbors:', knn_scores.mean())
print('stdev of test score for K Nearest Neighbors:', knn_scores.std())

mean test score for K Nearest Neighbors: 0.5973039215686273
stdev of test score for K Nearest Neighbors: 0.009676803891547556


## SVC

In [None]:
# One-vs-rest SVC; the wrapped estimator's parameters are addressed through
# the 'clf__estimator__' prefix in the grid.
svc_base = SVC(max_iter=100000000, cache_size=3000)
ML_algo = OneVsRestClassifier(svc_base)

param_grid = {
    'clf__estimator__C': np.logspace(1, 4, num=4),        # 1e1 ... 1e4
    'clf__estimator__tol': np.logspace(-4, -2, num=3),    # 1e-4 ... 1e-2
}

test_scores_SVC, best_models_SVC = run_pipeline(X, y, ML_algo, param_grid)

In [None]:
# Index of the run (random state) whose test score is highest.
# Labels match the other model summaries: the index is printed with a caption,
# and the last line is captioned "best params" because it prints best_params_,
# not the estimator object.
amax = np.argmax(test_scores_SVC)
print('best random state index:', amax)
print('max test score for SVC OneVsRest:', test_scores_SVC[amax])
print('best params for SVC OneVsRest:', best_models_SVC[amax].best_params_)

In [None]:
# Mean and (population) standard deviation of the test scores across runs.
svc_scores = np.asarray(test_scores_SVC)
print('mean test score for SVC OneVsRest:', svc_scores.mean())
print('stdev of test score for SVC OneVsRest:', svc_scores.std())

## Logistic Regression

In [None]:
# One-vs-rest logistic regression. The former multi_class='ovr' argument is
# dropped: OneVsRestClassifier already fits one binary model per class, and
# `multi_class` is deprecated in recent scikit-learn releases.
# NOTE(review): warm_start probably has no effect here, since the estimator is
# cloned before each fit by the search / one-vs-rest wrapper — confirm.
ML_algo = OneVsRestClassifier(LogisticRegression(max_iter=100000, warm_start=True))
param_grid = { 'clf__estimator__penalty': ['l2'],
               'clf__estimator__C': np.logspace(0, 4, 5),
               'clf__estimator__solver': ['sag', 'lbfgs'] }

test_scores_LR, best_models_LR = run_pipeline(X, y, ML_algo, param_grid)

In [None]:
# Index of the run (random state) whose test score is highest.
amax = np.argmax(test_scores_LR)
lr_top_score = test_scores_LR[amax]
lr_top_params = best_models_LR[amax].best_params_

print('best random state index:', amax)
print('max test score for Log Reg:', lr_top_score)
print('best params for Log Reg:', lr_top_params)

In [None]:
# Mean and (population) standard deviation of the test scores across runs.
lr_scores = np.asarray(test_scores_LR)
print('mean test score for Log Reg:', lr_scores.mean())
print('stdev of test score for Log Reg:', lr_scores.std())