# Testing ML Algorithms using the Multilabel Target Matrix

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import  make_pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ParameterGrid

from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, make_scorer, confusion_matrix

from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier

In [2]:
# Load the raw table. Per the preview below it holds the S*/D* feature columns,
# the per-heuristic indicator columns H0_Best..H5_Best and a 'Best Heuristic' column.
df = pd.read_csv('../data/multiclass_target_raw_data.csv')
df.head()

Unnamed: 0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,D37,D38,D39,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best,Best Heuristic
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.73872,0.073308,0.18797,1,0,0,0,0,0,0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.74436,0.067669,0.18797,0,1,0,0,0,0,1
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.74248,0.069549,0.18797,1,0,0,0,0,0,0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7312,0.080827,0.18797,1,0,0,0,0,0,0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.73308,0.078947,0.18797,1,0,0,0,0,0,0


In [3]:
# Feature catalogue keyed by feature name, with S5 and D21 left out of the
# model inputs. Built as one chain so the frame is never mutated in place.
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
    .drop(index=['S5', 'D21'])
)

# Design matrix: the columns of df named by the remaining catalogue entries.
X = df.loc[:, features.index]
X.head()

Feature Type and Number,S1,S2,S3,S4,S6,S7,S8,S9,S10,S11,...,D30,D31,D32,D33,D34,D35,D36,D37,D38,D39
0,0.83307,0.99682,0.83307,0.76789,0.76948,0.069952,0.16057,6,1.2734,6,...,0.0,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797
1,0.83307,0.99682,0.83307,0.76948,0.77107,0.068363,0.16057,6,1.2734,6,...,0.0,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797
2,0.83307,0.99682,0.83307,0.76789,0.76948,0.069952,0.16057,6,1.2734,6,...,0.0,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797
3,0.83307,0.99682,0.83307,0.76789,0.76948,0.069952,0.16057,6,1.2734,6,...,0.0,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797
4,0.83307,0.99682,0.83307,0.76789,0.76948,0.069952,0.16057,6,1.2734,6,...,0.0,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797


In [4]:
# Names of the six per-heuristic indicator columns: H0_Best ... H5_Best.
H_Best = [f'H{k}_Best' for k in range(6)]

# Target frame: one indicator column per heuristic.
y = df[H_Best]

0    0
1    1
2    0
3    0
4    0
Name: Best Heuristic, dtype: int64

In [5]:
# Preview the first rows of the target frame (one column per heuristic).
y.head()

Unnamed: 0,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0


In [6]:
# Features that get min-max scaling; every other feature is standard-scaled.
mm_feats = ['S1','S3', 'S4', 'S6', 'S8','S11', 'S12', 'D3', 'D39']

# Map each feature name to its scaler tag, starting from the default and
# then overriding the min-max subset.
types = dict.fromkeys(features.index, 'stdsc')
types.update(dict.fromkeys(mm_feats, 'minmax'))

In [7]:
# Split the feature names by scaler tag, keeping the order of `types`.
minmax_feats = [name for name, tag in types.items() if tag == 'minmax']
std_feats = [name for name, tag in types.items() if tag == 'stdsc']

print('MinMax Scaler Features: ', minmax_feats)
print('Standard Scaler Features: ', std_feats)

# Column-wise preprocessing: each group of features goes through its own scaler.
mm_step = ('mm_scaler', MinMaxScaler(), minmax_feats)
std_step = ('std_scaler', StandardScaler(), std_feats)
preprocessor = ColumnTransformer(transformers=[mm_step, std_step])

MinMax Scaler Features:  ['S1', 'S3', 'S4', 'S6', 'S8', 'S11', 'S12', 'D3', 'D39']
Standard Scaler Features:  ['S2', 'S7', 'S9', 'S10', 'S13', 'S14', 'D1', 'D2', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D18', 'D19', 'D20', 'D22', 'D23', 'D24', 'D25', 'D26', 'D27', 'D28', 'D29', 'D30', 'D31', 'D32', 'D33', 'D34', 'D35', 'D36', 'D37', 'D38']


In [8]:
def bridge_mcc(ytrue, ypred):
    """
    Matthews correlation coefficient for binary 0/1 labels.

    Parameters
    ----------
    ytrue, ypred : array-like
        True and predicted labels of equal length. Entries equal to 1
        (or True) are treated as the positive class, all others as negative.

    Returns
    -------
    float
        (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)).
        0.0 is returned when the denominator is zero, i.e. when a whole row
        or column of the confusion matrix is empty; the numerator is then
        zero as well.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    true_pos = np.asarray(ytrue).ravel() == 1
    pred_pos = np.asarray(ypred).ravel() == 1
    if true_pos.shape != pred_pos.shape:
        raise ValueError("ytrue and ypred must have the same number of elements")

    # Count the four confusion-matrix cells directly so the result is always
    # four scalars, even when only one class occurs in the inputs.
    # Python ints keep the four-way product below from overflowing.
    TP = int(np.count_nonzero(true_pos & pred_pos))
    TN = int(np.count_nonzero(~true_pos & ~pred_pos))
    FP = int(np.count_nonzero(~true_pos & pred_pos))
    FN = int(np.count_nonzero(true_pos & ~pred_pos))

    MCC_numer = (TP * TN) - (FP * FN)
    MCC_denom = np.sqrt(float((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))

    # The coefficient is undefined only when a marginal sum is zero; a single
    # empty cell (e.g. FP == 0) still has a well-defined, normalised value.
    if MCC_denom == 0:
        return 0.0
    return float(MCC_numer / MCC_denom)
    

In [9]:
def ML_pipeline_KFold_acc_f1_SVC(X, y, preprocessor, param_grid,
                                 n_runs=10, test_size=0.2, n_splits=4):
    """
    Tune and evaluate an SVC over several independently seeded data splits.

    For each run ``i`` in ``1..n_runs`` the data is split into an "other" set
    and a held-out test set, a grid search with shuffled K-fold
    cross-validation (scored by F1) is fitted on the "other" set, and the
    fitted search is used to predict the test set. Accuracy, F1 and the
    Matthews correlation coefficient of those predictions are recorded.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and the pipeline.
    y : binary target aligned with ``X``.
    preprocessor : transformer placed in front of the SVC in the pipeline.
    param_grid : dict
        Grid for ``GridSearchCV``; keys address pipeline steps
        (e.g. ``'svc__C'``, ``'svc__gamma'``).
    n_runs : int, default 10
        Number of repeated split/tune/evaluate rounds.
    test_size : float, default 0.2
        Fraction of the data held out as the test set in each round.
    n_splits : int, default 4
        Number of cross-validation folds applied to the "other" set.

    Returns
    -------
    best_models : list of dict
        ``grid.best_params_`` of each run, in run order.
    test_scores : dict
        Keys ``'f1'``, ``'acc'``, ``'mcc'``; each maps to a list with one
        test-set score per run.
    """

    test_scores = {'f1': [], 'acc': [], 'mcc': []}
    best_models = []

    for i in range(1, n_runs + 1):
        # A single seed per run drives the split, the fold shuffling and the SVC.
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        svc = SVC(probability=False, random_state=random_state)
        clf = make_pipeline(preprocessor, svc)
        grid = GridSearchCV(clf, param_grid=param_grid, scoring='f1', cv=kf,
                            verbose=1, n_jobs=-2, pre_dispatch=6)
        grid.fit(X_other, y_other)
        print(i, grid.best_params_)
        best_models.append(grid.best_params_)

        # Score the tuned search on the held-out test set.
        y_pred = grid.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        print("scores (acc, f1, mcc)",  acc, f1, mcc)
        test_scores['acc'].append(acc)
        test_scores['f1'].append(f1)
        test_scores['mcc'].append(mcc)
    return best_models, test_scores

## SVC

#### H1

In [10]:
# 16 x 16 grid of powers of two: C from 2^-10 to 2^5, gamma from 2^-5 to 2^10.
param_grid = dict(
    svc__C=np.logspace(-10, 5, base=2, num=16),
    svc__gamma=np.logspace(-5, 10, base=2, num=16),
)
pg = ParameterGrid(param_grid)

# Repeated tune-and-test rounds for the H1 indicator column.
best_models, test_scores = ML_pipeline_KFold_acc_f1_SVC(
    X, y['H1_Best'], preprocessor, param_grid
)

Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  2.9min finished


1 {'svc__C': 32.0, 'svc__gamma': 0.5}
scores (acc, f1, mcc) 0.8120915032679739 0.3783783783783784 0.28802280125064256
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  2.9min finished


2 {'svc__C': 32.0, 'svc__gamma': 4.0}
scores (acc, f1, mcc) 0.8341503267973857 0.3829787234042553 0.31877935285649545
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  2.8min finished


3 {'svc__C': 32.0, 'svc__gamma': 1.0}
scores (acc, f1, mcc) 0.8300653594771242 0.3953488372093023 0.3099117194932223
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   57.7s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  2.8min finished


4 {'svc__C': 32.0, 'svc__gamma': 0.5}
scores (acc, f1, mcc) 0.829248366013072 0.37611940298507457 0.300473187122996
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  3.1min finished


5 {'svc__C': 32.0, 'svc__gamma': 0.5}
scores (acc, f1, mcc) 0.815359477124183 0.3651685393258427 0.282238545719523
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  3.0min finished


6 {'svc__C': 32.0, 'svc__gamma': 1.0}
scores (acc, f1, mcc) 0.8455882352941176 0.43243243243243246 0.36921096305695006
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  2.7min finished


7 {'svc__C': 32.0, 'svc__gamma': 0.25}
scores (acc, f1, mcc) 0.8161764705882353 0.3835616438356164 0.30669081373500573
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  2.8min finished


8 {'svc__C': 32.0, 'svc__gamma': 2.0}
scores (acc, f1, mcc) 0.8349673202614379 0.39156626506024095 0.3270944053566954
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  3.1min finished


9 {'svc__C': 16.0, 'svc__gamma': 2.0}
scores (acc, f1, mcc) 0.8251633986928104 0.35928143712574856 0.2945485403013772
Fitting 4 folds for each of 256 candidates, totalling 1024 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done 1024 out of 1024 | elapsed:  3.0min finished


10 {'svc__C': 32.0, 'svc__gamma': 1.0}
scores (acc, f1, mcc) 0.829248366013072 0.364741641337386 0.2854364422591711


#### H2

In [None]:
# 16 x 16 grid of powers of two: C from 2^-10 to 2^5, gamma from 2^-5 to 2^10.
param_grid = dict(
    svc__C=np.logspace(-10, 5, base=2, num=16),
    svc__gamma=np.logspace(-5, 10, base=2, num=16),
)
pg = ParameterGrid(param_grid)

# Repeated tune-and-test rounds for the H2 indicator column.
best_models, test_scores = ML_pipeline_KFold_acc_f1_SVC(
    X, y['H2_Best'], preprocessor, param_grid
)

#### H3

In [None]:
# 16 x 16 grid of powers of two: C from 2^-10 to 2^5, gamma from 2^-5 to 2^10.
param_grid = dict(
    svc__C=np.logspace(-10, 5, base=2, num=16),
    svc__gamma=np.logspace(-5, 10, base=2, num=16),
)
pg = ParameterGrid(param_grid)

# Repeated tune-and-test rounds for the H3 indicator column.
best_models, test_scores = ML_pipeline_KFold_acc_f1_SVC(
    X, y['H3_Best'], preprocessor, param_grid
)

#### H4

In [None]:
# 16 x 16 grid of powers of two: C from 2^-10 to 2^5, gamma from 2^-5 to 2^10.
param_grid = dict(
    svc__C=np.logspace(-10, 5, base=2, num=16),
    svc__gamma=np.logspace(-5, 10, base=2, num=16),
)
pg = ParameterGrid(param_grid)

# Repeated tune-and-test rounds for the H4 indicator column.
best_models, test_scores = ML_pipeline_KFold_acc_f1_SVC(
    X, y['H4_Best'], preprocessor, param_grid
)

#### H5

In [None]:
# 16 x 16 grid of powers of two: C from 2^-10 to 2^5, gamma from 2^-5 to 2^10.
param_grid = dict(
    svc__C=np.logspace(-10, 5, base=2, num=16),
    svc__gamma=np.logspace(-5, 10, base=2, num=16),
)
pg = ParameterGrid(param_grid)

# Repeated tune-and-test rounds for the H5 indicator column.
best_models, test_scores = ML_pipeline_KFold_acc_f1_SVC(
    X, y['H5_Best'], preprocessor, param_grid
)

#### H0

In [None]:
# 16 x 16 grid of powers of two: C from 2^-10 to 2^5, gamma from 2^-5 to 2^10.
param_grid = dict(
    svc__C=np.logspace(-10, 5, base=2, num=16),
    svc__gamma=np.logspace(-5, 10, base=2, num=16),
)
pg = ParameterGrid(param_grid)

# Repeated tune-and-test rounds for the H0 indicator column.
best_models, test_scores = ML_pipeline_KFold_acc_f1_SVC(
    X, y['H0_Best'], preprocessor, param_grid
)