# Machine Learning with Hyperparameter optimization

In [2]:
# importing the required packages
import numpy as np
import pandas as pd
import scipy.stats as st

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, make_scorer

### Helper functions

In [3]:
def conf_int(num_array):
    """
    Return the 2.5% and 97.5% confidence interval from numpy array as string.
    
    Parameters
    ----------
    num_array : numpy array

    Returns
    -------
    str
        2.5% - 97.5% confidence interval

    """
    ci = st.t.interval(confidence=0.95, df=len(num_array)-1,
                        loc=np.nanmean(num_array),
                        scale=st.sem(num_array, nan_policy='omit'))

    return(f'{ci[0]:.5f}-{ci[1]:.5f}')


def merge_scores(list_with_scores):
    """
    Merge scores from cross-validation into one score dictionary.

    Parameters
    ----------
    list_with_scores : list
        contains two or more crossvalidation scores
    
    Returns
    -------
    dict
        dictionary with merged scores
    """

    for i in range(1,len(list_with_scores)):
        for k in list_with_scores[0].keys():
            list_with_scores[0][k] = np.concatenate((list_with_scores[0][k], list_with_scores[i][k]), axis=0)

    return list_with_scores[0]

### Loading the file with features

In [4]:
dataset = pd.read_csv("../data/features.csv")
X = dataset.iloc[:,4:-1]
y = dataset.iloc[:,-1]

## Preprocessing

In [5]:
# removing the columns with missing values
X = X.dropna(axis=1)
# adding the performed activities and imaging session number as dummy variable
X = pd.concat([X, pd.get_dummies(dataset.iloc[:,3], drop_first=True), pd.get_dummies(dataset.iloc[:,2], prefix="rep_", drop_first=True)], axis=1)

# convert to numpy arrays
X, y = np.array(X), np.array(y)

# Standardization with standard scaler
standard_scaler = StandardScaler()
norm_X = standard_scaler.fit_transform(X)

## Training the machine learning models models

In [6]:
# Constants used for training
NUM_TRIALS = 10
K_FOLD = 10
N_JOBS = -1

In [7]:
# Scoring parameters
scor = {"accuracy": "accuracy",
        "balanced_accuracy": "balanced_accuracy",
        "sensitivity": "recall",
        "specifity": make_scorer(recall_score, pos_label=0),
        "roc_auc": "roc_auc",
        "neg_brier_score": "neg_brier_score"}

### K-Nearest Neighbors

In [9]:
knn = KNeighborsClassifier()

# parameter grid for GridSearchCV
p_grid = {
    'leaf_size': [10, 15, 20, 25],
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

nested_scores = []
count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=knn, param_grid=p_grid, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)

    count += 1
    print(f"KNN Trial {count}/{NUM_TRIALS}")

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["KNN"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["KNN (mean)"]).transpose()], axis=1)
scores_df

KNN Trial 1/10


Unnamed: 0,KNN,KNN (mean)
fit_time,29.05321-33.64335,31.348279
score_time,0.07109-1.30885,0.689971
test_accuracy,0.76954-0.81006,0.7898
test_balanced_accuracy,0.77062-0.81191,0.791267
test_sensitivity,0.77347-0.83704,0.805253
test_specifity,0.75676-0.79780,0.77728
test_roc_auc,0.86899-0.90320,0.886099
test_neg_brier_score,-0.15300--0.13488,-0.143942


### Logistic Regression

In [None]:
lr = LogisticRegression()

# parameter grid for GridSearchCV
p_grid = [{"penalty": ["l1", "l2"],
            "C": [1, 10, 100],
            "solver": ["liblinear"],
            "max_iter": [300],
            }, 
            {"penalty": ["elasticnet"],
           "C": [1, 10, 100],
           "solver": ["saga"],
           "max_iter": [300]}]

nested_scores = []
count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=lr, param_grid=p_grid, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)

    count += 1
    print(f"LR Trial {count}/{NUM_TRIALS}")

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["Logistic Reg"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["Logistic Reg (mean)"]).transpose()], axis=1)
scores_df

### Naive Bayes

In [10]:
nb = GaussianNB()

# parameter grid for GridSearchCV
p_grid = {
    'var_smoothing': [1e-11, 1e-10, 1e-09, 1e-08, 1e-07]
}

nested_scores = []
count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=nb, param_grid=p_grid, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)
    
    count += 1
    print(f"NB Trial {count}/{NUM_TRIALS}")

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["Naive Bayes"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["Naive Bayes (mean)"]).transpose()], axis=1)
scores_df


NB Trial 1/10


Unnamed: 0,Naive Bayes,Naive Bayes (mean)
fit_time,1.86592-1.98506,1.925486
score_time,0.00724-0.04086,0.024051
test_accuracy,0.59874-0.65151,0.625129
test_balanced_accuracy,0.59661-0.65242,0.624515
test_sensitivity,0.53557-0.67446,0.605012
test_specifity,0.59709-0.69095,0.644019
test_roc_auc,0.66145-0.71174,0.686592
test_neg_brier_score,-0.39014--0.33819,-0.364163


### Decision Tree

In [None]:
dt = DecisionTreeClassifier()

# parameter grid for GridSearchCV
p_grid = {
    'criterion': ["gini", "entropy", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [50, 100, None]
}

nested_scores = []
count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=dt, param_grid=p_grid, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)

    count += 1
    print(f"DT Trial {count}/{NUM_TRIALS}")

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["Decision Tree"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["Decision Tree (mean)"]).transpose()], axis=1)
scores_df

### SVM RBF kernel

In [None]:
svm = SVC(probability=True)

# parameter grid for GridSearchCV
p_grid1 = {'C': [1, 10, 100],
           'kernel': ['rbf'],
           'gamma': ['auto', 'scale'],
           'cache_size': [7000]}

nested_scores = []

count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=svm, param_grid=p_grid1, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)
    count += 1
    print(f'SVM RBF Trial: {count}/{NUM_TRIALS}')

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["SVM"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["SVM (mean)"]).transpose()], axis=1)
scores_df

### SVM Polynomial kernel

In [None]:
svm = SVC(probability=True)

# parameter grid for GridSearchCV
p_grid1 = {'C': [1, 10, 100],
           'kernel': ['poly'],
           'coef0': [0.0, 0.2],
           'gamma': ['auto', 'scale'],
           'cache_size': [7000]}

nested_scores = []

count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=svm, param_grid=p_grid1, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)
    count += 1
    print(f'SVM Poly Trial: {count}/{NUM_TRIALS}')

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["SVM"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["SVM (mean)"]).transpose()], axis=1)
scores_df

### SVM Sigmoid kernel

In [None]:
svm = SVC(probability=True)

# parameter grid for GridSearchCV
p_grid1 = {'C': [1, 10, 100],
           'kernel': ['sigmoid'],
           'coef0': [0.0, 0.2],
           'gamma': ['auto', 'scale'],
           'cache_size': [7000]}

nested_scores = []

count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=svm, param_grid=p_grid1, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)
    nested_scores.append(nested_cv)
    count += 1
    print(f'SVM Sig Trial: {count}/{NUM_TRIALS}')

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["SVM"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["SVM (mean)"]).transpose()], axis=1)
scores_df

### Random Forest

In [None]:
rf = RandomForestClassifier()

# parameter grid for GridSearchCV
p_grid = {
    'n_estimators': [1000],
    'criterion': ["gini", "entropy", "log_loss"],
    'max_depth': [50, 100, None],
    'max_features': ["sqrt", "log2"]
}

nested_scores = []

count = 0
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    clf = GridSearchCV(estimator=rf, param_grid=p_grid, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring=scor, X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv)

    count += 1
    print(f"RF Trial {count}/{NUM_TRIALS}")
    nested_scores.append(nested_cv)

nested_scores = merge_scores(nested_scores)
score = {k:conf_int(v) for k,v in nested_scores.items()}
mean_score = {k:np.nanmean(v) for k,v in nested_scores.items()}

scores_df = pd.DataFrame(score, index=["Random Forest"]).transpose()
scores_df = pd.concat([scores_df, pd.DataFrame(mean_score, index=["RF (mean)"]).transpose()], axis=1)
scores_df