# Import Basic Libraries

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

# Import libraries for ml methods
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef, balanced_accuracy_score, \
    f1_score, fbeta_score, recall_score, precision_score, average_precision_score, accuracy_score

# import optuna for hyperparameter tuning
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from optuna.samplers import TPESampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Data loading

In [2]:
data = pd.read_csv('Hepatitis_C.csv')
data

Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,label
0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,0
1,45,0,41.7,73.2,43.6,29.4,6.4,8.89,5.31,71.0,67.4,70.3,0
2,55,0,41.5,59.5,15.4,16.2,6.8,6.35,5.22,80.0,12.4,69.9,0
3,53,0,37.8,98.1,30.5,21.1,4.0,5.02,4.42,94.0,23.2,65.2,0
4,56,1,39.7,66.0,14.2,20.8,3.5,7.48,5.88,66.0,7.2,67.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,62,1,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5,1
200,64,1,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3,1
201,64,1,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0,1
202,46,1,33.0,62.7,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0,1


## Split data to X and y

In [3]:
X = data.drop('label', axis=1).copy()
y = data['label'].copy()

print(X.shape, y.shape)

(204, 12) (204,)


# Normalize data

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

print(X.shape, y.shape)

(204, 12) (204,)


## Define a dict of ML models to be studied

In [5]:
# Define classifiers
classifiers = {
    'LR':  LogisticRegression(),
    'GNB': GaussianNB(),
    'kNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVM': SVC()
}

## Cross Validation 

Here we just perform a simple CV, just to see...

In [6]:
from sklearn.model_selection import cross_validate

def cross_validation_function(clf, scoring, cv):
    
    cv = cross_validate(clf, X, y, scoring=scoring, cv=cv)
    
    mean_ = cv["test_score"].mean()
    return mean_

for name, clf in classifiers.items():
    score = cross_validation_function(clf, 'matthews_corrcoef', 3)
    print(f'{name} Score: {score.round(4)}')

LR Score: 0.656
GNB Score: 0.708
kNN Score: 0.5884
LDA Score: 0.5596
SVM Score: 0.711


Seems like `SVM` performs best at default settings.

## Optuna function for hyperparameter tuning

In [7]:
def objective(trial, name, model):
    # -- Tune estimator algorithm
    if name == 'LR':
        params = {
            'C':trial.suggest_float('C', 1e-3, 1e3),
            'max_iter':trial.suggest_int('max_iter', 1000, 10000)
        }
    elif name == 'GNB':
        params = {
            'var_smoothing':trial.suggest_float('var_smoothing', 1e-13, 1e-3)
        }
    elif name == 'kNN':
        params = {
                'n_neighbors':trial.suggest_int('n_neighbors', 1, 15),
                'weights':trial.suggest_categorical('weights', ['uniform', 'distance'])        
        }
    elif name == 'LDA':
        params = {
            'solver':trial.suggest_categorical('solver', ['svd', 'lsqr', 'eigen']),
        }
        if params['solver'] != 'svd':
            params['shrinkage'] = trial.suggest_float('shrinkage', 0, 1)
        else:
            params['shrinkage'] = None
    elif name == 'SVM':
        params = {
            'C':trial.suggest_float('C', 1e-3, 1e3),
            'gamma':trial.suggest_float('gamma', 1e-3, 1e3),
            'kernel':trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])   
        }
        
    clf = model.set_params(**params)
    
    # -- Cross-validate the features reduced by dimensionality reduction methods
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
    score = cross_val_score(clf, X, y, 
                            scoring='f1_macro', cv=cv_inner)
    score = score.mean()
    return score

def optuna_tuning(name, clf):
    _objective = lambda trial: objective(trial, name, clf)

    sampler = TPESampler(seed=42) # create a seed for the sampler for reproducibility
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(_objective, n_trials=200)
    
    model = clf.set_params(**study.best_params)
    model.fit(X, y)

    score = cross_validation_function(clf, 'matthews_corrcoef', 3)
    return score

## Hyperparameter tuning for all models

In [8]:
SCORES = []
for name, clf in tqdm(classifiers.items()):
    score = optuna_tuning(name,clf)
    SCORES.append(score)

100%|█████████████████████████████████████████████| 5/5 [00:17<00:00,  3.59s/it]


In [9]:
results = pd.DataFrame(columns=classifiers.keys()).T
results['score'] = SCORES
results

Unnamed: 0,score
LR,0.746241
GNB,0.708032
kNN,0.598557
LDA,0.583057
SVM,0.730496


The the `Logistic Regression` outperforms all the other models.

# Build nested Cross Validation (nCV) pipeline

For the outer loop we will use K=5 folds and for the inner loop L=3 folds.

In [10]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

TEST_SCORE_NESTED = []
MODEL = []

N_TRIALS = 10
for name, clf in tqdm(classifiers.items()):
    for i in range(N_TRIALS):

        _objective = lambda trial: objective(trial, name, clf)
        sampler = TPESampler(seed=42) # create a seed for the sampler for reproducibility
        study = optuna.create_study(direction="maximize", 
                                    sampler=sampler)
        study.optimize(_objective, n_trials=250)
        model = clf.set_params(**study.best_params)
        model.fit(X, y)

        # Nested CV with parameter optimization
        test_score = cross_val_score(model, X, y, cv=cv_outer, 
                                     scoring='matthews_corrcoef', n_jobs=2)

        TEST_SCORE_NESTED.append(test_score.mean())  
        MODEL.append(name)

  0%|                                                     | 0/5 [00:00<?, ?it/s][33m[W 2023-04-07 10:46:35,472][0m Trial 134 failed with parameters: {'C': 106.20196996353101, 'max_iter': 7520} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/comex/Desktop/python-envs/main-bio/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_10300/1146561653.py", line 12, in <lambda>
    _objective = lambda trial: objective(trial, name, clf)
  File "/tmp/ipykernel_10300/3349314402.py", line 37, in objective
    score = cross_val_score(clf, X, y,
  File "/home/comex/Desktop/python-envs/main-bio/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 515, in cross_val_score
    cv_results = cross_validate(
  File "/home/comex/Desktop/python-envs/main-bio/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 266, in cross_va

KeyboardInterrupt: 

In [None]:
all_scores = pd.DataFrame()
all_scores['model'] = MODEL
all_scores['score'] = TEST_SCORE_NESTED
all_scores