# Import Basic Libraries

In [1]:
import pandas as pd
import numpy as np

# Import libraries for ml methods
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef, balanced_accuracy_score, \
    f1_score, fbeta_score, recall_score, precision_score, average_precision_score, accuracy_score

# import optuna for hyperparameter tuning
import optuna
from optuna.samplers import TPESampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Data loading

In [2]:
data = pd.read_csv('Hepatitis_C.csv')
data

Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,label
0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,0
1,45,0,41.7,73.2,43.6,29.4,6.4,8.89,5.31,71.0,67.4,70.3,0
2,55,0,41.5,59.5,15.4,16.2,6.8,6.35,5.22,80.0,12.4,69.9,0
3,53,0,37.8,98.1,30.5,21.1,4.0,5.02,4.42,94.0,23.2,65.2,0
4,56,1,39.7,66.0,14.2,20.8,3.5,7.48,5.88,66.0,7.2,67.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,62,1,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5,1
200,64,1,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3,1
201,64,1,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0,1
202,46,1,33.0,62.7,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0,1


## Split data to X and y

In [3]:
X = data.drop('label', axis=1).copy()
y = data['label'].copy()

print(X.shape, y.shape)

(204, 12) (204,)


# Normalize data

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

print(X.shape, y.shape)

(204, 12) (204,)


## Train / Test split of data

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, \
                                                    stratify=y, random_state=42)

### Define the ML models

In [6]:
# Define classifiers
classifiers = {
    'LR':  LogisticRegression(),
    'GNB': GaussianNB(),
    'kNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVM': SVC()
}

## Define functions

In [7]:
def objective(trial, model):
    # -- Tune estimator algorithm
    # Wrap the objective inside a lambda and call objective inside it
    params_svc = {
            'C':trial.suggest_float("C", 0.01, 10),
            'gamma':trial.suggest_float('gamma', 0.01, 1)
        }
    
    if type(model) is SVC:
        params = params_svc    

    clf = model.set_params(**params)
    
    # -- Cross-validate the features reduced by dimensionality reduction methods
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
    score = cross_val_score(clf, X_train, y_train, 
                            scoring='f1_macro', cv=cv_inner)
    score = score.mean()
    return score

# Build nested Cross Validation (nCV) pipeline

For the outer loop we will use K=5 folds and for the inner loop L=3 folds.

In [8]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [9]:
test_score_nested = []

N_TRIALS = 3
for i in range(N_TRIALS):

    clf = SVC()
    _objective = lambda trial: objective(trial, clf)
    
    sampler = TPESampler(seed=42) # create a seed for the sampler for reproducibility
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(_objective, n_trials=250)
    model = clf.set_params(**study.best_params)
    
    model.fit(X_train, y_train)
   
    # Nested CV with parameter optimization
    test_score = cross_val_score(model, X_train, y_train, cv=cv_outer, scoring='matthews_corrcoef', n_jobs=2)
    
    test_score_nested.append(test_score.mean())

[32m[I 2023-04-06 18:14:06,204][0m A new study created in memory with name: no-name-b0da8394-0cb7-4b93-8cb7-dfa711f16708[0m
[32m[I 2023-04-06 18:14:06,220][0m Trial 0 finished with value: 0.39847744666992346 and parameters: {'C': 3.7516557872851513, 'gamma': 0.951207163345817}. Best is trial 0 with value: 0.39847744666992346.[0m
[32m[I 2023-04-06 18:14:06,322][0m Trial 1 finished with value: 0.8663878810937634 and parameters: {'C': 7.322619478695937, 'gamma': 0.6026718993550663}. Best is trial 1 with value: 0.8663878810937634.[0m
[32m[I 2023-04-06 18:14:06,340][0m Trial 2 finished with value: 0.8993780213441777 and parameters: {'C': 1.5686262180199408, 'gamma': 0.16443457513284063}. Best is trial 2 with value: 0.8993780213441777.[0m
[32m[I 2023-04-06 18:14:06,361][0m Trial 3 finished with value: 0.40072463768115946 and parameters: {'C': 0.5902552855603126, 'gamma': 0.8675143843171859}. Best is trial 2 with value: 0.8993780213441777.[0m
[32m[I 2023-04-06 18:14:06,381][0

In [10]:
print(model)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

matthews_corrcoef(y_pred=y_pred, y_true=y_test)

SVC(C=9.451914076861565, gamma=0.36687459272561085)


0.7344646618036428

In [11]:
all_scores = pd.DataFrame(test_score_nested)
all_scores

Unnamed: 0,0
0,0.818293
1,0.818293
2,0.818293


In [14]:
# automatic nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# create dataset
X, y = make_classification(n_samples=200, n_features=5, random_state=1)
# configure the cross-validation procedure
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = RandomForestClassifier(random_state=1)
# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)
# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.905 (0.047)
