# Import Basic Libraries

In [45]:
import pandas as pd
import numpy as np

from tqdm import tqdm

# Import libraries for ml methods
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef, balanced_accuracy_score, \
    f1_score, fbeta_score, recall_score, precision_score, average_precision_score, accuracy_score

# Data loading

In [46]:
data = pd.read_csv('Hepatitis_C.csv')
data

Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,label
0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,0
1,45,0,41.7,73.2,43.6,29.4,6.4,8.89,5.31,71.0,67.4,70.3,0
2,55,0,41.5,59.5,15.4,16.2,6.8,6.35,5.22,80.0,12.4,69.9,0
3,53,0,37.8,98.1,30.5,21.1,4.0,5.02,4.42,94.0,23.2,65.2,0
4,56,1,39.7,66.0,14.2,20.8,3.5,7.48,5.88,66.0,7.2,67.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,62,1,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5,1
200,64,1,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3,1
201,64,1,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0,1
202,46,1,33.0,62.7,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0,1


In [47]:
print(data.shape)

(204, 13)


In [48]:
data.dtypes

Age        int64
Sex        int64
ALB      float64
ALP      float64
ALT      float64
AST      float64
BIL      float64
CHE      float64
CHOL     float64
CREA     float64
GGT      float64
PROT     float64
label      int64
dtype: object

In [49]:
data.describeribe()

Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,label
count,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0
mean,47.779412,0.357843,41.20049,66.647549,29.841618,48.688725,16.460294,7.934951,5.212157,86.037255,55.117157,72.811765,0.333333
std,10.615323,0.480545,5.656545,33.630161,30.205844,50.826309,29.763884,2.569216,1.128499,83.340139,78.465442,5.581961,0.472564
min,19.0,0.0,20.0,11.3,0.9,12.2,1.8,1.42,1.43,8.0,7.0,51.0,0.0
25%,40.0,0.0,38.95,52.0,14.9,22.425,5.8,6.6375,4.445,66.625,18.6,70.0,0.0
50%,48.0,0.0,41.9,62.7,21.5,29.8,8.8,7.97,5.22,75.6,28.7,72.75,0.0
75%,56.0,1.0,45.0,76.25,35.025,47.5,14.025,9.6525,5.9575,86.25,64.2,76.1,1.0
max,76.0,1.0,62.9,416.6,258.0,324.0,254.0,16.41,8.28,1079.1,650.9,90.0,1.0


### Missing data 

In [39]:
data.isnull().sum()

Age      0
Sex      0
ALB      0
ALP      0
ALT      0
AST      0
BIL      0
CHE      0
CHOL     0
CREA     0
GGT      0
PROT     0
label    0
dtype: int64

## Split data to X and y

In [40]:
X = data.drop('label', axis=1).copy()
y = data['label'].copy()

print(X.shape, y.shape)

(204, 12) (204,)


# Normalize data

In [42]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

print(X.shape, y.shape)

(204, 12) (204,)


## Define a dict of ML models to be studied

In [43]:
# Define classifiers
classifiers = {
    'LR':  LogisticRegression(),
    'GNB': GaussianNB(),
    'kNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVM': SVC()
}

## Cross Validation 

Here we just perform a simple CV, just to see...

In [44]:
from sklearn.model_selection import cross_validate

def cross_validation_function(clf, scoring, cv):
    
    cv = cross_validate(clf, X, y, scoring=scoring, cv=cv)
    
    mean_ = cv["test_score"].mean()
    return mean_

for name, clf in classifiers.items():
    score = cross_validation_function(clf, 'matthews_corrcoef', 3)
    print(f'{name} Score: {score.round(4)}')

LR Score: 0.656
GNB Score: 0.708
kNN Score: 0.5884
LDA Score: 0.5596
SVM Score: 0.711


Seems like `SVM` performs best at default settings.

## Hyperparameter tuning

In [7]:
# Define hyperparameter grid for each model
LR_param_grid = {
    'C': np.logspace(-3, 3, 100),
}

GNB_param_grid = {
    'var_smoothing': np.logspace(-13, -3, 100)
}

kNN_param_grid = {
    'n_neighbors': np.arange(1, 20),
    'weights': ['uniform', 'distance']
}

LDA_param_grid = {
    'solver': ['svd', 'lsqr', 'eigen'],
}

SVM_param_grid = {
    'C': np.logspace(-3, 1, 100),
    'gamma': np.logspace(-3, 1, 100),
    'kernel': ['linear', 'rbf']
}

# Define the parameter grids
param_grids = {
    'LR': LR_param_grid,
    'GNB': GNB_param_grid,
    'kNN': kNN_param_grid,
    'LDA': LDA_param_grid,
    'SVM': SVM_param_grid
}

# Perform the randomized search
results = []
for name, clf in tqdm(classifiers.items()):
    search = RandomizedSearchCV(clf, param_distributions=param_grids[name], 
                                n_iter=100, cv=3, scoring='f1_macro')
    search.fit(X, y)
    results.append({
        'name': name,
        'best_params': search.best_params_,
        'best_score': search.best_score_
    })

100%|█████████████████████████████████████████████| 5/5 [00:03<00:00,  1.54it/s]


In [8]:
results

[{'name': 'LR',
  'best_params': {'C': 11.497569953977356},
  'best_score': 0.8672879093866696},
 {'name': 'GNB',
  'best_params': {'var_smoothing': 1e-13},
  'best_score': 0.8341248603108186},
 {'name': 'kNN',
  'best_params': {'weights': 'uniform', 'n_neighbors': 1},
  'best_score': 0.8010820251115941},
 {'name': 'LDA',
  'best_params': {'solver': 'svd'},
  'best_score': 0.7405012984131513},
 {'name': 'SVM',
  'best_params': {'kernel': 'linear',
   'gamma': 1.4174741629268062,
   'C': 4.750810162102798},
  'best_score': 0.8630201529469718}]

The the `SVM` outperforms all the other models.

# Build nested Cross Validation (nCV) pipeline

For the outer loop we will use K=5 folds and for the inner loop L=3 folds.

In [9]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

TEST_SCORE_NESTED = []
MODEL = []

N_TRIALS = 10
for name, clf in tqdm(classifiers.items()):
    for i in range(N_TRIALS):
        search = RandomizedSearchCV(clf, param_distributions=param_grids[name], 
                                    n_iter=100, cv=cv_inner, scoring='f1_macro')
        search.fit(X, y)

        # Nested CV with parameter optimization
        test_score = cross_val_score(search, X, y, cv=cv_outer, 
                                     scoring='matthews_corrcoef', n_jobs=2)

        TEST_SCORE_NESTED.append(test_score.mean())  
        MODEL.append(name)

100%|█████████████████████████████████████████████| 5/5 [02:19<00:00, 27.82s/it]


In [11]:
all_scores = pd.DataFrame()
all_scores['model'] = MODEL
all_scores['score'] = TEST_SCORE_NESTED

result = all_scores.groupby("model")["score"].mean()
result

model
GNB    0.686520
LDA    0.726845
LR     0.794657
SVM    0.841266
kNN    0.698282
Name: score, dtype: float64

Finally, the  “winner”  classification algorithm in 10 trials of nCV is the `SVM`.

## Cross-Validation using the whole dataset

In [39]:
clf = SVC()

SVM_param_grid = {
    'C': np.logspace(-2, 1, 500),
    'gamma': np.logspace(-2, 1, 500),
    'kernel': ['linear', 'rbf']
}

# Perform the randomized search
results = []
search = RandomizedSearchCV(clf, param_distributions=SVM_param_grid, 
                            n_iter=500, cv=5, scoring='f1_macro', verbose=1)
search.fit(X, y)
results.append({
    'name': name,
    'best_params': search.best_params_,
    'best_score': search.best_score_
})

results

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[{'name': 'SVM',
  'best_params': {'kernel': 'rbf',
   'gamma': 0.06662654524581153,
   'C': 8.124930210614048},
  'best_score': 0.9154571020964463}]

In [40]:
# Save final model
final_model = search.best_estimator_

import pickle

filename = 'final_model.pkl'
pickle.dump(final_model, open(filename, 'wb'))

# Load the model from disk

loaded_model = pickle.load(open(filename, 'rb'))