In [1]:
import pandas as pd

from sklearn.metrics import make_scorer, matthews_corrcoef, balanced_accuracy_score, \
    f1_score, fbeta_score, recall_score, precision_score, average_precision_score, accuracy_score

In [2]:
df = pd.read_csv('Hepatitis_C.csv')
df

Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,label
0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,0
1,45,0,41.7,73.2,43.6,29.4,6.4,8.89,5.31,71.0,67.4,70.3,0
2,55,0,41.5,59.5,15.4,16.2,6.8,6.35,5.22,80.0,12.4,69.9,0
3,53,0,37.8,98.1,30.5,21.1,4.0,5.02,4.42,94.0,23.2,65.2,0
4,56,1,39.7,66.0,14.2,20.8,3.5,7.48,5.88,66.0,7.2,67.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,62,1,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5,1
200,64,1,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3,1
201,64,1,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0,1
202,46,1,33.0,62.7,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0,1


## Train/Test split 

In [3]:
from sklearn.model_selection import train_test_split

# -- Separate features and label
# (a) drop target column
X = df.drop(columns=['label'])
# (b) make an array with the target column
y = df['label'].copy()

# -- Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, \
                                                    stratify=y, random_state=42)

print(X_train.shape, X_test.shape)

(163, 12) (41, 12)


## Simple Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
score = matthews_corrcoef(y_pred=y_pred, y_true=y_test)
print(score)

results = pd.DataFrame({'true': y_test, 'pred': y_pred})
display(results.head())

0.8409178658720822


Unnamed: 0,true,pred
37,0,0
191,1,1
20,0,0
149,1,0
21,0,0


# Optuna 

Optuna is an open-source hyperparameter optimization framework for Python. It provides a simple and efficient API for defining and optimizing machine learning models. Optuna automates the process of hyperparameter tuning by intelligently exploring the hyperparameter space and finding the optimal set of hyperparameters that minimize a user-defined objective function.

Optuna uses state-of-the-art algorithms for hyperparameter search, such as Tree-structured Parzen Estimator (TPE), Bayesian optimization, and Optuna's original algorithm called "Pruner". It also supports various machine learning frameworks such as PyTorch, TensorFlow, Keras, and Scikit-learn.

With Optuna, you can define a search space of hyperparameters, specify the objective function that you want to minimize, and run the optimization process. The framework then uses its algorithms to search for the best hyperparameters within the specified search space, and returns the optimal set of hyperparameters that yield the best performance of the objective function.

In [5]:
import optuna
from optuna.samplers import TPESampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [6]:
def objective(trial, model):
    # -- Tune estimator algorithm
    # Wrap the objective inside a lambda and call objective inside it
    params = {
            'C':trial.suggest_loguniform('C', 1e-3, 1e3),
            'gamma':trial.suggest_loguniform('gamma', 1e-3, 1e3)
        }

    clf = model.set_params(**params)
    # -- Cross-validate the features reduced by dimensionality reduction methods
    kfold = StratifiedKFold(n_splits=10)
    score = cross_val_score(clf, X_train, y_train, scoring='matthews_corrcoef', cv=kfold)
    score = score.mean()
    return score


model = SVC()

_objective = lambda trial: objective(trial, model)

sampler = TPESampler(seed=42) # create a seed for the sampler for reproducibility
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(_objective, n_trials=100)

[32m[I 2023-04-06 17:10:50,334][0m A new study created in memory with name: no-name-f427c9c7-bba0-4538-acf2-ccadada80749[0m
  'C':trial.suggest_loguniform('C', 1e-3, 1e3),
  'gamma':trial.suggest_loguniform('gamma', 1e-3, 1e3)
[32m[I 2023-04-06 17:10:50,394][0m Trial 0 finished with value: 0.0 and parameters: {'C': 0.1767016940294795, 'gamma': 506.1576888752306}. Best is trial 0 with value: 0.0.[0m
  'C':trial.suggest_loguniform('C', 1e-3, 1e3),
  'gamma':trial.suggest_loguniform('gamma', 1e-3, 1e3)
[32m[I 2023-04-06 17:10:50,458][0m Trial 1 finished with value: 0.0 and parameters: {'C': 24.658329458549105, 'gamma': 3.907967156822881}. Best is trial 0 with value: 0.0.[0m
  'C':trial.suggest_loguniform('C', 1e-3, 1e3),
  'gamma':trial.suggest_loguniform('gamma', 1e-3, 1e3)
[32m[I 2023-04-06 17:10:50,524][0m Trial 2 finished with value: 0.0 and parameters: {'C': 0.008632008168602538, 'gamma': 0.008629132190071854}. Best is trial 0 with value: 0.0.[0m
  'C':trial.suggest_logun

In [8]:
print(study.best_params)
print(study.best_value)

model = SVC().set_params(**study.best_params)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

f1_score(y_pred=y_pred, y_true=y_test)

{'C': 1.9290764819348207, 'gamma': 0.0011996661220636685}
0.8544451325330676


0.9285714285714286