# Load optuna library

In [1]:
import numpy as np
import pandas as pd
import os, sys 
import optuna

# Load the dataset

In [2]:
from HELPpy.preprocess.loaders import load_features
from HELPpy.utility.utils import pandas_readcsv
# Input directory and the tissue whose feature/label files are loaded.
path = '../../data'
tissue = 'Kidney'

# The three per-tissue feature tables handed to load_features; the `fixnans`
# and `normalizes` lists below carry one entry per file, in the same order.
feature_files = [
    os.path.join(path, f'{tissue}_BIO.csv'),
    os.path.join(path, f'{tissue}_CCcfs.csv'),
    os.path.join(path, f'{tissue}_EmbN2V_128.csv'),
]
attributes = load_features(
    feature_files,
    fixnans=[True, True, False],
    normalizes=['std', 'std', None],
    verbose=False,
    show_progress=True,
)

# Read the label table and binarise it: 'E' becomes 1, 'aE' and 'sNE' become 0.
labelnme = f'{tissue}_HELP.csv'
label = pandas_readcsv(
    os.path.join(path, labelnme),
    descr=labelnme,
    index_col=0,
).replace({'E': 1, 'aE': 0, 'sNE': 0})

# Keep only the rows whose index appears in both the features and the labels.
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes, label = attributes.loc[idx_common], label.loc[idx_common]

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

Kidney_CCcfs.csv:   0%|          | 0/19298 [00:00<?, ?it/s]

Kidney_EmbN2V_128.csv:   0%|          | 0/19314 [00:00<?, ?it/s]

Kidney_HELP.csv:   0%|          | 0/17829 [00:00<?, ?it/s]

# Start tuning session

In [4]:
tissue = 'Kidney'
from __future__ import annotations
from sklearn.metrics import *
from HELPpy.models.prediction import VotingEnsembleLGBM
from optuna import Trial

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer
from HELPpy.models.prediction import VotingEnsembleLGBM
def objective_cv(trial : Trial, X : pd.DataFrame, y : np.ndarray | pd.Series, random_state : int=42):
    """Optuna objective: mean cross-validated balanced accuracy of a VotingEnsembleLGBM.

    Four hyper-parameters are sampled from ``trial`` (learning rate on a log
    scale in [1e-3, 0.1], number of voters in 1..20, number of estimators in
    60..200 in steps of 20, and the boosting type, "gbdt" or "dart"). The
    resulting model is scored with a shuffled, stratified 5-fold
    cross-validation.

    Parameters
    ----------
    trial : Trial
        The Optuna trial used to draw the hyper-parameters.
    X : pd.DataFrame
        Feature matrix passed unchanged to ``cross_val_score``.
    y : np.ndarray | pd.Series
        Target labels passed unchanged to ``cross_val_score``.
    random_state : int, default 42
        Seed of the ``StratifiedKFold`` shuffling only; it is not forwarded
        to the model.

    Returns
    -------
    The mean of the five per-fold balanced-accuracy scores.
    """
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        n_voters=trial.suggest_int('n_voters', 1, 20, step=1),
        n_estimators=trial.suggest_int('n_estimators', 60, 200, step=20),
        boosting_type=trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
    )
    ensemble = VotingEnsembleLGBM(**search_space)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(
        ensemble,
        X,
        y,
        scoring=make_scorer(balanced_accuracy_score),
        cv=folds,
    )
    return np.mean(fold_scores)

# Directory holding the Optuna SQLite database. SQLite does not create missing
# parent directories, so make sure the folder exists before the storage URL
# is opened (otherwise create_study fails with "unable to open database file").
savepath = '../results'
os.makedirs(savepath, exist_ok=True)

# load_if_exists=True re-opens the study if the database already contains one
# with this name, instead of raising an error.
study = optuna.create_study(
    study_name=f'velgbm_{tissue}',
    direction='maximize',
    load_if_exists=True,
    storage=f'sqlite:///{savepath}/veLGBM_{tissue}_ba_cv.db'
    )

# Maximise the cross-validated balanced accuracy returned by objective_cv.
# Because the study is persisted and re-loaded, every execution of this cell
# runs 50 further trials on top of the ones already stored.
study.optimize(lambda trial: objective_cv(trial, attributes, label), n_trials=50)
best_params = study.best_params
print('Best Params:', best_params)

# Export the trial history as CSV into the current working directory
# (a single-argument os.path.join was a no-op and has been dropped),
# then display the trials ranked by objective value, best first.
df = study.trials_dataframe()
df.to_csv(f'velgbm_{tissue}_hypersearch_ba_cv_2.csv')
df.sort_values('value', ascending=False)

[I 2024-06-14 18:23:04,652] Using an existing study with name 'velgbm_Kidney' instead of creating a new one.


# Test the best model

In [5]:
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
# Instantiate the ensemble with the best hyper-parameters found by the study.
clf = VotingEnsembleLGBM(**study.best_params)
# Evaluate it through k_fold_cv (5 splits, seed 0); the call returns the
# summary score table, the scores and the predictions.
df_scores, scores, predictions = k_fold_cv(
    attributes,
    label,
    clf,
    n_splits=5,
    seed=0,
    show_progress=True,
    verbose=True,
)
df_scores

{0: 0, 1: 1}
label
0        15994
1         1242
Name: count, dtype: int64



5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,measure
ROC-AUC,0.9574±0.0045
Accuracy,0.9158±0.0015
BA,0.8860±0.0108
Sensitivity,0.8511±0.0238
Specificity,0.9208±0.0027
MCC,0.5845±0.0101
CM,"[[14728, 1266], [185, 1057]]"


In [6]:
# Render the saved hyper-parameter search history as a LaTeX table ranked by
# balanced accuracy (the objective column 'value' is renamed to 'BA').
# NOTE(review): the tuning cell writes 'velgbm_{tissue}_hypersearch_ba_cv_2.csv'
# whereas this cell reads the hard-coded '..._ba_cv.csv' — confirm which file
# is intended.
df = (
    pd.read_csv('velgbm_Kidney_hypersearch_ba_cv.csv')
    .sort_values('value', ascending=False)
    .rename(columns={'value': 'BA'})
)
# Strip the 'params_' prefix from the hyper-parameter column names.
df.columns = df.columns.str.replace(r'params_', '')
selcolumns = ['boosting_type', 'learning_rate', 'n_estimators', 'n_voters', 'BA']
# Column standard deviations; only referenced by a disabled experiment that
# combined BA and MCC into a single score, so unused by the table below.
stds = df.std(numeric_only=True)
print(df[selcolumns].sort_values('BA', ascending=False).to_latex())

\begin{tabular}{llrrrr}
\toprule
 & boosting_type & learning_rate & n_estimators & n_voters & BA \\
\midrule
44 & gbdt & 0.098461 & 200 & 8 & 0.892022 \\
33 & gbdt & 0.081512 & 200 & 8 & 0.890714 \\
43 & gbdt & 0.049017 & 200 & 8 & 0.888816 \\
34 & gbdt & 0.079823 & 200 & 7 & 0.887190 \\
23 & gbdt & 0.065070 & 140 & 13 & 0.886479 \\
48 & gbdt & 0.045185 & 180 & 9 & 0.886413 \\
35 & gbdt & 0.076854 & 200 & 7 & 0.886389 \\
21 & gbdt & 0.050629 & 160 & 9 & 0.886107 \\
49 & gbdt & 0.057144 & 200 & 8 & 0.886095 \\
37 & gbdt & 0.077908 & 200 & 7 & 0.886046 \\
32 & gbdt & 0.034572 & 180 & 11 & 0.885119 \\
22 & gbdt & 0.039571 & 180 & 12 & 0.884895 \\
41 & gbdt & 0.051064 & 200 & 7 & 0.884837 \\
29 & gbdt & 0.030759 & 180 & 10 & 0.884531 \\
6 & gbdt & 0.065527 & 100 & 12 & 0.884275 \\
0 & gbdt & 0.031088 & 180 & 11 & 0.884031 \\
31 & gbdt & 0.031174 & 180 & 10 & 0.883759 \\
15 & gbdt & 0.041630 & 160 & 9 & 0.883406 \\
24 & gbdt & 0.040524 & 140 & 15 & 0.882707 \\
20 & gbdt & 0.099690 & 60 & 10

# Load existing study from DB

In [19]:
import optuna
# Re-open the persisted Kidney study from its SQLite storage.
# If this raises a RuntimeError about an incompatible table schema, the
# database was written by an older Optuna release; upgrade it once with
#   optuna storage upgrade --storage sqlite:///../../results/veLGBM_Kidney_ba_cv.db
# NOTE(review): the tuning cell stores its database under '../results' while
# this URL points to '../../results' — confirm both refer to the same file.
study = optuna.load_study(
    study_name='velgbm_Kidney',
    storage="sqlite:///../../results/veLGBM_Kidney_ba_cv.db",
)

RuntimeError: The runtime optuna version 3.6.1 is no longer compatible with the table schema (set up by optuna 3.1.1). Please execute `$ optuna storage upgrade --storage $STORAGE_URL` for upgrading the storage.

In [24]:
import pandas as pd
# Print the complete search history (all columns) as a LaTeX table, best
# objective value first. The '.' before sort_values was missing, which made
# the line a SyntaxError; the former .replace({'': ''}) step only mapped the
# empty string to itself and has been removed.
print(pd.read_csv("velgbm_Kidney_hypersearch_ba_cv.csv").sort_values('value', ascending=False).to_latex())

\begin{tabular}{lrrrllllrrrl}
\toprule
 & Unnamed: 0 & number & value & datetime_start & datetime_complete & duration & params_boosting_type & params_learning_rate & params_n_estimators & params_n_voters & state \\
\midrule
37 & 37 & 37 & 0.893151 & 2024-06-05 11:48:54.638670 & 2024-06-05 11:50:51.129284 & 0 days 00:01:56.490614 & gbdt & 0.094505 & 200 & 13 & COMPLETE \\
15 & 15 & 15 & 0.891459 & 2024-06-05 11:16:36.759285 & 2024-06-05 11:17:48.725832 & 0 days 00:01:11.966547 & gbdt & 0.098300 & 140 & 10 & COMPLETE \\
44 & 44 & 44 & 0.890954 & 2024-06-05 12:01:09.467990 & 2024-06-05 12:03:01.279431 & 0 days 00:01:51.811441 & gbdt & 0.076452 & 200 & 12 & COMPLETE \\
43 & 43 & 43 & 0.890826 & 2024-06-05 11:59:18.529245 & 2024-06-05 12:01:09.439684 & 0 days 00:01:50.910439 & gbdt & 0.075168 & 200 & 12 & COMPLETE \\
41 & 41 & 41 & 0.890602 & 2024-06-05 11:55:19.637376 & 2024-06-05 11:57:14.427060 & 0 days 00:01:54.789684 & gbdt & 0.078591 & 200 & 13 & COMPLETE \\
33 & 33 & 33 & 0.890241 & 