# Load optuna library

In [2]:
import numpy as np
import pandas as pd
import os, sys 
import optuna

# Load the dataset (features and labels, aligned on common genes)

In [3]:
from HELPpy.preprocess.loaders import load_features
from HELPpy.utility.utils import pandas_readcsv
from sklearn.model_selection import train_test_split
# Folder holding the input CSV files and the tissue whose files are loaded.
path = '../../data'
tissue = 'Kidney'

# Load the three feature tables in a single call. The fixnans / normalizes
# lists have one entry per input file (presumably applied in file order).
attributes = load_features(
    [
        os.path.join(path, f'{tissue}_BIO.csv'),
        os.path.join(path, f'{tissue}_CCcfs.csv'),
        os.path.join(path, f'{tissue}_EmbN2V_128.csv'),
    ],
    fixnans=[True, True, False],
    normalizes=['std', 'std', None],
    verbose=False,
    show_progress=True,
)

# Read the label table, then recode it to binary: 'E' -> 1, 'aE' / 'sNE' -> 0.
labelnme = f'{tissue}_HELP.csv'
label = pandas_readcsv(os.path.join(path, labelnme), descr=f'{labelnme}', index_col=0)
label = label.replace({'E': 1, 'aE': 0, 'sNE': 0})

# Restrict both tables to the index values they share, in the same order.
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes, label = attributes.loc[idx_common], label.loc[idx_common]
#X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
#train = pd.concat([X_train, y_train], axis=1)
#test = pd.concat([X_test, y_test], axis=1)

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

Kidney_EmbN2V_128.csv:   0%|          | 0/19314 [00:00<?, ?it/s]

Kidney_HELP.csv:   0%|          | 0/17829 [00:00<?, ?it/s]

  label = pandas_readcsv(os.path.join(path,labelnme), descr=f'{labelnme}', index_col=0).replace({'E': 1, 'aE':0, 'sNE': 0})


# Start tuning session

In [6]:
tissue = 'Kidney'
from sklearn.metrics import *
from HELPpy.models.prediction import VotingEnsembleLGBM
from optuna import Trial

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer
from HELPpy.models.prediction import VotingEnsembleLGBM
def objective_cv(trial : Trial, X : pd.DataFrame, y : np.ndarray | pd.Series, random_state : int=42):
    """Optuna objective: cross-validated balanced accuracy of a VotingEnsembleLGBM.

    Samples one hyper-parameter configuration from ``trial``, builds a
    ``VotingEnsembleLGBM`` with it and scores it with a shuffled, stratified
    5-fold cross-validation using balanced accuracy.

    Parameters
    ----------
    trial : Trial
        Optuna trial used to draw ``learning_rate``, ``n_voters``,
        ``n_estimators`` and ``boosting_type``.
    X : pd.DataFrame
        Feature matrix passed to ``cross_val_score``.
    y : np.ndarray | pd.Series
        Target labels passed to ``cross_val_score``.
    random_state : int, default 42
        Seed of the ``StratifiedKFold`` shuffling, so every trial is scored
        on the same folds.

    Returns
    -------
    The smaller of the mean and the median of the per-fold balanced
    accuracies, i.e. a pessimistic summary of the fold scores.
    """
    params = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'n_voters': trial.suggest_int('n_voters', 1, 20, step=1),
        # `step` must be given by keyword: passing it positionally is deprecated in Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 60, 200, step=20),
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"])
    }
    gbm = VotingEnsembleLGBM(**params)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    ba_scorer = make_scorer(balanced_accuracy_score)
    scores = cross_val_score(gbm, X, y, scoring=ba_scorer, cv=kf)
    # Taking the min of mean and median penalises configurations whose average
    # is inflated (or whose median hides) a few outlying folds.
    return np.min([np.mean(scores), np.median(scores)])

#def objective(trial):
#    params = {
#        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
#        'n_voters': trial.suggest_int('n_voters', 1, 20, step=1),
#        'n_estimators': trial.suggest_int('n_estimators', 60, 200, 20),
#        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"])
#    }
#    gbm = VotingEnsembleLGBM(**params)
#    gbm.fit(X_train, y_train)
#    preds = gbm.predict(X_test)
#    ba = balanced_accuracy_score(y_test, preds)
    #mcc = matthews_corrcoef(y_test, preds)
#    return ba#, mcc

# Create an in-memory, single-objective study that maximises the CV score.
# The sampler is seeded so that re-running the notebook proposes the same
# sequence of hyper-parameters (TPE is Optuna's default single-objective
# sampler; only its seed is fixed here). Full reproducibility also depends
# on any randomness inside the model itself.
study = optuna.create_study(
    study_name=f'velgbm_{tissue}',
    #directions=["minimize", "maximize"],
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    # Only meaningful once a persistent `storage` is configured below.
    load_if_exists=True,
    #storage=f'sqlite:///{savepath}/gat_{name}.db'
    )
study.optimize(lambda trial: objective_cv(trial, attributes, label), n_trials=50)
#best_params = study.best_params
#print('Best Params:', best_params)

# Persist every trial to CSV in the working directory, then display the
# trials ranked from best to worst objective value.
df = study.trials_dataframe()
df.to_csv(f'velgbm_{tissue}_hypersearch_ba_cv.csv')
df.sort_values('value', ascending=False)

[I 2024-06-04 16:13:04,422] A new study created in memory with name: velgbm_Kidney
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
[I 2024-06-04 16:13:47,941] Trial 0 finished with value: 0.8406585425586718 and parameters: {'learning_rate': 0.0688291506446236, 'n_voters': 10, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.8406585425586718.
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
[I 2024-06-04 16:14:33,709] Trial 1 finished with value: 0.8401868882639575 and parameters: {'learning_rate': 0.08281998218599068, 'n_voters': 11, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.8406585425586718.
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
[I 2024-06-04 16:15:05,624] Trial 2 finished with value: 0.5 and parameters: {'learning_rate

   number     value             datetime_start          datetime_complete  \
0       0  0.840659 2024-06-04 16:13:04.423539 2024-06-04 16:13:47.940819   
1       1  0.840187 2024-06-04 16:13:47.943669 2024-06-04 16:14:33.709639   
2       2  0.500000 2024-06-04 16:14:33.710131 2024-06-04 16:15:05.624432   
3       3  0.738861 2024-06-04 16:15:05.624908 2024-06-04 16:16:00.426407   
4       4  0.820660 2024-06-04 16:16:00.426874 2024-06-04 16:17:03.621266   

                duration params_boosting_type  params_learning_rate  \
0 0 days 00:00:43.517280                 dart              0.068829   
1 0 days 00:00:45.765970                 dart              0.082820   
2 0 days 00:00:31.914301                 dart              0.002568   
3 0 days 00:00:54.801499                 dart              0.001352   
4 0 days 00:01:03.194392                 dart              0.008155   

   params_n_voters     state  
0               10  COMPLETE  
1               11  COMPLETE  
2                

# Test the best model

In [1]:
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
# Rebuild the ensemble with the best hyper-parameters found by the Optuna
# study. `study` lives only in kernel memory, so the tuning cell must have
# been run in this session before this cell.
clf = VotingEnsembleLGBM(**study.best_params)
# Evaluate the tuned model with 5-fold cross-validation and show the score table.
df_scores, scores, predictions = k_fold_cv(attributes, label, clf, n_splits=5, seed=0, show_progress=True, verbose=True)
df_scores

NameError: name 'study' is not defined

In [9]:
#tissue='Kidney'
#df = pd.read_csv(f'velgbm_{tissue}_hypersearch_mcc_ba_2.csv').sort_values(['values_1', 'values_0'], ascending=[False, False]).rename(columns={'values_0': 'BA', 'values_1': 'MCC'})
#df.columns = df.columns.str.replace(r'params_', '')
#selcolumns = ['boosting_type',	'learning_rate',	'n_estimators',	'n_voters', 'BA', 'MCC', 'BA+MCC']
#stds = df.std(numeric_only=True)
#df['BA+MCC'] = df.apply(lambda x: x.BA * stds.loc['BA'] + x.MCC * stds.loc['MCC'], axis=1)
#df['BA+MCC'] = df.apply(lambda x: x.BA  + x.MCC , axis=1)
#print(df[selcolumns].sort_values('BA', ascending=False).to_latex())
#study.trials[4].params

\begin{tabular}{llrrrrrr}
\toprule
{} & boosting\_type &  learning\_rate &  n\_estimators &  n\_voters &        BA &       MCC &    BA+MCC \\
\midrule
13 &          gbdt &       0.031506 &           140 &        17 &  0.880371 &  0.480454 &  0.189993 \\
3  &          gbdt &       0.060048 &            70 &        18 &  0.879824 &  0.475868 &  0.189148 \\
45 &          dart &       0.051527 &           180 &        19 &  0.871202 &  0.457520 &  0.184974 \\
8  &          gbdt &       0.009561 &           160 &        19 &  0.861139 &  0.427676 &  0.178673 \\
49 &          gbdt &       0.009383 &           170 &        16 &  0.858190 &  0.446332 &  0.181473 \\
44 &          dart &       0.034823 &            80 &        19 &  0.857923 &  0.426451 &  0.178069 \\
4  &          gbdt &       0.030814 &           190 &         8 &  0.856349 &  0.548890 &  0.198636 \\
47 &          dart &       0.031600 &            90 &        19 &  0.856179 &  0.422366 &  0.177161 \\
12 &          dart &     

NameError: name 'study' is not defined