# Load optuna library

In [2]:
import numpy as np
import pandas as pd
import os, sys 
import optuna

# Load the dataset and split

In [3]:
from HELPpy.preprocess.loaders import load_features
from HELPpy.utility.utils import pandas_readcsv
from sklearn.model_selection import train_test_split
path = '../../data'
tissue = 'Kidney'
attributes = load_features([os.path.join(path, f'{tissue}_BIO.csv'), 
                            os.path.join(path, f'{tissue}_CCcfs.csv'),
                            os.path.join(path, f'{tissue}_EmbN2V_128.csv')
                           ], 
                            fixnans=[True, True, False], normalizes=['std', 'std', None], verbose=False, show_progress=True)
labelnme = f'{tissue}_HELP.csv'
label = pandas_readcsv(os.path.join(path,labelnme), descr=f'{labelnme}', index_col=0).replace({'E': 1, 'aE':0, 'sNE': 0})
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes = attributes.loc[idx_common]
label = label.loc[idx_common]
X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

Kidney_CCcfs.csv:   0%|          | 0/19298 [00:00<?, ?it/s]

Kidney_EmbN2V_128.csv:   0%|          | 0/19314 [00:00<?, ?it/s]

Kidney_HELP.csv:   0%|          | 0/17829 [00:00<?, ?it/s]

# Start tuning session

In [19]:
tissue = 'Kidney'
from sklearn.metrics import *
from HELPpy.models.prediction import VotingEnsembleLGBM
def objective(trial):
    params = {
        'lr': trial.suggest_loguniform('lr', 1e-4, 1e-2),
        'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"])
    }
    gbm = VotingEnsembleLGBM(**params)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    ba = balanced_accuracy_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)
    return ba, mcc

study = optuna.create_study(
    study_name=f'velgbm_{tissue}',
    directions=["minimize", "maximize"],
    load_if_exists=True,
    #storage=f'sqlite:///{savepath}/gat_{name}.db'
    )
study.optimize(objective, n_trials=50)
best_params = study.best_params
print('Best Params:', best_params)
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba.csv'))
print(df.head())


[I 2024-05-30 22:59:13,235] A new study created in memory with name: velgbm_Kidney
  'lr': trial.suggest_loguniform('lr', 1e-4, 1e-2),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
  'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
[I 2024-05-30 22:59:46,904] Trial 0 finished with values: [0.8738892689147306, 0.4766843516019192] and parameters: {'lr': 0.00021130662473824654, 'n_voters': 16, 'n_estimators': 60, 'boosting_type': 'dart'}. 
  'lr': trial.suggest_loguniform('lr', 1e-4, 1e-2),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
  'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
[I 2024-05-30 23:00:19,085] Trial 1 finished with values: [0.8738892689147306, 0.4766843516019192] and parameters: {'lr': 0.003058936897508187, 'n_voters': 16, 'n_estimators': 60, 'boosting_type': 'dart'}. 
  'lr': trial.suggest_loguniform('lr', 1e-4, 1e-2),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
  'n_estimators': trial.suggest_int('n_estimator

RuntimeError: A single best trial cannot be retrieved from a multi-objective study. Consider using Study.best_trials to retrieve a list containing the best trials.

In [24]:
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba.csv'))
print(df.head())


   number  values_0  values_1             datetime_start  \
0       0  0.873889  0.476684 2024-05-30 22:59:13.237040   
1       1  0.873889  0.476684 2024-05-30 22:59:46.905270   
2       2  0.682455  0.516213 2024-05-30 23:00:19.086077   
3       3  0.866603  0.503723 2024-05-30 23:00:56.316209   
4       4  0.875229  0.506053 2024-05-30 23:01:27.243873   

           datetime_complete               duration params_boosting_type  \
0 2024-05-30 22:59:46.903947 0 days 00:00:33.666907                 dart   
1 2024-05-30 23:00:19.085195 0 days 00:00:32.179925                 dart   
2 2024-05-30 23:00:56.315368 0 days 00:00:37.229291                 gbdt   
3 2024-05-30 23:01:27.242660 0 days 00:00:30.926451                 dart   
4 2024-05-30 23:02:39.030473 0 days 00:01:11.786600                 dart   

   params_lr  params_n_estimators  params_n_voters  \
0   0.000211                   60               16   
1   0.003059                   60               16   
2   0.004207        

In [33]:
pd.read_csv(f'velgbm_{tissue}_hypersearch_mcc_ba.csv').sort_values(['values_1', 'values_0'], ascending=[False, False])
study.trials[10].params

{'lr': 0.00010976010981843044,
 'n_voters': 11,
 'n_estimators': 130,
 'boosting_type': 'gbdt'}

# Test the best model

In [34]:
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
clf = VotingEnsembleLGBM(**study.trials[10].params)
# train using create_model
df_scores, scores, predictions = k_fold_cv(attributes, label, clf, n_splits=5, seed=0, show_progress=True, verbose=True)
df_scores

{0: 0, 1: 1}
label
0        15994
1         1242
dtype: int64
Classification with VotingEnsembleLGBM...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]