# Load optuna library

In [2]:
import numpy as np
import pandas as pd
import os, sys 
import optuna

# Load the dataset and split

In [2]:
from HELPpy.preprocess.loaders import load_features
from HELPpy.utility.utils import pandas_readcsv
from sklearn.model_selection import train_test_split
path = '../../data'
tissue = 'Kidney'
attributes = load_features([os.path.join(path, f'{tissue}_BIO.csv'), 
                            os.path.join(path, f'{tissue}_CCcfs.csv'),
                            os.path.join(path, f'{tissue}_EmbN2V_128.csv')
                           ], 
                            fixnans=[True, True, False], normalizes=['std', 'std', None], verbose=False, show_progress=True)
labelnme = f'{tissue}_HELP.csv'
label = pandas_readcsv(os.path.join(path,labelnme), descr=f'{labelnme}', index_col=0).replace({'E': 1, 'aE':0, 'sNE': 0})
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes = attributes.loc[idx_common]
label = label.loc[idx_common]
X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

Kidney_CCcfs.csv:   0%|          | 0/18608 [00:00<?, ?it/s]

Kidney_EmbN2V_128.csv:   0%|          | 0/19314 [00:00<?, ?it/s]

Kidney_HELP.csv:   0%|          | 0/17829 [00:00<?, ?it/s]

# Start tuning session

In [4]:
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba_2.csv'))
print(df.head())


   number  values_0  values_1             datetime_start  \
0       0  0.855121  0.454820 2024-05-31 12:07:52.971263   
1       1  0.830854  0.492501 2024-05-31 12:08:04.485869   
2       2  0.853799  0.453963 2024-05-31 12:08:19.261596   
3       3  0.879824  0.475868 2024-05-31 12:08:39.113490   
4       4  0.856349  0.548890 2024-05-31 12:08:51.744717   

           datetime_complete               duration params_boosting_type  \
0 2024-05-31 12:08:04.484900 0 days 00:00:11.513637                 dart   
1 2024-05-31 12:08:19.260955 0 days 00:00:14.775086                 gbdt   
2 2024-05-31 12:08:39.112988 0 days 00:00:19.851392                 dart   
3 2024-05-31 12:08:51.744074 0 days 00:00:12.630584                 gbdt   
4 2024-05-31 12:09:07.140017 0 days 00:00:15.395300                 gbdt   

   params_learning_rate  params_n_estimators  params_n_voters  \
0              0.022081                   70               14   
1              0.005422                  140        

In [3]:
tissue = 'Kidney'
from sklearn.metrics import *
from HELPpy.models.prediction import VotingEnsembleLGBM
def objective(trial):
    params = {
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
        'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"])
    }
    gbm = VotingEnsembleLGBM(**params)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    ba = balanced_accuracy_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)
    return ba, mcc

study = optuna.create_study(
    study_name=f'velgbm_{tissue}',
    directions=["minimize", "maximize"],
    load_if_exists=True,
    #storage=f'sqlite:///{savepath}/gat_{name}.db'
    )
study.optimize(objective, n_trials=50)
#best_params = study.best_params
#print('Best Params:', best_params)
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba_2.csv'))
print(df.head())


[I 2024-05-31 12:07:52,970] A new study created in memory with name: velgbm_Kidney
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
  'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
[I 2024-05-31 12:08:04,485] Trial 0 finished with values: [0.8551206370922111, 0.4548197742715752] and parameters: {'learning_rate': 0.02208064774498075, 'n_voters': 14, 'n_estimators': 70, 'boosting_type': 'dart'}. 
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
  'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
[I 2024-05-31 12:08:19,261] Trial 1 finished with values: [0.8308542980922702, 0.4925007603206622] and parameters: {'learning_rate': 0.005421880091003205, 'n_voters': 10, 'n_estimators': 140, 'boosting_type': 'gbdt'}. 
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.

RuntimeError: A single best trial cannot be retrieved from a multi-objective study. Consider using Study.best_trials to retrieve a list containing the best trials.

In [1]:
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba.csv'))
print(df.head())


NameError: name 'study' is not defined

In [16]:
df.std(numeric_only=True).loc['values_1']

0.029591461750105224

In [3]:
tissue='Kidney'
df = pd.read_csv(f'velgbm_{tissue}_hypersearch_mcc_ba_2.csv').sort_values(['values_1', 'values_0'], ascending=[False, False]).rename(columns={'values_0': 'BA', 'values_1': 'MCC'})
df.columns = df.columns.str.replace(r'params_', '')
selcolumns = ['boosting_type',	'learning_rate',	'n_estimators',	'n_voters', 'BA', 'MCC', 'BA+MCC']
stds = df.std(numeric_only=True)
df['BA+MCC'] = df.apply(lambda x: x.BA * stds.loc['BA'] + x.MCC * stds.loc['MCC'], axis=1)
#df['BA+MCC'] = df.apply(lambda x: x.BA  + x.MCC , axis=1)
print(df[selcolumns].sort_values('BA+MCC', ascending=False).to_latex())
study.trials[4].params

\begin{tabular}{llrrrrrr}
\toprule
{} & boosting\_type &  learning\_rate &  n\_estimators &  n\_voters &        BA &       MCC &    BA+MCC \\
\midrule
4  &          gbdt &       0.030814 &           190 &         8 &  0.856349 &  0.548890 &  0.198636 \\
14 &          dart &       0.085037 &           160 &         9 &  0.854957 &  0.524143 &  0.194268 \\
16 &          gbdt &       0.007777 &           150 &         9 &  0.843017 &  0.514440 &  0.191151 \\
36 &          dart &       0.046511 &            90 &         5 &  0.798540 &  0.542199 &  0.190375 \\
13 &          gbdt &       0.031506 &           140 &        17 &  0.880371 &  0.480454 &  0.189993 \\
3  &          gbdt &       0.060048 &            70 &        18 &  0.879824 &  0.475868 &  0.189148 \\
40 &          dart &       0.024741 &           200 &         6 &  0.815120 &  0.520430 &  0.188727 \\
6  &          gbdt &       0.040661 &           120 &         3 &  0.766208 &  0.551943 &  0.188041 \\
10 &          dart &     

NameError: name 'study' is not defined

# Test the best model

In [34]:
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
clf = VotingEnsembleLGBM(**study.trials[10].params)
# train using create_model
df_scores, scores, predictions = k_fold_cv(attributes, label, clf, n_splits=5, seed=0, show_progress=True, verbose=True)
df_scores

{0: 0, 1: 1}
label
0        15994
1         1242
dtype: int64
Classification with VotingEnsembleLGBM...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,measure
ROC-AUC,0.9566±0.0044
Accuracy,0.8934±0.0010
BA,0.8910±0.0114
Sensitivity,0.8938±0.0022
Specificity,0.8881±0.0247
MCC,0.5481±0.0111
CM,"[[14296, 1698], [139, 1103]]"
