# Load optuna library

In [2]:
import numpy as np
import pandas as pd
import os, sys 
import optuna

# Load the dataset and split

In [3]:
from HELPpy.preprocess.loaders import load_features
from HELPpy.utility.utils import pandas_readcsv
from sklearn.model_selection import train_test_split
path = '../../data'
tissue = 'Kidney'
attributes = load_features([os.path.join(path, f'{tissue}_BIO.csv'), 
                            #os.path.join(path, f'{tissue}_CCcfs.csv'),
                            os.path.join(path, f'{tissue}_EmbN2V_128.csv')
                           ], 
                            fixnans=[True, True, False], normalizes=['std', 'std', None], verbose=False, show_progress=True)
labelnme = f'{tissue}_HELP.csv'
label = pandas_readcsv(os.path.join(path,labelnme), descr=f'{labelnme}', index_col=0).replace({'E': 1, 'aE':0, 'sNE': 0})
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes = attributes.loc[idx_common]
label = label.loc[idx_common]
X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

Kidney_EmbN2V_128.csv:   0%|          | 0/19314 [00:00<?, ?it/s]

Kidney_HELP.csv:   0%|          | 0/17829 [00:00<?, ?it/s]

  label = pandas_readcsv(os.path.join(path,labelnme), descr=f'{labelnme}', index_col=0).replace({'E': 1, 'aE':0, 'sNE': 0})


# Start tuning session

In [6]:
tissue = 'Kidney'
from sklearn.metrics import *
from HELPpy.models.prediction import VotingEnsembleLGBM
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'n_voters': trial.suggest_int('n_voters', 1, 20, step=1),
        #'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"])
    }
    gbm = VotingEnsembleLGBM(**params)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    ba = balanced_accuracy_score(y_test, preds)
    #mcc = matthews_corrcoef(y_test, preds)
    return ba#, mcc

study = optuna.create_study(
    study_name=f'velgbm_{tissue}',
    #directions=["minimize", "maximize"],
    direction='maximize',
    load_if_exists=True,
    #storage=f'sqlite:///{savepath}/gat_{name}.db'
    )
study.optimize(objective, n_trials=50)
#best_params = study.best_params
#print('Best Params:', best_params)
df = study.trials_dataframe()
#df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba_2.csv'))
print(df.head())


[I 2024-06-04 16:13:04,422] A new study created in memory with name: velgbm_Kidney
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
[I 2024-06-04 16:13:47,941] Trial 0 finished with value: 0.8406585425586718 and parameters: {'learning_rate': 0.0688291506446236, 'n_voters': 10, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.8406585425586718.
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
[I 2024-06-04 16:14:33,709] Trial 1 finished with value: 0.8401868882639575 and parameters: {'learning_rate': 0.08281998218599068, 'n_voters': 11, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.8406585425586718.
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
[I 2024-06-04 16:15:05,624] Trial 2 finished with value: 0.5 and parameters: {'learning_rate

   number     value             datetime_start          datetime_complete  \
0       0  0.840659 2024-06-04 16:13:04.423539 2024-06-04 16:13:47.940819   
1       1  0.840187 2024-06-04 16:13:47.943669 2024-06-04 16:14:33.709639   
2       2  0.500000 2024-06-04 16:14:33.710131 2024-06-04 16:15:05.624432   
3       3  0.738861 2024-06-04 16:15:05.624908 2024-06-04 16:16:00.426407   
4       4  0.820660 2024-06-04 16:16:00.426874 2024-06-04 16:17:03.621266   

                duration params_boosting_type  params_learning_rate  \
0 0 days 00:00:43.517280                 dart              0.068829   
1 0 days 00:00:45.765970                 dart              0.082820   
2 0 days 00:00:31.914301                 dart              0.002568   
3 0 days 00:00:54.801499                 dart              0.001352   
4 0 days 00:01:03.194392                 dart              0.008155   

   params_n_voters     state  
0               10  COMPLETE  
1               11  COMPLETE  
2                

In [7]:
df.sort_values('value', ascending=False)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting_type,params_learning_rate,params_n_voters,state
29,29,0.866095,2024-06-04 16:32:18.407411,2024-06-04 16:33:04.224870,0 days 00:00:45.817459,gbdt,0.097953,16,COMPLETE
34,34,0.863404,2024-06-04 16:36:51.591944,2024-06-04 16:37:34.745504,0 days 00:00:43.153560,gbdt,0.098955,16,COMPLETE
38,38,0.863154,2024-06-04 16:40:00.699507,2024-06-04 16:40:54.566907,0 days 00:00:53.867400,gbdt,0.066346,16,COMPLETE
39,39,0.862407,2024-06-04 16:40:54.567342,2024-06-04 16:41:46.128273,0 days 00:00:51.560931,gbdt,0.062712,16,COMPLETE
42,42,0.862407,2024-06-04 16:43:16.717273,2024-06-04 16:44:16.974104,0 days 00:01:00.256831,gbdt,0.07691,16,COMPLETE
35,35,0.862207,2024-06-04 16:37:34.745974,2024-06-04 16:38:12.982585,0 days 00:00:38.236611,gbdt,0.072896,16,COMPLETE
41,41,0.861909,2024-06-04 16:42:29.841382,2024-06-04 16:43:16.716730,0 days 00:00:46.875348,gbdt,0.062792,16,COMPLETE
45,45,0.861313,2024-06-04 16:45:37.358545,2024-06-04 16:46:21.482200,0 days 00:00:44.123655,gbdt,0.059643,17,COMPLETE
30,30,0.861215,2024-06-04 16:33:04.225312,2024-06-04 16:33:59.622669,0 days 00:00:55.397357,gbdt,0.097282,18,COMPLETE
31,31,0.860965,2024-06-04 16:33:59.623152,2024-06-04 16:35:03.857797,0 days 00:01:04.234645,gbdt,0.097682,18,COMPLETE


In [1]:
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch_mcc_ba.csv'))
print(df.head())


NameError: name 'study' is not defined

In [16]:
df.std(numeric_only=True).loc['values_1']

0.029591461750105224

In [9]:
tissue='Kidney'
df = pd.read_csv(f'velgbm_{tissue}_hypersearch_mcc_ba_2.csv').sort_values(['values_1', 'values_0'], ascending=[False, False]).rename(columns={'values_0': 'BA', 'values_1': 'MCC'})
df.columns = df.columns.str.replace(r'params_', '')
selcolumns = ['boosting_type',	'learning_rate',	'n_estimators',	'n_voters', 'BA', 'MCC', 'BA+MCC']
stds = df.std(numeric_only=True)
df['BA+MCC'] = df.apply(lambda x: x.BA * stds.loc['BA'] + x.MCC * stds.loc['MCC'], axis=1)
#df['BA+MCC'] = df.apply(lambda x: x.BA  + x.MCC , axis=1)
print(df[selcolumns].sort_values('BA', ascending=False).to_latex())
study.trials[4].params

\begin{tabular}{llrrrrrr}
\toprule
{} & boosting\_type &  learning\_rate &  n\_estimators &  n\_voters &        BA &       MCC &    BA+MCC \\
\midrule
13 &          gbdt &       0.031506 &           140 &        17 &  0.880371 &  0.480454 &  0.189993 \\
3  &          gbdt &       0.060048 &            70 &        18 &  0.879824 &  0.475868 &  0.189148 \\
45 &          dart &       0.051527 &           180 &        19 &  0.871202 &  0.457520 &  0.184974 \\
8  &          gbdt &       0.009561 &           160 &        19 &  0.861139 &  0.427676 &  0.178673 \\
49 &          gbdt &       0.009383 &           170 &        16 &  0.858190 &  0.446332 &  0.181473 \\
44 &          dart &       0.034823 &            80 &        19 &  0.857923 &  0.426451 &  0.178069 \\
4  &          gbdt &       0.030814 &           190 &         8 &  0.856349 &  0.548890 &  0.198636 \\
47 &          dart &       0.031600 &            90 &        19 &  0.856179 &  0.422366 &  0.177161 \\
12 &          dart &     

NameError: name 'study' is not defined

# Test the best model

In [34]:
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
clf = VotingEnsembleLGBM(**study.trials[10].params)
# train using create_model
df_scores, scores, predictions = k_fold_cv(attributes, label, clf, n_splits=5, seed=0, show_progress=True, verbose=True)
df_scores

{0: 0, 1: 1}
label
0        15994
1         1242
dtype: int64
Classification with VotingEnsembleLGBM...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,measure
ROC-AUC,0.9566±0.0044
Accuracy,0.8934±0.0010
BA,0.8910±0.0114
Sensitivity,0.8938±0.0022
Specificity,0.8881±0.0247
MCC,0.5481±0.0111
CM,"[[14296, 1698], [139, 1103]]"
