# Load optuna library

In [2]:
import numpy as np
import pandas as pd
import os, sys 
import optuna

# Load the dataset and split

In [3]:
from HELPpy.preprocess.loaders import load_features
from HELPpy.utility.utils import pandas_readcsv
from sklearn.model_selection import train_test_split
path = '../../data'
tissue = 'Kidney'
attributes = load_features([os.path.join(path, f'{tissue}_BIO.csv'), 
                            os.path.join(path, f'{tissue}_CCcfs.csv'),
                            os.path.join(path, f'{tissue}_EmbN2V_128.csv')
                           ], 
                            fixnans=[True, True, False], normalizes=['std', 'std', None], verbose=False, show_progress=True)
labelnme = f'{tissue}_HELP.csv'
label = pandas_readcsv(os.path.join(path,labelnme), descr=f'{labelnme}', index_col=0).replace({'E': 1, 'aE':0, 'sNE': 0})
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes = attributes.loc[idx_common]
label = label.loc[idx_common]
X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

Kidney_CCcfs.csv:   0%|          | 0/19298 [00:00<?, ?it/s]

Kidney_EmbN2V_128.csv:   0%|          | 0/19314 [00:00<?, ?it/s]

Kidney_HELP.csv:   0%|          | 0/17829 [00:00<?, ?it/s]

# Start tuning session

In [16]:
tissue = 'Kidney'
from sklearn.metrics import *
from HELPpy.models.prediction import VotingEnsembleLGBM
def objective(trial):
    params = {
        'lr': trial.suggest_loguniform('lr', 1e-4, 1e-2),
        'n_voters': trial.suggest_int('n_voters', 1, 20, 1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"])
    }
    gbm = VotingEnsembleLGBM(**params)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    ba = balanced_accuracy_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)
    return ba, mcc

study = optuna.create_study(
    study_name=f'velgbm_{tissue}',
    direction=["minimize", "maximize"],
    load_if_exists=True,
    #storage=f'sqlite:///{savepath}/gat_{name}.db'
    )
study.optimize(objective, n_trials=50)
best_params = study.best_params
print('Best Params:', best_params)
df = study.trials_dataframe()
df.to_csv(os.path.join(f'velgbm_{tissue}_hypersearch.csv'))
print(df.head())


SyntaxError: invalid syntax (<ipython-input-16-642597212acf>, line 21)

In [12]:
pd.read_csv(f'velgbm_{tissue}_hypersearch.csv').sort_values('value', ascending=False)
study.best_trial

FrozenTrial(number=26, state=TrialState.COMPLETE, values=[0.888324761639347], datetime_start=datetime.datetime(2024, 5, 30, 22, 15, 8, 371116), datetime_complete=datetime.datetime(2024, 5, 30, 22, 16, 2, 200853), params={'lr': 0.0014991586485202525, 'n_voters': 20, 'n_estimators': 100, 'boosting_type': 'gbdt'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'lr': FloatDistribution(high=0.01, log=True, low=0.0001, step=None), 'n_voters': IntDistribution(high=20, log=False, low=1, step=1), 'n_estimators': IntDistribution(high=200, log=False, low=50, step=10), 'boosting_type': CategoricalDistribution(choices=('gbdt', 'dart'))}, trial_id=26, value=None)

# TEst the best model

In [15]:
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
clf = VotingEnsembleLGBM(**best_params)
# train using create_model
df_scores, scores, predictions = k_fold_cv(attributes, label, clf, n_splits=5, seed=0, show_progress=True, verbose=True)
df_scores

{0: 0, 1: 1}
label
0        15994
1         1242
dtype: int64
Classification with VotingEnsembleLGBM...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,measure
ROC-AUC,0.9550±0.0049
Accuracy,0.8322±0.0038
BA,0.8769±0.0081
Sensitivity,0.8247±0.0050
Specificity,0.9291±0.0192
MCC,0.4635±0.0082
CM,"[[13190, 2804], [88, 1154]]"
