In [8]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from utils import preprocessing
from imblearn.over_sampling import SMOTE, KMeansSMOTE
from models import train
# import pyswarms as ps
# from pyswarms.discrete.binary import BinaryPSO
import matplotlib.pyplot as plt
# from pyswarms.utils.plotters import plot_contour, plot_surface
# from pyswarms.utils.plotters.formatters import Designer
# from pyswarms.utils.plotters.formatters import Mesher
# import pygad
import optuna
from optuna.samplers import TPESampler, CmaEsSampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
np.random.seed(1066)

In [4]:
def fit_model_multiclass_no_train_test(model, X_train, X_test, y_train, y_test, method):

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # print('ACCURACY:', round(accuracy_score(y_test, pred), 3))
    # print('BALANCED ACCURACY', round(balanced_accuracy_score(y_test, pred), 3))
    # print('F-SCORE:', round(f1_score(y_test, pred, average='macro'), 3))
    # print('PRECISION:', round(precision_score(y_test, pred, average='macro'), 3))
    # print('RECALL:', round(recall_score(y_test, pred, average='macro'), 3))

    df_metrics = pd.DataFrame(index=['ACCURACY', 'BALANCED ACCURACY', 
        'F-SCORE', 'PRECISION', 'RECALL'], data={method: [round(accuracy_score(y_test, pred), 3), 
                                                        round(balanced_accuracy_score(y_test, pred), 3),
                                                        round(f1_score(y_test, pred, average='macro'), 3),
                                                        round(precision_score(y_test, pred, average='macro'), 3),
                                                        round(recall_score(y_test, pred, average='macro'), 3),
                                                        ]})
    return model, df_metrics

In [12]:
# accepted_class = ['SEK', 'ACK', 'NEV', 'CBC', 'CEC', 'MEL']
# df = pd.read_excel('espectros_multiclass.xlsx')
# df = df[df['Classe'].isin(accepted_class)].drop(columns=['N', 'Proj', 'Sample'])
# df['y'] = LabelEncoder().fit_transform(df['Classe'])
# print(df['Classe'].value_counts())
# df_train, df_test = train_test_split(df, test_size=0.30, random_state=42, stratify=df['y'])
# print(df_train['Classe'].value_counts())
# print(df_test['Classe'].value_counts())

# df_train.reset_index(drop=True).to_excel('multilabel_data/treinamento-raw-multiclasse.xlsx', index=False)
# df_test.reset_index(drop=True).to_excel('multilabel_data/teste-multiclasse.xlsx', index=False)

# # SMOTE
# sm = SMOTE(random_state=1066, k_neighbors=3)
# X_smote, y_smote = sm.fit_resample(df_train.drop(columns=['Classe', 'y']), df_train['y'])
# df_smote = pd.concat([pd.DataFrame(X_smote), pd.DataFrame(y_smote)], axis=1)
# df_smote.to_excel('multilabel_data/treinamento-smote-raw-multiclasse.xlsx', index=False)

# X_train = df_train.drop(columns=['Classe', 'y'])
# y_train = df_train['y']
# X_test = df_test.drop(columns=['Classe', 'y'])
# y_test = df_test['y']

# X_snv_train = pd.DataFrame(preprocessing.SNV(X_train), columns=X_train.columns)
# X_snv_test = pd.DataFrame(preprocessing.SNV(X_test), columns=X_test.columns)

# # GENERATE FEATURES DATASET
# X_features_train = preprocessing.create_features(
#         X_snv_train, n_subsets=26, mode='stats')

# df_features_train = pd.concat([X_features_train, y_train.reset_index(drop=True)], axis=1)
# df_features_train.to_excel('multilabel_data/treinamento-raw-features-multiclass.xlsx')

# X_features_test = preprocessing.create_features(
#         X_snv_test, n_subsets=26, mode='stats')

# df_features_test = pd.concat([X_features_test, y_test.reset_index(drop=True)], axis=1)
# df_features_test.to_excel('multilabel_data/teste-features-multiclass.xlsx')

# X_snv_train = pd.DataFrame(preprocessing.SNV(X_smote), columns=X_smote.columns)

# # GENERATE FEATURES DATASET
# X_features_train = preprocessing.create_features(
#         X_snv_train, n_subsets=26, mode='stats')

# df_features_train = pd.concat([X_features_train, y_smote.reset_index(drop=True)], axis=1)
# df_features_train.to_excel('multilabel_data/treinamento-smote-features-multiclass.xlsx')

ACK    296
CBC    158
SEK    107
NEV     41
CEC     38
MEL      6
Name: Classe, dtype: int64

In [9]:
# RAW DATA
df_train = pd.read_excel('multilabel_data/treinamento-raw-multiclasse.xlsx')
df_test = pd.read_excel('multilabel_data/teste-multiclasse.xlsx')

X_train = df_train.drop(columns=['Classe', 'y'])
y_train = df_train['y']
X_test = df_test.drop(columns=['Classe', 'y'])
y_test = df_test['y']

X_snv_train = pd.DataFrame(preprocessing.SNV(X_train), columns=X_train.columns)
X_snv_test = pd.DataFrame(preprocessing.SNV(X_test), columns=X_test.columns)

X_features_train = pd.read_excel('multilabel_data/treinamento-raw-features-multiclass.xlsx', index_col=0).drop(columns=['y'])
X_features_test = pd.read_excel('multilabel_data/teste-features-multiclass.xlsx', index_col=0).drop(columns=['y'])

print('No preprocess')
print("X: ", X_train.shape, X_test.shape)
print("y: ", y_train.shape, y_test.shape)

print('SNV')
print("X: ", X_snv_train.shape, X_snv_test.shape)

print('Features')
print(X_features_train.shape, X_features_test.shape)

No preprocess
X:  (452, 125) (194, 125)
y:  (452,) (194,)
SNV
X:  (452, 125) (194, 125)
Features
(452, 312) (194, 312)


In [17]:
# SMOTE

df_train = pd.read_excel('multilabel_data/treinamento-smote-raw-multiclasse.xlsx')
df_test = pd.read_excel('multilabel_data/teste-multiclasse.xlsx')

X_train = df_train.drop(columns=['y'])
y_train = df_train['y']
X_test = df_test.drop(columns=['Classe', 'y'])
y_test = df_test['y']

X_snv_train = pd.DataFrame(preprocessing.SNV(X_train), columns=X_train.columns)
X_snv_test = pd.DataFrame(preprocessing.SNV(X_test), columns=X_test.columns)

X_features_train = pd.read_excel('multilabel_data/treinamento-smote-features-multiclass.xlsx', index_col=0).drop(columns=['y'])
X_features_test = pd.read_excel('multilabel_data/teste-features-multiclass.xlsx', index_col=0).drop(columns=['y'])

print('No preprocess')
print(X_train.shape, X_test.shape)

print('SNV')
print(X_snv_train.shape, X_snv_test.shape)

print('Features')
print(X_features_train.shape, X_features_test.shape)

No preprocess
(1242, 125) (194, 125)
SNV
(1242, 125) (194, 125)
Features
(1242, 312) (194, 312)


#### No Preprocessing

In [11]:
# XGBOOST

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        "objective": "multi:softmax",
        "eval_metric": "f1",
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 25),
        'max_depth': trial.suggest_int('max_depth', 1, 15, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 20, step=1),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 20, step=1)
    }

    model = XGBClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_train, X_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

# study = optuna.create_study(sampler=TPESampler(), direction='maximize')
# study.optimize(objective, n_trials=200)

# print("Number of finished trials: {}".format(len(study.trials)))

# print("Best trial:")
# trial = study.best_trial

# print("Value: {}".format(trial.value))

# print("Params: ")
# for key, value in trial.params.items():
#     print("{}: {}".format(key, value))

# params = {
#     "objective": "multi:softmax",
#     "eval_metric": "f1",
#     "scale_pos_weight": trial.params['scale_pos_weight'],
#     "max_depth": trial.params['max_depth'],
#     "n_estimators": trial.params['n_estimators'],
#     "learning_rate": trial.params['learning_rate'],
#     "colsample_bytree": trial.params['colsample_bytree'],
#     "subsample": trial.params['subsample'],
#     "reg_alpha": trial.params['reg_alpha'],
#     "reg_lambda": trial.params['reg_lambda']
#     }

params = {
    "objective": "multi:softmax",
    "eval_metric": "f1",
    # 'scale_pos_weight': 6,
    # 'max_depth': 7,
    # 'n_estimators': 74,
    # 'learning_rate': 0.7081324139034884,
    # 'colsample_bytree': 0.8084301280514788,
    # 'subsample': 0.45732257564871703,
    # 'reg_alpha': 9,
    # 'reg_lambda': 1,
}

model = XGBClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_train, X_test, y_train, y_test, 'model')

df_metrics_xgboost

XGBoostError: [08:14:59] ../src/metric/metric.cc:49: Unknown metric function f1
Stack trace:
  [bt] (0) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x251a2d) [0x7f5b13e80a2d]
  [bt] (1) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x251bf1) [0x7f5b13e80bf1]
  [bt] (2) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x21566b) [0x7f5b13e4466b]
  [bt] (3) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x20fa69) [0x7f5b13e3ea69]
  [bt] (4) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x68) [0x7f5b13cd8688]
  [bt] (5) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/lib-dynload/../../libffi.so.7(+0x69dd) [0x7f5b6bf059dd]
  [bt] (6) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/lib-dynload/../../libffi.so.7(+0x6067) [0x7f5b6bf05067]
  [bt] (7) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x319) [0x7f5b6bf1e1e9]
  [bt] (8) /home/flavioloss/miniconda3/envs/starfish/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0x13c95) [0x7f5b6bf1ec95]



In [12]:
# CATBOOST

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        'verbose': 0,
        'random_seed': 1066,
        'boosting_type': trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        'max_depth': trial.suggest_int('max_depth', 1, 16), 
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 40),
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1)
    }

    model = CatBoostClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_train, X_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
        'verbose': 0,
        'random_seed': 1066,
        'objective': 'MultiClass',
        'boosting_type': trial.params['boosting_type'],
        'max_depth': trial.params['max_depth'], 
        'l2_leaf_reg': trial.params['l2_leaf_reg'],
        'n_estimators': trial.params['n_estimators'],
        'rsm': trial.params['rsm'],
        'learning_rate': trial.params['learning_rate']
    }

params = {
    'verbose': 0,
    'random_seed': 1066,
    'objective': 'MultiClass',
    'boosting_type': 'Ordered', 
    'max_depth': 1, 
    'l2_leaf_reg': 26, 
    'n_estimators': 158, 
    'rsm': 0.6536290864675189, 
    'learning_rate': 0.5156903366818304
    }

model = CatBoostClassifier(**params)

model, df_metrics = fit_model_multiclass_no_train_test(model, 
    X_train, X_test, y_train, y_test, 'metrics')

df_metrics

[32m[I 2023-01-27 08:15:25,200][0m A new study created in memory with name: no-name-cb30fa97-48c0-4b11-8ef6-a032b6319d0d[0m


In [None]:
# LIGHTGBM

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        'objective': 'multiclass',
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 15),
        "n_estimators": trial.suggest_int('n_estimators', 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 5000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    model = LGBMClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_train, X_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
    'objective': 'multiclass',
    'scale_pos_weight': trial.params['scale_pos_weight'],
    'n_estimators': trial.params['n_estimators'],
    'learning_rate': trial.params['learning_rate'],
    'num_leaves': trial.params['num_leaves'],
    'max_depth': trial.params['max_depth']
}

model = LGBMClassifier(**params)

model, df_metrics = fit_model_multiclass_no_train_test(model, 
    X_train, X_test, y_train, y_test, 'df_metrics')

df_metrics

In [20]:
# CNN

#### SNV Data

In [None]:
# XGBOOST

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        "objective": "multi:softmax",
        "eval_metric": "auc",
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 25),
        'max_depth': trial.suggest_int('max_depth', 1, 15, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 20, step=1),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 20, step=1)
    }

    model = XGBClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_snv_train, X_snv_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
    "objective": "multi:softmax",
    "eval_metric": "f1",
    "scale_pos_weight": trial.params['scale_pos_weight'],
    "max_depth": trial.params['max_depth'],
    "n_estimators": trial.params['n_estimators'],
    "learning_rate": trial.params['learning_rate'],
    "colsample_bytree": trial.params['colsample_bytree'],
    "subsample": trial.params['subsample'],
    "reg_alpha": trial.params['reg_alpha'],
    "reg_lambda": trial.params['reg_lambda']
    }

model = XGBClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_snv_train, X_snv_test, y_train, y_test, 'xgboost')

In [None]:
# CATBOOST

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        'verbose': 0,
        'random_seed': 1066,
        'boosting_type': trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        'max_depth': trial.suggest_int('max_depth', 1, 16), 
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 40),
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1)
    }

    model = CatBoostClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_snv_train, X_snv_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
        'verbose': 0,
        'random_seed': 1066,
        'objective': 'MultiClass',
        'boosting_type': trial.params['boosting_type'],
        'max_depth': trial.params['max_depth'], 
        'l2_leaf_reg': trial.params['l2_leaf_reg'],
        'n_estimators': trial.params['n_estimators'],
        'rsm': trial.params['rsm'],
        'learning_rate': trial.params['learning_rate']
    }

model = CatBoostClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_snv_train, X_snv_test, y_train, y_test, 'xgboost')

In [None]:
# LIGHTGBM

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        'objective': 'multiclass',
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 15),
        "n_estimators": trial.suggest_int('n_estimators', 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 5000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    model = LGBMClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_snv_train, X_snv_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
    'objective': 'multiclass',
    'scale_pos_weight': trial.params['scale_pos_weight'],
    'n_estimators': trial.params['n_estimators'],
    'learning_rate': trial.params['learning_rate'],
    'num_leaves': trial.params['num_leaves'],
    'max_depth': trial.params['max_depth']
}

model = LGBMClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_snv_train, X_snv_test, y_train, y_test, 'xgboost')

In [None]:
# CNN

#### Features Data

In [None]:
# XGBOOST

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        "objective": "multi:softmax",
        "eval_metric": "auc",
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 25),
        'max_depth': trial.suggest_int('max_depth', 1, 15, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 20, step=1),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 20, step=1)
    }

    model = XGBClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_features_train, X_features_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
    "objective": "multi:softmax",
    "eval_metric": "f1",
    "scale_pos_weight": trial.params['scale_pos_weight'],
    "max_depth": trial.params['max_depth'],
    "n_estimators": trial.params['n_estimators'],
    "learning_rate": trial.params['learning_rate'],
    "colsample_bytree": trial.params['colsample_bytree'],
    "subsample": trial.params['subsample'],
    "reg_alpha": trial.params['reg_alpha'],
    "reg_lambda": trial.params['reg_lambda']
    }

model = XGBClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_features_train, X_features_test, y_train, y_test, 'xgboost')

In [None]:
# CATBOOST

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        'verbose': 0,
        'random_seed': 1066,
        'boosting_type': trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        'max_depth': trial.suggest_int('max_depth', 1, 16), 
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 40),
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1)
    }

    model = CatBoostClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_features_train, X_features_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
        'verbose': 0,
        'random_seed': 1066,
        'objective': 'MultiClass',
        'boosting_type': trial.params['boosting_type'],
        'max_depth': trial.params['max_depth'], 
        'l2_leaf_reg': trial.params['l2_leaf_reg'],
        'n_estimators': trial.params['n_estimators'],
        'rsm': trial.params['rsm'],
        'learning_rate': trial.params['learning_rate']
    }

model = CatBoostClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_features_train, X_features_test, y_train, y_test, 'xgboost')

In [None]:
# LIGHTGBM

def objective(trial):
    global  X_train, X_test, y_train, y_test, X_snv_train, X_snv_test, X_features_train, X_features_test

    params = {
        'objective': 'multiclass',
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 15),
        "n_estimators": trial.suggest_int('n_estimators', 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 5000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    model = LGBMClassifier(**params)
    
    model, metrics_df = fit_model_multiclass_no_train_test(model,
        X_features_train, X_features_test, y_train, y_test, "model")


    return metrics_df['model'].values[0]

study = optuna.create_study(sampler=TPESampler(), direction='maximize')
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

params = {
    'objective': 'multiclass',
    'scale_pos_weight': trial.params['scale_pos_weight'],
    'n_estimators': trial.params['n_estimators'],
    'learning_rate': trial.params['learning_rate'],
    'num_leaves': trial.params['num_leaves'],
    'max_depth': trial.params['max_depth']
}

model = LGBMClassifier(**params)

model, df_metrics_xgboost = fit_model_multiclass_no_train_test(model, 
    X_features_train, X_features_test, y_train, y_test, 'xgboost')

In [21]:
# CNN

callbacks_list = [
    # keras.callbacks.ModelCheckpoint(
    #     filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
    #     monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)
]


model = Sequential()
model.add(Conv1D(256, 2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
model.add(Conv1D(128, 2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
model.add(Conv1D(64, 2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
model.add(Conv1D(32, 2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
model.add(Dense(16, activation="relu"))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(6, activation = 'softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', 
     optimizer = "adam",               
              metrics = ['accuracy'])
print(model.summary())

model.fit(X_features_train, y_train, batch_size=16, epochs=200, 
        #   callbacks=callbacks_list, 
          verbose=1)

acc = model.evaluate(X_features_train, y_train)
print("Loss:", acc[0], " Accuracy:", acc[1])

pred = model.predict(X_features_test)
pred_y = pred.argmax(axis=-1)

cm = confusion_matrix(y_test, pred_y)
print(cm)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow import keras
import tensorflow as tf

def eval_model_multiclass(y_test, pred, method):

    # print('ACCURACY:', round(accuracy_score(y_test, pred), 3))
    # print('BALANCED ACCURACY', round(balanced_accuracy_score(y_test, pred), 3))
    # print('F-SCORE:', round(f1_score(y_test, pred, average='macro'), 3))
    # print('PRECISION:', round(precision_score(y_test, pred, average='macro'), 3))
    # print('RECALL:', round(recall_score(y_test, pred, average='macro'), 3))

    df_metrics = pd.DataFrame(index=['ACCURACY', 'BALANCED ACCURACY', 
        'F-SCORE', 'PRECISION', 'RECALL'], data={method: [round(accuracy_score(y_test, pred), 3), 
                                                                    round(balanced_accuracy_score(y_test, pred), 3),
                                                                    round(f1_score(y_test, pred, average='macro'), 3),
                                                                    round(precision_score(y_test, pred, average='macro'), 3),
                                                                    round(recall_score(y_test, pred, average='macro'), 3),
                                                                    ]})
    return df_metrics


def objective(trial):
    global X_snv_train, X_snv_test, y_train, y_test, X_features_train, X_features_test
    tf.random.set_seed(1066)

    n_conv_layers = trial.suggest_int('n_conv_layers', 1, 3)
    # n_deep_layers = trial.suggest_int('n_deep_layers', 1, 3)
    model = Sequential()
    if n_conv_layers >= 1:
        model.add(Conv1D(trial.suggest_int('conv_neurons_1', 64, 256), 
            2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
    if n_conv_layers >= 2:
        model.add(Conv1D(trial.suggest_int('conv_neurons_2', 64, 256), 
            2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
    if n_conv_layers >= 3:
        model.add(Conv1D(trial.suggest_int('conv_neurons_3', 64, 256), 
            2, activation="relu", input_shape=(X_features_train.shape[1], X_features_train.shape[2])))
    model.add(Dense(trial.suggest_int('deep_neurons_1', 64, 256), activation="relu"))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(6, activation = 'softmax'))
    model.compile(loss = 'sparse_categorical_crossentropy', 
          optimizer=tf.keras.optimizers.Adam(
          learning_rate=trial.suggest_categorical('learning_rate', [0.01, 0.001, 0.0001])),               
          metrics = ['accuracy'])

    model.fit(X_features_train, y_train, 
      batch_size=trial.suggest_int('batch_size', 16, 128),
      epochs=200, 
      verbose=0)
    
    pred = model.predict(X_features_test)
    pred_y = pred.argmax(axis=-1)

    df_metrics_neural_net = eval_model_multiclass(y_test, pred_y, method='NeuralNet_Features')
    
    return df_metrics_neural_net['NeuralNet_Features'].values[0]