In [4]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

df = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Skin\DADOS\Multiclass\ML\LLNA_curated_multiclass_ML.csv")
df

Unnamed: 0,SMILES,Outcome,Source,CAS No
0,C=CC(C)(O)CCCC(C)C,0,ECHA,18479-49-7
1,O=C1NC2=CC=C([N+](=O)[O-])C=C2N1,0,ECHA,93-84-5
2,CNS(=O)(=O)C1=CC(OC)=C(N)C=C1OC,0,ECHA,49701-24-8
3,CN(C)C1=CC=C(C(C2=CC=C(N(C)C)C=C2)C2=CC=C(N(C)...,0,ECHA,603-48-5
4,CN(C)C(=S)SCCCS(=O)(=O)O,0,ECHA,18880-36-9
...,...,...,...,...
1243,O=[N+]([O-])C1=CC=C(O)C=C1[N+](=O)[O-],1B,SSDB,577-71-9
1244,CCCCCCC#CC(=O)OC,1B,SSDB,111-80-8
1245,O=C1C=CC2=CC(Cl)=CC=C2O1,1B,SSDB,2051-59-4
1246,CCCCCCCC=CC=O,1B,SSDB,3913-71-1


In [5]:
df.dropna(subset=['SMILES'], inplace=True)
df.reset_index(drop=True, inplace=True)

df.dropna(subset=['Outcome'], inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,SMILES,Outcome,Source,CAS No
0,C=CC(C)(O)CCCC(C)C,0,ECHA,18479-49-7
1,O=C1NC2=CC=C([N+](=O)[O-])C=C2N1,0,ECHA,93-84-5
2,CNS(=O)(=O)C1=CC(OC)=C(N)C=C1OC,0,ECHA,49701-24-8
3,CN(C)C1=CC=C(C(C2=CC=C(N(C)C)C=C2)C2=CC=C(N(C)...,0,ECHA,603-48-5
4,CN(C)C(=S)SCCCS(=O)(=O)O,0,ECHA,18880-36-9
...,...,...,...,...
1243,O=[N+]([O-])C1=CC=C(O)C=C1[N+](=O)[O-],1B,SSDB,577-71-9
1244,CCCCCCC#CC(=O)OC,1B,SSDB,111-80-8
1245,O=C1C=CC2=CC(Cl)=CC=C2O1,1B,SSDB,2051-59-4
1246,CCCCCCCC=CC=O,1B,SSDB,3913-71-1


In [6]:
df.groupby('Outcome').size()

Outcome
0     984
1A    124
1B    140
dtype: int64

In [7]:
def string_to_int(s):
    mapping = {"0": 0, "1B": 1, "1A": 2}
    return mapping.get(s, None)

df['Outcome'] = df['Outcome'].apply(string_to_int)

In [8]:
df['mol'] = [Chem.MolFromSmiles(x) for x in df['SMILES']]
fps = np.array([AllChem.GetMorganFingerprintAsBitVect(x, radius = 2, nBits = 1024, useFeatures=False) for x in df['mol']])



In [9]:
x = fps
y = df['Outcome']

x = pd.DataFrame(x)
y = pd.DataFrame(y)

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=42, stratify=y)

In [11]:
from IPython.display import clear_output
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from skopt.callbacks import DeltaYStopper, DeadlineStopper
from sklearn.metrics import *
import lightgbm as lgb
import time
import warnings
warnings.filterwarnings("ignore")
import sklearn
from sklearn.preprocessing import LabelEncoder

RANDOM_STATE_SEED = 42

# Definir procedimento de avaliação
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_STATE_SEED)

# Escolher a métrica de avaliação
scorer = 'balanced_accuracy'

# Definir o modelo LGBM para multiclasse
lgb_model = lgb.LGBMClassifier(**{
    'learning_rate': 0.03,
    'num_leaves': 31,
    'max_bin': 5,
    'num_iterations': 300,
    'min_child_samples': 10,
    'n_jobs': -1,
    'n_estimators': 200,
    'is_unbalance': True,
    'objective': 'multiclass',  # Definir para multiclasse
    'num_class': 3  # Defina o número de classes no seu problema
})

grid = {
    'max_depth': (1, 60, 'uniform'),  # Faixa mais ampla
    'max_bin': (30, 2000, 'uniform'),  # Faixa mais ampla
    'num_leaves': (20, 120, 'uniform'),  # Faixa mais ampla
    'learning_rate': (0.001, 0.5, 'log-uniform'),  # Faixa mais ampla
    'n_estimators': (30, 2000, 'uniform'),  # Faixa mais ampla
    'colsample_bytree': (0.1, 1.0, 'uniform'),  # Faixa mais ampla
    'min_child_weight': (0, 20),  # Faixa mais ampla
    'min_child_samples': (0, 30),  # Faixa mais ampla
    'max_delta_step': (0, 30),  # Faixa mais ampla
    'subsample': (0.05, 1.0, 'uniform'),  # Faixa mais ampla
}

opt = BayesSearchCV(
    lgb_model,
    grid,
    n_iter=50,  # Aumentado para explorar mais combinações
    cv=cv,
    scoring=scorer,
    random_state=RANDOM_STATE_SEED,
    verbose=0  # Definido para False para reduzir a saída
)

# Callbacks
overdone_control = DeltaYStopper(delta=0.0001)
time_limit_control = DeadlineStopper(total_time=60*60*6)  # Limite de horas

_start = time.time()
opt.fit(x_train, y_train, callback=[overdone_control, time_limit_control])

clear_output()

print(f"Tempo de execução: {(time.time() - _start )/60 } minutos")
print("val. score: %s" % opt.best_score_)
print("melhores parâmetros: %s" % str(opt.best_params_))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 768
[LightGBM] [Info] Number of data points in the train set: 798, number of used features: 384
[LightGBM] [Info] Start training from score -0.237977
[LightGBM] [Info] Start training from score -2.182299
[LightGBM] [Info] Start training from score -2.312661
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 798, number of used features: 388
[LightGBM] [Info] Start training from score -0.237977
[LightGBM] [Info] Start training from score -2.193472
[LightGBM] [Info] Start training from score -2.300082
[LightGBM] [Info] Auto-choosing col-wi

In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import math

# Perform external validation
preds = opt.best_estimator_.predict(x_test)
y = y_test

def compute_metrics(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    tp = cm[1, 1]
    tn = cm[0, 0]
    fn = cm[1, 0]
    fp = cm[0, 1]
    Sensitivity = tp / (tp + fn) #recall
    Specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    bacc = (Sensitivity + Specificity) / 2
    mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    f1 = 2 * (precision * Sensitivity) / (precision + Sensitivity)
    auc = roc_auc_score(y_test, y_pred)
    return {
        'BACC': bacc,
        'F1': f1,
        'AUC': auc,
        'MCC': mcc,
        'Precision': precision,
        'Sen': Sensitivity,
        'Spe': Specificity,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn
    }

# Use the function
metrics = compute_metrics(y, preds)

# Print metrics
for metric, value in metrics.items():
    print(f"{metric}: {round(value, 2)}")

In [None]:
from collections import defaultdict
import numpy as np
import math
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold

# Define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=RANDOM_STATE_SEED)

# Define scoring functions
def compute_metrics(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred, labels=opt.best_estimator_.classes_)
    tp = cm[1, 1]
    tn = cm[0, 0]
    fn = cm[1, 0]
    fp = cm[0, 1]
    Sensitivity = tp / (tp + fn) #recall
    Specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    bacc = (Sensitivity + Specificity) / 2
    mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    f1 = 2 * (precision * Sensitivity) / (precision + Sensitivity)
    auc = roc_auc_score(y_test, y_pred)
    return {
        'BACC': bacc,
        'F1': f1,
        'AUC': auc,
        'Precision': precision,
        'Sen': Sensitivity,
        'Spe': Specificity,
    }

# Perform cross-validation
results = cross_validate(opt.best_estimator_, x_train.values, y_train, scoring=compute_metrics, cv=cv)

# Aggregate metrics
average_metrics = defaultdict(list)
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(x_train.values, y_train.values)):
    x_train_fold, x_test_fold = x_train.values[train_idx], x_train.values[test_idx]
    y_train_fold, y_test_fold = y_train.values[train_idx], y_train.values[test_idx]
    model = opt.best_estimator_.fit(x_train_fold, y_train_fold)
    y_pred = model.predict(x_test_fold)
    fold_metrics = compute_metrics(y_test_fold, y_pred)
    for metric, value in fold_metrics.items():
        average_metrics[metric].append(value)

clear_output()
for metric, values in average_metrics.items():
    mean_value = np.mean(values)
    std_dev = np.std(values)
    print(f'{metric}: {mean_value:.2f} +- {std_dev:.2f}')
