In [None]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

# Location of the curated CSV that the rest of the notebook works on.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATASET_CSV = r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1345082 3T3\CURAGEM\curated_binary.csv"

# Read the table into a DataFrame and display it as the cell output.
df = pd.read_csv(DATASET_CSV)
df

In [None]:
# Features: every column of the table except the 'Outcome' label.
x = pd.DataFrame(df.drop('Outcome', axis=1))

# Labels: the 'Outcome' column, wrapped as a single-column DataFrame.
y = pd.DataFrame(df['Outcome'])

In [None]:
# Class balance: number of rows for each distinct 'Outcome' value.
outcome_counts = df.groupby('Outcome').size()
outcome_counts

In [None]:
# Parse every SMILES string into an RDKit molecule and store it next to the data.
df['mol'] = [Chem.MolFromSmiles(smiles) for smiles in df['SMILES']]

# MolFromSmiles returns None for strings it cannot parse. Fail here with the
# offending row labels instead of letting the fingerprint call below raise an
# opaque argument-type error.
invalid_rows = df.index[df['mol'].isna()].tolist()
if invalid_rows:
    raise ValueError(f"Could not parse SMILES for rows: {invalid_rows}")

# Morgan fingerprints (radius 2, 1024 bits, atom-invariant based rather than
# feature based), stacked into one array with one row per molecule.
fps = np.array([
    AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024, useFeatures=False)
    for mol in df['mol']
])

In [None]:
# Model inputs: the fingerprint matrix as a DataFrame (one row per molecule).
x = pd.DataFrame(fps)

# Model target: the 'Outcome' column as a single-column DataFrame.
y = pd.DataFrame(df['Outcome'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the split is stratified on the label
# and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from IPython.display import clear_output
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.metrics import *
import xgboost
import lightgbm as lgb
import time
import warnings
warnings.filterwarnings("ignore")
import sklearn

RANDOM_STATE_SEED = 42

# define evaluation procedure: stratified 5-fold CV, seeded so the folds are reproducible
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=RANDOM_STATE_SEED)

# metric maximised by the hyper-parameter search
scorer = 'balanced_accuracy'

# Base classifier. Keys that also appear in `grid` are overwritten by the search;
# the remaining keys stay fixed for every candidate.
# - `num_iterations` is deliberately NOT set: LightGBM treats it as an alias of
#   `n_estimators`, and an explicit `num_iterations` takes precedence, which
#   would pin the number of trees and turn the `n_estimators` search dimension
#   into a no-op.
# - `subsample_freq` must be non-zero; with LightGBM's default of 0 no bagging
#   is performed and the `subsample` search dimension has no effect.
lgb_model = lgb.LGBMClassifier(**{
     'learning_rate': 0.03,
     'num_leaves': 31,
     'max_bin': 5,
     'min_child_samples': 10,
     'objective': 'binary',
     'n_jobs': -1,
     'scale_pos_weight':  1,
     'subsample_freq': 1,
     'n_estimators': 200})

# Search space handed to BayesSearchCV: (low, high[, prior]) per parameter.
grid = {
        'max_depth': (1, 40, 'uniform'),
        'max_bin': (50, 1500, 'uniform'),
        'num_leaves': (31, 80, 'uniform'),
        'learning_rate': (0.001, 0.35, 'log-uniform'),
        'n_estimators':  (50, 1500, 'uniform'),
        'colsample_bytree': (0.2, 1.0, 'uniform'),
        'min_child_weight': (1, 10),
        'min_child_samples': (1, 20),
        'max_delta_step': (1, 20),
        'subsample': (0.1, 1.0, 'uniform'),
        # L1 regularisation is `reg_alpha` in LightGBM; a bare `alpha` is the
        # huber/quantile loss parameter and is ignored by the binary objective.
        'reg_alpha': (1, 5, 'uniform'),
        'scale_pos_weight': (1.0, 2.0, 'uniform'),
    }

# Bayesian optimisation over `grid`: 12 sampled configurations, each scored
# with `scorer` on the folds produced by `cv`.
opt = BayesSearchCV(
    lgb_model,
    grid,
    n_iter=12,
    cv=cv,
    scoring=scorer,
    random_state=RANDOM_STATE_SEED,
    verbose=False
)

_start = time.time()
# Estimators expect a 1-D target; `.values.ravel()` flattens the single-column
# label frame (and is a no-op for a Series).
opt.fit(x_train, y_train.values.ravel())
elapsed_minutes = (time.time() - _start) / 60

# Drop whatever the fit wrote to the cell output before printing the summary.
clear_output()

# Elapsed wall-clock time of the search, in minutes.
print(f"Execution time: { elapsed_minutes }")

print("val. score: %s" % opt.best_score_)
print("best params: %s" % str(opt.best_params_))

In [None]:
from collections import defaultdict
import numpy as np
import math
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold

from sklearn.base import clone

# Define evaluation procedure: stratified 5-fold CV, seeded for reproducible folds
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=RANDOM_STATE_SEED)

# Define scoring functions
def compute_metrics(y_test, y_pred, y_score=None):
    """Compute binary-classification metrics from true and predicted labels.

    The confusion matrix is laid out in the class order of the fitted
    ``opt.best_estimator_`` (a notebook-level global), and index 1 of that
    order is treated as the positive class.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores for the positive class (for example the second
        column of ``predict_proba``). When given, ROC AUC is computed from
        these scores; when omitted, it falls back to the hard labels in
        ``y_pred``, which is the previous behaviour.

    Returns
    -------
    dict
        Keys ``BACC``, ``F1``, ``AUC``, ``MCC``, ``Precision``, ``Sen``, ``Spe``.
        Any ratio whose denominator is zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Guarded division: a class that is absent or never predicted gives 0.0
        # instead of NaN / ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    cm = confusion_matrix(y_test, y_pred, labels=opt.best_estimator_.classes_)
    # Plain Python ints: the MCC denominator multiplies four counts, which can
    # overflow fixed-width numpy integers on large datasets.
    tp = int(cm[1, 1])
    tn = int(cm[0, 0])
    fn = int(cm[1, 0])
    fp = int(cm[0, 1])
    Sensitivity = _ratio(tp, tp + fn)  # recall
    Specificity = _ratio(tn, tn + fp)
    precision = _ratio(tp, tp + fp)
    bacc = (Sensitivity + Specificity) / 2
    mcc = _ratio(tp * tn - fp * fn,
                 math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    f1 = _ratio(2 * (precision * Sensitivity), precision + Sensitivity)
    # ROC AUC is a ranking metric: prefer continuous scores when available.
    auc = roc_auc_score(y_test, y_pred if y_score is None else y_score)
    return {
        'BACC': bacc,
        'F1': f1,
        'AUC': auc,
        'MCC': mcc,
        'Precision': precision,
        'Sen': Sensitivity,
        'Spe': Specificity,
    }

# Perform cross-validation.
# The folds are evaluated manually rather than through
# cross_validate(scoring=compute_metrics): scikit-learn calls a scoring callable
# as scorer(estimator, X, y), which does not match compute_metrics(y_test, y_pred).
x_values = x_train.values
y_values = y_train.values.ravel()  # 1-D target, as estimators expect

# Aggregate metrics
average_metrics = defaultdict(list)
for train_idx, test_idx in cv.split(x_values, y_values):
    x_train_fold, x_test_fold = x_values[train_idx], x_values[test_idx]
    y_train_fold, y_test_fold = y_values[train_idx], y_values[test_idx]
    # Fit a fresh copy for every fold. Calling .fit() on opt.best_estimator_
    # itself would overwrite the model refit on the full training set and leave
    # it trained on the last fold only.
    model = clone(opt.best_estimator_).fit(x_train_fold, y_train_fold)
    y_pred = model.predict(x_test_fold)
    y_score = model.predict_proba(x_test_fold)[:, 1]  # positive-class probability
    fold_metrics = compute_metrics(y_test_fold, y_pred, y_score)
    for metric, value in fold_metrics.items():
        average_metrics[metric].append(value)

clear_output()
# Report mean and standard deviation of every metric across the folds.
for metric, values in average_metrics.items():
    mean_value = np.mean(values)
    std_dev = np.std(values)
    print(f'{metric}: {mean_value:.2f} +- {std_dev:.2f}')


In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd

# Perform external validation on the held-out test split
preds = opt.best_estimator_.predict(x_test)
# Positive-class probabilities, used for ROC AUC instead of hard labels.
scores = opt.best_estimator_.predict_proba(x_test)[:, 1]
y = y_test

def compute_metrics(y_test, y_pred, y_score=None):
    """Compute binary-classification metrics plus raw confusion-matrix counts.

    The confusion matrix is laid out in the class order of the fitted
    ``opt.best_estimator_`` (a notebook-level global), so it is always 2x2 for
    a binary model even if one class is missing from the inputs; index 1 of
    that order is treated as the positive class.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores for the positive class (for example the second
        column of ``predict_proba``). When given, ROC AUC is computed from
        these scores; when omitted, it falls back to the hard labels in
        ``y_pred``, which is the previous behaviour.

    Returns
    -------
    dict
        Keys ``BACC``, ``F1``, ``AUC``, ``MCC``, ``Precision``, ``Sen``,
        ``Spe``, ``TP``, ``FP``, ``TN``, ``FN``. Any ratio whose denominator
        is zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Guarded division: a class that is absent or never predicted gives 0.0
        # instead of NaN / ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    cm = confusion_matrix(y_test, y_pred, labels=opt.best_estimator_.classes_)
    # Plain Python ints: the MCC denominator multiplies four counts, which can
    # overflow fixed-width numpy integers on large datasets.
    tp = int(cm[1, 1])
    tn = int(cm[0, 0])
    fn = int(cm[1, 0])
    fp = int(cm[0, 1])
    Sensitivity = _ratio(tp, tp + fn)  # recall
    Specificity = _ratio(tn, tn + fp)
    precision = _ratio(tp, tp + fp)
    bacc = (Sensitivity + Specificity) / 2
    mcc = _ratio(tp * tn - fp * fn,
                 math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    f1 = _ratio(2 * (precision * Sensitivity), precision + Sensitivity)
    # ROC AUC is a ranking metric: prefer continuous scores when available.
    auc = roc_auc_score(y_test, y_pred if y_score is None else y_score)
    return {
        'BACC': bacc,
        'F1': f1,
        'AUC': auc,
        'MCC': mcc,
        'Precision': precision,
        'Sen': Sensitivity,
        'Spe': Specificity,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn
    }

# Use the function
metrics = compute_metrics(y, preds, scores)

# Print metrics
for metric, value in metrics.items():
    print(f"{metric}: {round(value, 2)}")