In [None]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

df = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1345082 3T3\CURAGEM\curated_binary.csv")
df

In [None]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

x = pd.DataFrame(x)
y = pd.DataFrame(y)

In [None]:
df.groupby('Outcome').size()

In [4]:
df['mol'] = [Chem.MolFromSmiles(x) for x in df['SMILES']]
fps = np.array([AllChem.GetMorganFingerprintAsBitVect(x, radius = 2, nBits = 1024, useFeatures=False) for x in df['mol']])

In [5]:
x = fps
y = df['Outcome']

x = pd.DataFrame(x)
y = pd.DataFrame(y)

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=42, stratify=y)

In [7]:
# fit balanced xgboost on an imbalanced classification dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.metrics import *
import xgboost
import lightgbm as lgb
import asyncio
import time
import concurrent.futures
RANDOM_STATE_SEED = 42
import warnings
warnings.filterwarnings("ignore")
import sklearn 
import sklearn.gaussian_process

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=RANDOM_STATE_SEED)

scorer = 'balanced_accuracy'

lgb_model = lgb.LGBMClassifier(**{
     'learning_rate': 0.03,
     'num_leaves': 31,
     'max_bin': 5,
     'num_iterations': 300,
     'min_child_samples': 10,
     'objective': 'binary',
     'n_jobs': -1,
     'scale_pos_weight':  1,
     'n_estimators':200})

grid= {
        'max_depth': (1, 40, 'uniform'),
        'max_bin': (50, 1500, 'uniform'),
        'num_leaves': (31, 80, 'uniform'),
        'learning_rate': (0.001, 0.35, 'log-uniform'),
        'n_estimators':  (50, 1500, 'uniform'),
        'colsample_bytree': (0.2, 1.0, 'uniform'), 
        'min_child_weight': (1, 10),
        'min_child_samples': (1, 20),
        'max_delta_step': (1, 20),
        'subsample': (0.1, 1.0, 'uniform'),       
        'alpha': (1, 5, 'uniform'),  
        'scale_pos_weight': (1.0, 2.0, 'uniform'),  # integer valued parameter #scale definido por estar com classe 1 maior que 0
    }

opt = BayesSearchCV(
   lgb_model,
   grid,
    n_iter= 12,
    cv=cv,
    scoring=scorer, 
    random_state= RANDOM_STATE_SEED,
    verbose= False
)

_start = time.time()
opt.fit(x_train, y_train)

print(f"Execution time: {  (time.time() - _start )/60 }")

print("val. score: %s" % opt.best_score_)
print("best params: %s" % str(opt.best_params_))

[LightGBM] [Info] Number of positive: 2564, number of negative: 40072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 42636, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060137 -> initscore=-2.749109
[LightGBM] [Info] Start training from score -2.749109
[LightGBM] [Info] Number of positive: 2565, number of negative: 40072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028357 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 42637, number of used features: 1024
[LightGBM] [Info

In [9]:
from IPython.display import clear_output
from collections import defaultdict
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from imblearn.metrics import sensitivity_score, specificity_score
from imblearn.metrics import classification_report_imbalanced

# Define evaluation procedure
#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=RANDOM_STATE_SEED)

# Define scoring functions
def compute_metrics(y_test, y_pred):
    return {
        'BAAC': balanced_accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted'),
        'ROC AUC': roc_auc_score(y_test, y_pred),
        'Sensitivity': sensitivity_score(y_test, y_pred, average='weighted'),
        'Specificity': specificity_score(y_test, y_pred, average='weighted'),
    }

# Perform cross-validation
results = cross_validate(opt.best_estimator_, x_train.values, y_train, scoring=compute_metrics, cv=cv)

# Aggregate metrics
average_metrics = defaultdict(list)
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(x_train.values, y_train.values)):
    x_train_fold, x_test_fold = x_train.values[train_idx], x_train.values[test_idx]
    y_train_fold, y_test_fold = y_train.values[train_idx], y_train.values[test_idx]
    model = opt.best_estimator_.fit(x_train_fold, y_train_fold)
    y_pred = model.predict(x_test_fold)
    fold_metrics = compute_metrics(y_test_fold, y_pred)
    for metric, value in fold_metrics.items():
        average_metrics[metric].append(value)

clear_output()
for metric, values in average_metrics.items():
    mean_value = np.mean(values)
    std_dev = np.std(values)
    print(f'{metric}: {mean_value:.2f} +- {std_dev:.2f}')

BAAC: 0.81 +- 0.01
precision: 0.96 +- 0.00
recall: 0.96 +- 0.00
F1 Score: 0.96 +- 0.00
ROC AUC: 0.81 +- 0.01
Sensitivity: 0.96 +- 0.00
Specificity: 0.65 +- 0.03


In [17]:
from sklearn.metrics import confusion_matrix
import pandas as pd

# Perform external validation
preds = opt.best_estimator_.predict(x_test)
y = y_test

# Calculate metrics
balanced_accuracy = round(balanced_accuracy_score(y, preds), 2)
f1 = round(f1_score(y, preds, average='weighted'), 2)
roc_auc = round(roc_auc_score(y, preds, multi_class='ovr'), 2)
mcc = round(matthews_corrcoef(y, preds), 2)
precision = round(precision_score(y, preds, average='weighted'), 2)

# Calculate confusion matrix
cm = confusion_matrix(y, preds)

# Calculate sensitivity and specificity
sensitivity = round(cm[1, 1] / (cm[1, 1] + cm[1, 0]), 2)  # Corrected
specificity = round(cm[0, 0] / (cm[0, 0] + cm[0, 1]), 2)  # Unchanged

clear_output()
# Print metrics
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"F1-score: {f1}")
print(f"AUC: {roc_auc}")
print(f"MCC: {mcc}")
print(f"Precision: {precision}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")

# Print confusion matrix
print("\nConfusion Matrix:")
# Assign TP, FP, TN, FN
TP = cm[1, 1]
FP = cm[0, 1]
TN = cm[0, 0]
FN = cm[1, 0]

# Print TP, FP, TN, FN
print(f"True Positives: {TP}")
print(f"False Positives: {FP}")
print(f"True Negatives: {TN}")
print(f"False Negatives: {FN}")


Balanced Accuracy: 0.82
F1-score: 0.97
AUC: 0.82
MCC: 0.68
Precision: 0.96
Sensitivity: 0.66
Specificity: 0.99

Confusion Matrix:
True Positives: 526
False Positives: 177
True Negatives: 12346
False Negatives: 275
