In [None]:
from sklearn.model_selection import StratifiedKFold
from lazyqsar.qsar import LazyBinaryQSAR
from lazyqsar.agnostic import LazyBinaryClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import lazyqsar
import h5py
import tqdm
import os

# Project root. The __file__-based variant is kept for reference;
# presumably it is disabled because this runs as a notebook cell -- TODO confirm.
# root = os.path.dirname(os.path.abspath(__file__))
root = "."

# Common prefix of every input used below: <root>/../output/mtuberculosis
mtb_dir = os.path.join(root, "..", "output", "mtuberculosis")

# Read the dataset table and keep only its first 500 rows:
# X is a list of SMILES strings, Y the `bin` column as an int8 array.
dataset_csv = os.path.join(
    mtb_dir, "datasets", "CHEMBL4649948_IC50_umol.L-1_perc_1.csv.gz"
)
data = pd.read_csv(dataset_csv)
X = data['canonical_smiles'].astype(str).tolist()[:500]
Y = data['bin'].astype(np.int8).to_numpy()[:500]

# Shuffled, seeded 5-fold stratified splitter (consumed by the CV loop)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Pull the precomputed descriptors fully into memory
descriptors_h5 = os.path.join(mtb_dir, "descriptors.h5")
with h5py.File(descriptors_h5, "r") as f:
    SMILES = f['SMILES'][:]
    X_Morgan = f['X_Morgan'][:]

# Lookup table: decoded SMILES string -> its row of X_Morgan.
# Each SMILES entry unpacks into a pair whose second item is the
# UTF-8 encoded SMILES; the first item is not needed here.
SMILES_TO_MORGAN = dict(
    zip((smiles.decode("utf-8") for _, smiles in SMILES), X_Morgan)
)

# Cross-validation: one AUROC is appended per evaluated fold
AUROCS = []
for train, test in skf.split(X, Y):
    # Report the fold sizes (train, test)
    print(len(train), len(test))

    # Labels of each partition
    Y_train, Y_test = Y[train], Y[test]

    # Feature matrices: look up the Morgan fingerprint of every SMILES
    # selected by the fold indices
    X_train = np.array([SMILES_TO_MORGAN[X[i]] for i in train])
    X_test = np.array([SMILES_TO_MORGAN[X[i]] for i in test])

    # Fit a fresh classifier on the training partition only
    model = LazyBinaryClassifier(mode="fast")
    model.fit(X=X_train, y=Y_train)

    # Score the held-out partition using column 1 of predict_proba
    # (presumably the positive-class probability -- TODO confirm)
    test_scores = model.predict_proba(X_test)[:, 1]
    AUROCS.append(roc_auc_score(Y_test, test_scores))

    # NOTE(review): this unconditional break stops after the first fold, so
    # AUROCS ends up with a single value even though 5 splits are defined.
    # Looks like a quick-run setting -- confirm before relying on the CV file.
    break

# Output location, defined once so the model archive and the CV results file
# always share the same directory and file stem.
models_dir = os.path.join(root, "..", "output", "mtuberculosis", "models")
model_stem = "CHEMBL4649948_IC50_umol.L-1_perc_1"

# Create the output directory before the expensive fit, so a missing or
# unwritable directory fails immediately instead of after training.
os.makedirs(models_dir, exist_ok=True)

# Train the final model on all selected samples (no held-out partition).
# X is rebound from the list of SMILES to the fingerprint matrix.
X = np.array([SMILES_TO_MORGAN[smi] for smi in X])
model = LazyBinaryClassifier(mode="fast")
model.fit(X=X, y=Y)

# Save model
model.save(os.path.join(models_dir, model_stem + ".zip"))

# Save CV results: a single comma-separated line of AUROCs rounded to
# 3 decimals (no header, no trailing newline)
with open(os.path.join(models_dir, model_stem + ".csv"), "w") as outfile:
    outfile.write(",".join(str(round(auc, 3)) for auc in AUROCS))

  from .autonotebook import tqdm as notebook_tqdm


400 100


  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.695175: 100%|██████████| 1/1 [00:00<00:00, 40.03it/s]

[I 2025-12-04 14:40:02,955] Trial 0 finished with value: 0.6951754385964911 and parameters: {'k_features': 121, 'alpha': 0.0001}. Best is trial 0 with value: 0.6951754385964911.





[I 2025-12-04 14:40:02,984] A new study created in memory with name: no-name-9d154494-b3b9-4720-ab4e-867c59b0b493
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.741127: 100%|██████████| 1/1 [00:02<00:00,  2.32s/it]

[I 2025-12-04 14:40:05,291] Trial 0 finished with value: 0.741126965140123 and parameters: {'threshold': '1.5*mean'}. Best is trial 0 with value: 0.741126965140123.





[I 2025-12-04 14:40:08,885] A new study created in memory with name: no-name-899b8fc2-6f49-473d-9982-313be5bea2ae
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.678728: 100%|██████████| 1/1 [00:00<00:00, 20.14it/s]

[I 2025-12-04 14:40:08,936] Trial 0 finished with value: 0.6787280701754387 and parameters: {'n_components': 209, 'alpha': 0.0001}. Best is trial 0 with value: 0.6787280701754387.





[I 2025-12-04 14:40:09,002] A new study created in memory with name: no-name-6f25d6e7-9a2e-45fd-a1b3-be2069599e96


[I 2025-12-04 14:40:30,029] Trial 0 finished with value: 0.6162453377538334 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6162453377538334.


[I 2025-12-04 14:40:30,040] A new study created in memory with name: no-name-faabf4fd-cf1c-434e-8466-36effc5eddb3


[I 2025-12-04 14:40:30,076] Trial 0 finished with value: 0.7639176681862134 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7639176681862134.


[I 2025-12-04 14:40:30,094] A new study created in memory with name: no-name-732b970a-c0e8-4446-a0a3-d047366b4052
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.745614: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]

[I 2025-12-04 14:40:31,262] Trial 0 finished with value: 0.7456140350877192 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7456140350877192.





[I 2025-12-04 14:40:31,291] A new study created in memory with name: no-name-ea09ed6e-ae21-4d4d-9e35-08870ef6ee5a


[I 2025-12-04 14:40:32,718] Trial 0 finished with value: 0.8354745130542892 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.8354745130542892.


[I 2025-12-04 14:40:32,724] A new study created in memory with name: no-name-3e90a681-967b-4ff6-9ab6-1e1026ea7c78


[I 2025-12-04 14:40:32,735] Trial 0 finished with value: 0.8954275452410554 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.8954275452410554.


[I 2025-12-04 14:40:32,747] A new study created in memory with name: no-name-8c7632fc-894f-4f9e-aede-6d733dcdca5e


[I 2025-12-04 14:40:36,378] Trial 0 finished with value: 0.7119767923746374 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7119767923746374.


[I 2025-12-04 14:40:36,385] A new study created in memory with name: no-name-2cafb342-9c31-4202-a97c-3eaae388118b


[I 2025-12-04 14:40:36,403] Trial 0 finished with value: 0.7440254178753971 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7440254178753971.


[I 2025-12-04 14:40:36,417] A new study created in memory with name: no-name-8d70a480-351e-4918-b761-dfb25a445d98


[I 2025-12-04 14:40:38,760] Trial 0 finished with value: 0.6509186351706038 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6509186351706038.


[I 2025-12-04 14:40:38,767] A new study created in memory with name: no-name-7aa60f4d-dfb8-4cb3-9e6e-1c25063e5a7e


[I 2025-12-04 14:40:38,804] Trial 0 finished with value: 0.6969194640143666 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6969194640143666.


[I 2025-12-04 14:40:38,824] A new study created in memory with name: no-name-cd2d0faf-fb3b-4bd7-9b87-045c275b8d2c


[I 2025-12-04 14:40:39,408] Trial 0 finished with value: 0.8717105263157895 and parameters: {'n_hidden': 1, 'scale1': 0.5, 'scale2': 0.5, 'dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.8717105263157895.


Predicting chunks...: 0it [00:00, ?it/s]

Predicting chunks...: 1it [00:00, 34.55it/s]


[I 2025-12-04 14:41:19,556] A new study created in memory with name: no-name-b9b87a4b-a63b-40a1-85bc-b240aacf9bca
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.719298: 100%|██████████| 1/1 [00:00<00:00, 49.22it/s]

[I 2025-12-04 14:41:19,575] Trial 0 finished with value: 0.7192982456140351 and parameters: {'k_features': 117, 'alpha': 0.0001}. Best is trial 0 with value: 0.7192982456140351.





[I 2025-12-04 14:41:19,619] A new study created in memory with name: no-name-035cdda0-f128-42ea-af7e-a5c8f8694b40
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.654992: 100%|██████████| 1/1 [00:01<00:00,  1.76s/it]

[I 2025-12-04 14:41:21,372] Trial 0 finished with value: 0.6549917763157895 and parameters: {'threshold': '2*mean'}. Best is trial 0 with value: 0.6549917763157895.





[I 2025-12-04 14:41:23,167] A new study created in memory with name: no-name-73f0cb73-58c2-479e-a683-5e07f2f89e18
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.641404: 100%|██████████| 1/1 [00:00<00:00, 40.24it/s]

[I 2025-12-04 14:41:23,191] Trial 0 finished with value: 0.6414035087719298 and parameters: {'n_components': 250, 'alpha': 0.0001}. Best is trial 0 with value: 0.6414035087719298.





[I 2025-12-04 14:41:23,247] A new study created in memory with name: no-name-c57df0e7-6959-4d25-9f92-715c4f040b66


[I 2025-12-04 14:41:51,063] Trial 0 finished with value: 0.5175070028011204 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.5175070028011204.


[I 2025-12-04 14:41:51,070] A new study created in memory with name: no-name-a0391392-96be-4041-8bd1-d3d1a314ec7f


[I 2025-12-04 14:41:51,109] Trial 0 finished with value: 0.5750175070028012 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.5750175070028012.


[I 2025-12-04 14:41:51,115] A new study created in memory with name: no-name-b4cdff99-8c52-4d1c-88a4-e5975210fd51
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.593842: 100%|██████████| 1/1 [00:00<00:00,  1.23it/s]

[I 2025-12-04 14:41:51,918] Trial 0 finished with value: 0.5938415570175438 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5938415570175438.





[I 2025-12-04 14:41:51,942] A new study created in memory with name: no-name-4624f858-bb02-4aad-af3c-2e415a52cc6a


[I 2025-12-04 14:41:53,608] Trial 0 finished with value: 0.6369922969187675 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6369922969187675.


[I 2025-12-04 14:41:53,615] A new study created in memory with name: no-name-bc673f1b-ab4b-43d6-b144-b1eaf2575e3b


[I 2025-12-04 14:41:53,630] Trial 0 finished with value: 0.8751750700280112 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.8751750700280112.


[I 2025-12-04 14:41:53,643] A new study created in memory with name: no-name-08cc830e-6332-4231-88c9-9e1ea569fb0b


[I 2025-12-04 14:41:57,678] Trial 0 finished with value: 0.5416666666666666 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.5416666666666666.


[I 2025-12-04 14:41:57,686] A new study created in memory with name: no-name-e74f7189-be1a-4425-8a1d-83c59b9d7290


[I 2025-12-04 14:41:57,707] Trial 0 finished with value: 0.6502976190476191 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6502976190476191.


[I 2025-12-04 14:41:57,716] A new study created in memory with name: no-name-b4ee7d4a-24c8-4682-9837-0da0f8c8b6a6


[I 2025-12-04 14:42:01,073] Trial 0 finished with value: 0.5132177871148459 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.5132177871148459.


[I 2025-12-04 14:42:01,079] A new study created in memory with name: no-name-e4305731-2d2e-4e88-9452-99fdb2027a78


[I 2025-12-04 14:42:01,099] Trial 0 finished with value: 0.6475840336134454 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6475840336134454.


[I 2025-12-04 14:42:01,103] A new study created in memory with name: no-name-8c54229e-0924-400b-8ada-640233d6265c


[I 2025-12-04 14:42:01,621] Trial 0 finished with value: 0.791578947368421 and parameters: {'n_hidden': 1, 'scale1': 0.5, 'scale2': 0.5, 'dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.791578947368421.


'./../output/mtuberculosis/models/CHEMBL4649948_IC50_umol.L-1_perc_1.zip'