In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import lazyqsar
import pickle
import h5py
import tqdm
import sys
import os

# --- Run configuration -------------------------------------------------------
# Leftover from command-line usage (job index taken from sys.argv); kept for reference.
# alpha = int(sys.argv[1])

# Define root directory (data paths in later cells are built relative to it)
root = "."

# Load pickle
# Disabled: (pathogen_code, file, column) used to be read from a pickle indexed
# by `alpha`. The values are set by hand below instead.
# pathogen_code, file, column = pickle.load(open(os.path.join(root, "..", "..", "tmp", "models_to_train.pkl"), "rb"))[alpha]
pathogen_code = 'mtuberculosis'

# Dataset name, given WITHOUT the ".csv.gz" extension.
# Exactly one assignment is live. Previously all four ran back to back and only
# the last one took effect; the others are kept as commented candidates.
# file = "CHEMBL4649971_PERCENTEFFECT_%"
# file = "CHEMBL4649971_IC50_umol.L-1"
# file = "CHEMBL1794426_EC50_umol.L-1"
file = "CHEMBL4649947_PERCENTEFFECT_%"

In [2]:
# Load data
# Echo the run configuration to stderr.
sys.stderr.write(f"{pathogen_code}\n{file}\n\n")
sys.stderr.flush()

# Both the dataset and the descriptors are read from <root>/../output/<pathogen_code>/
pathogen_dir = os.path.join(root, "..", "output", pathogen_code)

data = pd.read_csv(os.path.join(pathogen_dir, "datasets", f"{file}.csv.gz"))
X = data['canonical_smiles'].astype(str).tolist()  # list of SMILES strings
Y = data['bin'].astype(np.int8).to_numpy()         # labels from the 'bin' column, cast to int8

# Define stratified 5 fold CV (shuffled, fixed seed so the splits are reproducible)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Load descriptors
sys.stderr.write("Loading descriptors...\n\n")
sys.stderr.flush()
with h5py.File(os.path.join(pathogen_dir, "descriptors.h5"), "r") as f:
    SMILES = f['SMILES'][:]
    X_morgan = f['X_morgan'][:]
    ### add here rdkit descriptors

# zip() below would silently drop trailing rows if the two arrays were misaligned.
if len(SMILES) != len(X_morgan):
    raise ValueError(
        f"descriptors.h5 is inconsistent: {len(SMILES)} SMILES rows "
        f"vs {len(X_morgan)} Morgan fingerprint rows"
    )

# Define dict mapping smiles to morgan fingerprints.
# Each row of SMILES unpacks into (chembl_id, smiles); smiles is stored as bytes.
SMILES_TO_MORGAN = {
    smiles.decode("utf-8"): fp
    for (_chembl_id, smiles), fp in zip(SMILES, X_morgan)}

# Fail fast with a readable message instead of a bare KeyError during training.
missing_smiles = sorted(set(X).difference(SMILES_TO_MORGAN))
if missing_smiles:
    raise KeyError(
        f"{len(missing_smiles)} SMILES from {file}.csv.gz have no Morgan fingerprint "
        f"in descriptors.h5 (first missing: {missing_smiles[0]!r})"
    )

mtuberculosis
CHEMBL4649947_PERCENTEFFECT_%

Loading descriptors...



In [3]:
from lazyqsar.agnostic import LazyBinaryClassifier


def _morgan_matrix(smiles_iter):
    """Look up the precomputed Morgan fingerprint of each SMILES and pack them into one array."""
    return np.array([SMILES_TO_MORGAN[smi] for smi in smiles_iter])


# Container for per-split AUROC values (nothing is appended to it in this cell).
AUROCS = []

# For each split
for c, (train, test) in enumerate(skf.split(X, Y)):

    sys.stderr.write(f"Training split {c}...\n")
    sys.stderr.flush()

    # Labels of the train/test partitions
    Y_train = Y[train]
    Y_test = Y[test]

    # Morgan fingerprint matrices of the train/test partitions
    X_train = _morgan_matrix(X[i] for i in train)
    X_test = _morgan_matrix(X[i] for i in test)

    # Train model
    model = LazyBinaryClassifier(mode="fast")
    model.fit(X=X_train, y=Y_train)

    # Only the first split is trained: the loop exits here unconditionally.
    break



  from .autonotebook import tqdm as notebook_tqdm
Training split 0...


  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.936306: 100%|██████████| 1/1 [00:00<00:00, 15.17it/s]

[I 2025-12-12 12:21:57,859] Trial 0 finished with value: 0.9363057324840764 and parameters: {'k_features': 139, 'alpha': 0.0001}. Best is trial 0 with value: 0.9363057324840764.





[I 2025-12-12 12:21:57,998] A new study created in memory with name: no-name-aee1281d-c4d1-4697-afaa-a0e09861ac0f
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.737434: 100%|██████████| 1/1 [00:02<00:00,  2.48s/it]

[I 2025-12-12 12:22:00,474] Trial 0 finished with value: 0.7374337896385045 and parameters: {'threshold': '1.5*mean'}. Best is trial 0 with value: 0.7374337896385045.





[I 2025-12-12 12:22:14,546] A new study created in memory with name: no-name-86656501-12f0-455d-899a-8ad5443f0446
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.9021: 100%|██████████| 1/1 [00:00<00:00,  7.38it/s]

[I 2025-12-12 12:22:14,680] Trial 0 finished with value: 0.9020995517810805 and parameters: {'n_components': 513, 'alpha': 0.0001}. Best is trial 0 with value: 0.9020995517810805.





[I 2025-12-12 12:22:15,093] A new study created in memory with name: no-name-14fba4c1-d500-47d2-822d-33d7c6a66dd5


[I 2025-12-12 12:28:37,953] Trial 0 finished with value: 0.6979250239904353 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6979250239904353.


[I 2025-12-12 12:28:37,990] A new study created in memory with name: no-name-c36d9134-3dd1-45da-b444-9d48aebe76d1


[I 2025-12-12 12:28:38,287] Trial 0 finished with value: 0.941746503689021 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.941746503689021.


[I 2025-12-12 12:28:38,296] A new study created in memory with name: no-name-9996ec40-b333-42be-8af1-c272e3efde74
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.812257: 100%|██████████| 1/1 [00:01<00:00,  1.74s/it]

[I 2025-12-12 12:28:40,032] Trial 0 finished with value: 0.812257237028082 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.812257237028082.





[I 2025-12-12 12:28:40,075] A new study created in memory with name: no-name-5efce350-73ca-4363-9070-6b59db56794d


[I 2025-12-12 12:29:09,177] Trial 0 finished with value: 0.6832790598895653 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6832790598895653.


[I 2025-12-12 12:29:09,187] A new study created in memory with name: no-name-95ca6d37-e7dc-44f1-899d-54aeb52375b1


[I 2025-12-12 12:29:09,250] Trial 0 finished with value: 0.9548193244922679 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.9548193244922679.


[I 2025-12-12 12:29:09,256] A new study created in memory with name: no-name-9133aa57-f486-4ee5-acfd-98e6d12a7f91


[I 2025-12-12 12:29:51,131] Trial 0 finished with value: 0.5486022621800619 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.5486022621800619.


[I 2025-12-12 12:29:51,136] A new study created in memory with name: no-name-a296d1f2-9272-4fc1-ad35-4a00b9f6bb61


[I 2025-12-12 12:29:51,213] Trial 0 finished with value: 0.9027325499079711 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.9027325499079711.


[I 2025-12-12 12:29:51,224] A new study created in memory with name: no-name-15f5fe23-96b7-47de-9a39-3befcced2cd9


[I 2025-12-12 12:31:27,632] Trial 0 finished with value: 0.5784054619535294 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.5784054619535294.


[I 2025-12-12 12:31:27,641] A new study created in memory with name: no-name-8763baf6-e437-4f13-865a-62d311fee197


[I 2025-12-12 12:31:28,172] Trial 0 finished with value: 0.930514260543993 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.930514260543993.


[I 2025-12-12 12:31:28,182] A new study created in memory with name: no-name-aede7da2-bfe8-4206-8611-27a76878c67b


[I 2025-12-12 12:31:38,331] Trial 0 finished with value: 0.7643312101910827 and parameters: {'n_hidden': 1, 'scale1': 0.5, 'scale2': 0.5, 'dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.7643312101910827.


In [4]:
# NOTE: everything in this cell is commented out and does not run.
# The indented "Store results" part appears to be the tail of the per-split
# training loop (it uses model, X_test and Y_test and appends to AUROCS).
# The rest trains a model on the full dataset, saves it, and writes the
# per-split AUROCs as one comma-separated line.
#
# NOTE(review): check the following before re-enabling.
#   - `file` is set without a ".csv.gz" suffix in the first cell, so both
#     file.replace(...) calls below return `file` unchanged. The model and the
#     CV results would then resolve to the same path.
#   - Paths here go up two directory levels ("..", ".."), whereas the
#     data-loading cell reads from one level up ("..") -- confirm which is intended.
#   - X is rebound from a list of SMILES to a fingerprint array, so re-running
#     the cell would fail on the SMILES_TO_MORGAN lookup.

#     # Store results
#     auroc = round(roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1]), 3)
#     AUROCS.append(auroc)
#     sys.stderr.write(f"AUROC: {auroc}")
#     sys.stderr.write("\n\n")
#     sys.stderr.flush()

# # Train full model
# X = np.array([SMILES_TO_MORGAN[smi] for smi in X])
# model = LazyBinaryClassifier(mode="default")
# model.fit(X=X, y=Y)

# # Save model
# model.save(os.path.join(root, "..", "..", "output", pathogen_code, "models", file.replace(".csv.gz", ".zip")))

# # Save CV results
# with open(os.path.join(root, "..", "..", "output", pathogen_code, "models", file.replace('.gz', '')), "w") as outfile:
#     outfile.write(",".join([str(i) for i in AUROCS]))