In [37]:
from sklearn.model_selection import StratifiedKFold
from lazyqsar.qsar import LazyBinaryQSAR
from lazyqsar.agnostic import LazyBinaryClassifier
import pandas as pd
import numpy as np
import lazyqsar
import h5py
import tqdm
import os

# Define root directory.
# NOTE: every path below is resolved relative to the current working directory.
# When this is run as a standalone script, the file's own directory can be used instead:
# root = os.path.dirname(os.path.abspath(__file__))
root = "."

# Load the dataset and extract the SMILES strings ('canonical_smiles' column)
# and the labels ('bin' column) as NumPy arrays so they can be fancy-indexed
# with the fold indices later on.
data = pd.read_csv(os.path.join(root, "..", "output", "mtuberculosis", "datasets", "CHEMBL4649948_IC50_umol.L-1_perc_1.csv.gz"))
X = np.array(data['canonical_smiles'].tolist(), dtype='str')
Y = np.array(data['bin'].tolist(), dtype=np.int8)

# Stratified 5-fold cross-validation splitter (shuffled, fixed seed for reproducibility)
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Read the stored SMILES entries and the Morgan fingerprint matrix from the HDF5 file
with h5py.File(os.path.join(root, "..", "output", "mtuberculosis", "descriptors.h5"), "r") as f:
    SMILES = f['SMILES'][:]
    X_Morgan = f['X_Morgan'][:]

# Lookup table: decoded SMILES string (element 1 of each SMILES entry) -> fingerprint row.
# Entries are paired positionally with the rows of X_Morgan.
SMILES_TO_MORGAN = {
    entry[1].decode('utf-8'): fingerprint
    for entry, fingerprint in zip(SMILES, X_Morgan)
}

# Iterate over the stratified folds (only the first fold is processed; see `break`)
for train, test in skf.split(X, Y):
    # Report fold sizes
    print(len(train), len(test))

    # Labels for this fold
    Y_train = Y[train]
    Y_test = Y[test]

    # Featurize: look up the Morgan fingerprint of every SMILES in the fold
    X_train = np.array([SMILES_TO_MORGAN[smiles] for smiles in X[train]])
    X_test = np.array([SMILES_TO_MORGAN[smiles] for smiles in X[test]])

    # Fit the classifier on the training part of the fold
    model = LazyBinaryClassifier(mode="fast")
    model.fit(X=X_train, y=Y_train)

    # Stop after the first fold
    break

1972 494


[I 2025-12-03 14:55:42,747] A new study created in memory with name: no-name-28e9a5db-d88b-418f-8ae8-9b0d064520d3
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.748227: 100%|██████████| 1/1 [00:00<00:00, 39.74it/s]

[I 2025-12-03 14:55:42,770] Trial 0 finished with value: 0.74822695035461 and parameters: {'k_features': 116, 'alpha': 0.0001}. Best is trial 0 with value: 0.74822695035461.





[I 2025-12-03 14:55:42,834] A new study created in memory with name: no-name-255c0da1-8f74-4c7d-b960-86de186532db
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.755499: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]

[I 2025-12-03 14:55:44,901] Trial 0 finished with value: 0.7554991581541471 and parameters: {'threshold': '1.25*mean'}. Best is trial 0 with value: 0.7554991581541471.





[I 2025-12-03 14:55:54,591] A new study created in memory with name: no-name-f01017f0-0794-4e29-a067-067fc2d0c8ef
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.756206: 100%|██████████| 1/1 [00:00<00:00,  7.67it/s]

[I 2025-12-03 14:55:54,720] Trial 0 finished with value: 0.7562056737588652 and parameters: {'n_components': 513, 'alpha': 0.0001}. Best is trial 0 with value: 0.7562056737588652.





[I 2025-12-03 14:55:54,854] A new study created in memory with name: no-name-0eeb4fe5-b5fa-4964-a70f-1a17ec4fc5e4


[I 2025-12-03 14:57:34,966] Trial 0 finished with value: 0.6439743697361426 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6439743697361426.


[I 2025-12-03 14:57:34,983] A new study created in memory with name: no-name-81e3fcdf-87c0-4846-83ec-38c1362c5aef


[I 2025-12-03 14:57:35,098] Trial 0 finished with value: 0.7813033380900417 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7813033380900417.


[I 2025-12-03 14:57:35,108] A new study created in memory with name: no-name-61934cf9-05fb-419f-99b1-7776df34c1de
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.839212: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]

[I 2025-12-03 14:57:36,216] Trial 0 finished with value: 0.8392116748499727 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8392116748499727.





[I 2025-12-03 14:57:36,251] A new study created in memory with name: no-name-79865659-acc9-408f-b7df-ef8af0bffb18


[I 2025-12-03 14:57:42,418] Trial 0 finished with value: 0.7770782618428048 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7770782618428048.


[I 2025-12-03 14:57:42,426] A new study created in memory with name: no-name-a275f683-fb84-4ba4-bc68-a8202451d802


[I 2025-12-03 14:57:42,448] Trial 0 finished with value: 0.9201712415008814 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.9201712415008814.


[I 2025-12-03 14:57:42,456] A new study created in memory with name: no-name-763bb7b2-e561-42fa-a39f-32fc501248be


[I 2025-12-03 14:57:59,997] Trial 0 finished with value: 0.6452894596938918 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6452894596938918.


[I 2025-12-03 14:58:00,006] A new study created in memory with name: no-name-9643b407-05ad-41db-9aec-3c316981ffed


[I 2025-12-03 14:58:00,035] Trial 0 finished with value: 0.7647388007498811 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7647388007498811.


[I 2025-12-03 14:58:00,044] A new study created in memory with name: no-name-fa40ea53-790a-489d-9788-8ed39c635809


[I 2025-12-03 14:58:25,411] Trial 0 finished with value: 0.6163574806234086 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6163574806234086.


[I 2025-12-03 14:58:25,417] A new study created in memory with name: no-name-11b98a22-73c4-42fb-98a2-328e4f690bcf


[I 2025-12-03 14:58:25,506] Trial 0 finished with value: 0.6828394750832424 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6828394750832424.


[I 2025-12-03 14:58:25,510] A new study created in memory with name: no-name-a214b91e-b5a8-4fca-967b-81121253a89e


[I 2025-12-03 14:58:28,595] Trial 0 finished with value: 0.6090425531914894 and parameters: {'n_hidden': 1, 'scale1': 0.5, 'scale2': 0.5, 'dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.6090425531914894.


[I 2025-12-03 15:01:14,914] A new study created in memory with name: no-name-494b1c2e-74a2-4d90-ae8c-f9f8c01674a3
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.766179: 100%|██████████| 1/1 [00:00<00:00, 41.08it/s]

[I 2025-12-03 15:01:14,937] Trial 0 finished with value: 0.7661790780141845 and parameters: {'k_features': 115, 'alpha': 0.0001}. Best is trial 0 with value: 0.7661790780141845.





[I 2025-12-03 15:01:14,980] A new study created in memory with name: no-name-ae2104ef-3107-48ed-879b-1d85d3cbcc6d
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.788341: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]

[I 2025-12-03 15:01:16,088] Trial 0 finished with value: 0.7883409052429594 and parameters: {'threshold': 'mean'}. Best is trial 0 with value: 0.7883409052429594.





[I 2025-12-03 15:01:26,823] A new study created in memory with name: no-name-432a3202-be5f-4647-ad94-714024812d68
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.777039: 100%|██████████| 1/1 [00:00<00:00, 12.52it/s]

[I 2025-12-03 15:01:26,899] Trial 0 finished with value: 0.7770390070921986 and parameters: {'n_components': 513, 'alpha': 0.0001}. Best is trial 0 with value: 0.7770390070921986.





[I 2025-12-03 15:01:27,043] A new study created in memory with name: no-name-bbb2e45e-0538-4142-99a1-f43df3ca7a35


[I 2025-12-03 15:03:09,119] Trial 0 finished with value: 0.6061025770167044 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6061025770167044.


[I 2025-12-03 15:03:09,130] A new study created in memory with name: no-name-ffacc2ab-9623-4c55-9c32-845607009d7c


[I 2025-12-03 15:03:09,197] Trial 0 finished with value: 0.7841573631047315 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7841573631047315.


[I 2025-12-03 15:03:09,207] A new study created in memory with name: no-name-c4b8894a-0532-4664-9a81-2d40db78f726
  0%|          | 0/1 [00:00<?, ?it/s]

Best trial: 0. Best value: 0.818016: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]

[I 2025-12-03 15:03:10,316] Trial 0 finished with value: 0.8180156188272477 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8180156188272477.





[I 2025-12-03 15:03:10,356] A new study created in memory with name: no-name-32a6ef8d-79f8-4993-b0a4-59eb1887062e


[I 2025-12-03 15:03:16,577] Trial 0 finished with value: 0.7142337502448305 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7142337502448305.


[I 2025-12-03 15:03:16,587] A new study created in memory with name: no-name-38ddeb82-4b0d-4ab0-91ff-e56d525f42ff


[I 2025-12-03 15:03:16,611] Trial 0 finished with value: 0.9275861104115951 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.9275861104115951.


[I 2025-12-03 15:03:16,620] A new study created in memory with name: no-name-c740fa27-41d8-4162-b70a-5dcc2b987cdc


[I 2025-12-03 15:03:35,349] Trial 0 finished with value: 0.7165701334676403 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7165701334676403.


[I 2025-12-03 15:03:35,356] A new study created in memory with name: no-name-bd125942-26aa-4450-b069-06233fec8910


[I 2025-12-03 15:03:35,394] Trial 0 finished with value: 0.7676208064019698 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7676208064019698.


[I 2025-12-03 15:03:35,402] A new study created in memory with name: no-name-e8df6c6a-7ed1-45ec-b276-e7967e4283fb


[I 2025-12-03 15:04:00,586] Trial 0 finished with value: 0.6514032289655558 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.6514032289655558.


[I 2025-12-03 15:04:00,594] A new study created in memory with name: no-name-d0a8b3ae-b5b6-4c6e-872e-da98399b3a20


[I 2025-12-03 15:04:00,708] Trial 0 finished with value: 0.7131984666610705 and parameters: {'C': 1.0}. Best is trial 0 with value: 0.7131984666610705.


[I 2025-12-03 15:04:00,713] A new study created in memory with name: no-name-15877b74-79b6-47ce-916c-aa90c5538748


[I 2025-12-03 15:04:03,856] Trial 0 finished with value: 0.6502659574468085 and parameters: {'n_hidden': 1, 'scale1': 0.5, 'scale2': 0.5, 'dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.6502659574468085.
