In [1]:
from sklearn.model_selection import StratifiedKFold
from lazyqsar.models import LazyRandomForestBinaryClassifier
from lazyqsar.models import LazyLogisticRegressionBinaryClassifier
import lazyqsar
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import collections
import joblib
import os

# Root folder holding one sub-directory per pathogen
PATH_TO_DATA = os.path.join("..", "data")

# Get all pathogens i.e. {pathogen}_{target}
# Only non-hidden directories are kept: stray files or tool folders in the data
# directory (e.g. .DS_Store, .ipynb_checkpoints) are not pathogens, and listing
# their "tasks" later on would fail.
PATHOGENS = sorted(
    entry
    for entry in os.listdir(PATH_TO_DATA)
    if not entry.startswith(".") and os.path.isdir(os.path.join(PATH_TO_DATA, entry))
)

# Define some paths
PATH_TO_FEATURES = os.path.join("..", "output", "02_features")
PATH_TO_OUTPUT = os.path.join("..", "output", "03_baseline_models")

In [2]:
for pathogen in PATHOGENS:

    print(f"----------------------- PATHOGEN: {pathogen} ---------------------------")

    # Get list of tasks (the entries found inside the pathogen's data folder)
    tasks = sorted(os.listdir(os.path.join("..", "data", pathogen)))

    # Get IK to CheMeleon features
    # Both files are read through context managers so their handles are closed
    # as soon as the content is in memory.
    with open(os.path.join(PATH_TO_FEATURES, pathogen, 'IKs_CheMeleon.txt')) as iks_file:
        IKs = iks_file.read().splitlines()
    with np.load(os.path.join(PATH_TO_FEATURES, pathogen, "X_CheMeleon.npz")) as npz_file:
        CheMeleons = npz_file['X']

    # Row i of the feature matrix is paired with line i of the identifier file.
    # NOTE(review): zip stops at the shorter input, so a length mismatch between
    # the two files would go unnoticed — confirm they are always written together.
    IK_TO_CHEMELEONS = dict(zip(IKs, CheMeleons))

    # Development shortcut: stop after the first pathogen, so the following
    # cells work with that pathogen's `tasks` and `IK_TO_CHEMELEONS` only.
    break


----------------------- PATHOGEN: abaumannii_organism ---------------------------


In [4]:
# Single task under study.
# The task name is fixed explicitly instead of being taken from a loop over
# `tasks` whose variable was immediately overwritten and which stopped after one
# pass; this way the cell also runs (or fails loudly) when `tasks` is empty,
# rather than silently leaving X and Y undefined.
task = "2_target_CHEMBL614425_mic_um_pchembl_value_5_ORGANISM_1.csv"

print(f"TASK: {task}")

# Create output_dir
output_dir = os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""))
os.makedirs(output_dir, exist_ok=True)

# Load data
df = pd.read_csv(os.path.join("..", "data", pathogen, task))
cols = df.columns.tolist()

# The activity values are read from the third column of the file.
# NOTE(review): positional access — confirm the column order is stable across tasks.
activity_col = cols[2]

# Keep only the compounds for which features are available; rows whose
# inchikey is not in IK_TO_CHEMELEONS are dropped without notice.
X, Y = [], []
for ik, act in zip(df['inchikey'], df[activity_col]):
    if ik in IK_TO_CHEMELEONS:
        X.append(IK_TO_CHEMELEONS[ik])
        Y.append(act)

# To np.array
X = np.array(X)
Y = np.array(Y)

print(X.shape)

TASK: 2_target_CHEMBL614425_mic_um_pchembl_value_5_ORGANISM_1.csv
(5924, 2048)


In [5]:
# Train/test split taken from a stratified, shuffled 5-fold partition
n_folds = 5
aurocs = []  # stays empty here: no per-fold score is computed in this cell
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Only the first fold produced by the splitter is used; the remaining folds are
# never generated. A full per-fold evaluation loop is not implemented here.
train_index, test_index = next(skf.split(X, Y))
X_train, y_train = X[train_index], Y[train_index]
X_test, y_test = X[test_index], Y[test_index]

In [9]:
# Build the lazyqsar random-forest binary classifier and train it on the
# training fold.
model = LazyRandomForestBinaryClassifier(
    pca=True,
    max_samples=100_000,
    num_trials=10,
)
# Kept as the last expression so the cell displays the value returned by fit
model.fit(X_train, y_train)

Total samples: 4739, positive samples: 1132, negative samples: 3607
Maximum samples per partition: 100000, minimum samples per partition: 30
Positive proportion: 0.24
Original positive samples: 1132, total samples: 4739
Maximum samples: 100000
Sampling 1132 positive and 3607 negative samples from 4739 total samples.


  1%|          | 10/1000 [00:00<00:12, 77.71it/s]


Unique sampled indices matrix shape: (1, 4739)


100%|██████████| 1/1 [00:05<00:00,  5.58s/it]


Indices matrix shape after redundancy removal: (1, 4739)
Original positive negative balance: positive 1132, negative 3607
Avg positive samples: 1132.0, avg negative samples: 3607.0


[I 2025-07-22 17:19:50,468] A new study created in memory with name: no-name-e2c07da5-d4de-4652-8190-a5f344c932d8


Fitting model on 4739 samples, positive samples: 1132, negative samples: 3607, number of features 1934
Fitting...


[I 2025-07-22 17:21:10,339] Trial 0 finished with value: 0.8894148995177132 and parameters: {'n_components': 0.99}. Best is trial 0 with value: 0.8894148995177132.


Working on the PCA


[I 2025-07-22 17:21:13,811] A new study created in memory with name: no-name-03fe057f-e482-4c5f-9219-82143508eb69


Suggested zero-shot hyperparameters: {'n_estimators': 501, 'max_features': 0.24484242524861066, 'criterion': 'entropy', 'max_leaf_nodes': 1156, 'random_state': 12032022, 'verbose': 0, 'class_weight': 'balanced_subsample'}
Fitting...


[I 2025-07-22 17:23:23,616] Trial 0 finished with value: 0.9012108937343789 and parameters: {'n_estimators': 501, 'max_features': 0.24484242524861066, 'max_leaf_nodes': 1156, 'criterion': 'entropy'}. Best is trial 0 with value: 0.9012108937343789.


Best hyperparameters: {'n_estimators': 501, 'max_features': 0.24484242524861066, 'max_leaf_nodes': 1156, 'criterion': 'entropy', 'n_jobs': 8, 'random_state': 42, 'class_weight': 'balanced_subsample'}, Inner hyperparameter AUROC: 0.9012108937343789
Internal AUROC CV-0: 0.8978829926429684
Internal AUROC CV-1: 0.9134197268731441
Internal AUROC CV-2: 0.8809105012026669
Logistic regression for calibration...
Logistic regression fit done.
Average AUROC: 0.8974044069062598
Model fitted.
Fitting completed in 416.58 seconds.


<lazyqsar.models.random_forest_binary_classifier.LazyRandomForestBinaryClassifier at 0x7cd2630d88d0>

In [10]:
# Predictions of the fitted model for the held-out fold.
# NOTE(review): these values are passed directly to roc_auc_score, which is meant
# to receive continuous scores — confirm that predict() returns probabilities
# rather than hard 0/1 labels.
preds = model.predict(X_test)

[VarianceThreshold(threshold=0)] BaseRandomForestBinaryClassifier(num_trials=10, pca=True, timeout=120)


Predicting chunks...: 2it [00:00,  6.76it/s]


In [11]:
# AUROC on the held-out fold: true labels (y_test) against the model output (preds)
roc_auc_score(y_test, preds)

0.9293527535914713

In [8]:
# NOTE(review): same expression as the previous cell, but this cell's execution
# count (8) is lower than those of the fit/predict/score cells (9-11), and the
# value displayed under it differs from theirs — it was produced by an earlier
# kernel state. Consider deleting this cell so the notebook reads top to bottom.
roc_auc_score(y_test, preds)

0.9366627065389004

In [None]:
# model = lazyqsar.LazyBinaryClassifier(model_type = "random_forest")
# model.fit(X, Y)
# preds = model.predict_proba(X)
# print(preds)