In [31]:
from flaml.default import RandomForestClassifier as ZeroShotRandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from flaml.automl import AutoML
from collections import Counter
import lazyqsar as lq
import pandas as pd
import numpy as np
import collections
import joblib
import os

# Each folder under ../data is named {pathogen}_{target}.
# Only the first folder (in sorted order) is kept.
PATHOGENS = sorted(os.listdir(os.path.join("..", "data")))[:1]

# Locations of the precomputed features and of the baseline-model outputs
PATH_TO_FEATURES, PATH_TO_OUTPUT = (
    os.path.join("..", "output", stage)
    for stage in ("02_features", "03_baseline_models")
)

In [37]:
# Upper bound on the number of molecules kept per task
MAX_SAMPLES = 1000
# Seeded generator so the random subsample is identical on every re-run
sampling_rng = np.random.default_rng(42)

for pathogen in PATHOGENS:

    print(f"----------------------- PATHOGEN: {pathogen} ---------------------------")

    # Every entry in the pathogen folder is treated as one task file
    tasks = sorted(os.listdir(os.path.join("..", "data", pathogen)))

    # # Get IK to MFP
    # IKs = open(os.path.join(PATH_TO_FEATURES, pathogen, 'IKS.txt')).read().splitlines()
    # MFPs = np.load(os.path.join(PATH_TO_FEATURES, pathogen, "X.npz"))['X']
    # IK_TO_MFP = {i: j for i, j in zip(IKs, MFPs)}

    # For each task
    for task in tasks:

        print(f"TASK: {task}")

        # Create output_dir (named after the task file without its .csv suffix)
        output_dir = os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""))
        os.makedirs(output_dir, exist_ok=True)

        # Load data
        df = pd.read_csv(os.path.join("..", "data", pathogen, task))
        cols = df.columns.tolist()

        # SMILES come from the 'smiles' column; the label is read positionally
        # from the third column -- TODO confirm this holds for every task file.
        X_smiles = np.array(df['smiles'].tolist())
        Y = np.array(df[cols[2]].tolist())

        # Random sample without replacement. The size is capped at the number
        # of available rows: asking for more rows than exist with
        # replace=False raises ValueError.
        n_keep = min(MAX_SAMPLES, len(X_smiles))
        indices = sampling_rng.choice(len(X_smiles), size=n_keep, replace=False)
        X_smiles = X_smiles[indices]
        Y = Y[indices]

        # print("Training NB...")

        # # Naive Bayes
        # NB, results_NB = NaiveBayesClassificationModel(X, Y)
        # joblib.dump(NB, os.path.join(output_dir, "NB.joblib"))
        # with open(os.path.join(output_dir, "NB_CV.csv"), "w") as f:
        #     f.write(",".join(results_NB))

        # Stop after the first task; X_smiles and Y stay in the notebook
        # namespace for the cells below.
        break


----------------------- PATHOGEN: abaumannii_organism ---------------------------
TASK: 1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_10_ORGANISM_1.csv


In [None]:
def LazyQSAR(X_smiles, Y, n_folds=5, descriptor_type='morgan', model_type="xgboost"):
    """Cross-validate a lazyqsar binary classifier and report per-fold AUROC.

    Parameters
    ----------
    X_smiles : array-like of str
        SMILES strings, one per molecule.
    Y : array-like
        Binary labels aligned with ``X_smiles``.
    n_folds : int, default 5
        Number of stratified, shuffled folds (fixed ``random_state=42``).
    descriptor_type : str, default 'morgan'
        Forwarded to ``lq.LazyBinaryQSAR``. Available descriptors: morgan,
        mordred, rdkit, classic, maccs.
    model_type : str, default 'xgboost'
        Forwarded to ``lq.LazyBinaryQSAR``. Available models: xgboost.

    Returns
    -------
    tuple
        ``(model_all, aurocs)``. ``model_all`` is currently the placeholder
        ``-1`` (no model is fitted on the full data yet). ``aurocs`` is a list
        with one AUROC per fold, rounded to 4 decimals and formatted as
        strings.
    """
    # Accept plain lists as well: the folds below use integer-array indexing
    X_smiles = np.asarray(X_smiles)
    Y = np.asarray(Y)

    # # Fit model with all data
    # model_all = MultinomialNB()
    # model_all.fit(X, Y)
    model_all = -1

    # Cross-validations
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    aurocs = []
    for train_index, test_index in skf.split(X_smiles, Y):
        X_train, X_test = X_smiles[train_index], X_smiles[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        model_cv = lq.LazyBinaryQSAR(descriptor_type=descriptor_type, model_type=model_type)
        model_cv.fit(X=X_train, y=y_train)
        # model.save_model(model_dir="my_model")

        # LazyBinaryQSAR may not expose predict_proba (calling it raised
        # AttributeError in this notebook); its predict() returns one
        # continuous score per molecule, which roc_curve can rank directly.
        if hasattr(model_cv, "predict_proba"):
            scores = model_cv.predict_proba(X_test)[:, 1]
        else:
            scores = np.asarray(model_cv.predict(X_test)).ravel()

        fpr, tpr, _ = roc_curve(y_test, scores)
        auroc = auc(fpr, tpr)
        aurocs.append(auroc)

    return model_all, [str(round(i, 4)) for i in aurocs]

In [42]:
# LazyQSAR(X_smiles, Y)

In [44]:
# Number of molecules currently held in X_smiles
len(X_smiles)

1000

In [None]:
# Cross-validations
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
aurocs = []
for train_index, test_index in skf.split(X_smiles, Y):
    X_train, X_test = X_smiles[train_index], X_smiles[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    # Available descriptors: morgan, mordred, rdkit, classic, maccs
    # Available models: xgboost
    model_cv = lq.LazyBinaryQSAR(descriptor_type='morgan', model_type="xgboost")
    model_cv.fit(X=X_train, y=y_train)
    # model.save_model(model_dir="my_model")
    fpr, tpr, _ = roc_curve(y_test, model_cv.predict_proba(X_test)[:, 1])
    auroc = auc(fpr, tpr)
    aurocs.append(auroc)

[I 2025-05-30 12:40:25,114] A new study created in memory with name: no-name-331318eb-2dee-4015-b314-cb88923e0c2d


Total samples: 800, positive samples: 87, negative samples: 713
Positive proportion: 0.11
Sampling rounds: 1, positive samples per round: 87, negative samples per round: 713
Desired positive proportion: 0.10875 Actual positive proportion:  0.10875
Fitting model on 800 samples, positive samples: 87, negative samples: 713, number of features 500


[I 2025-05-30 12:40:28,380] Trial 0 finished with value: 0.8725740551583248 and parameters: {'booster': 'gblinear', 'lambda': 0.0006155564318973012, 'alpha': 1.77071686435378e-07, 'subsample': 0.32479561626896214, 'colsample_bytree': 0.24646688973455957}. Best is trial 0 with value: 0.8725740551583248.
[I 2025-05-30 12:40:35,547] Trial 1 finished with value: 0.7448927477017363 and parameters: {'booster': 'gbtree', 'lambda': 1.4610865886287176e-08, 'alpha': 0.574485163632042, 'subsample': 0.8659541126403374, 'colsample_bytree': 0.36987128854262097, 'max_depth': 3, 'min_child_weight': 3, 'eta': 2.716051144654844e-06, 'gamma': 0.00015777981883364995, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.8725740551583248.
[I 2025-05-30 12:40:43,826] Trial 2 finished with value: 0.5 and parameters: {'booster': 'gbtree', 'lambda': 8.528933855762793e-06, 'alpha': 4.452048365748842e-05, 'subsample': 0.8281407691144109, 'colsample_bytree': 0.3597390257266878, 'max_depth': 7, 'min_child_wei

Model fitted.


AttributeError: 'LazyBinaryQSAR' object has no attribute 'predict_proba'

In [55]:
# Score the held-out molecules with the most recently fitted fold model
a = model_cv.predict(X_test)

Predicting chunk of size: 200


In [59]:
# Shape of the held-out SMILES array from the most recent fold
X_test.shape

(200,)

In [60]:
# Show the raw predict() output stored in `a`
print(a)

[ 0.2314853  -0.0862231  -0.4197334  -0.3009096  -0.10511956 -0.00630671
 -0.16804348  0.5398228   0.22565044  0.11570578  0.06156326 -0.12521943
 -0.06170792  0.6926501   0.23784302  0.4113321  -0.02983759  0.1632947
 -0.11111438  0.12008161  0.39614773 -0.3524479   0.00710342  0.11333106
 -0.0750165   0.1475558   0.6779315   0.24491045 -0.00718679  0.22270526
  0.23729686  0.10788529  0.18090928 -0.22303812  0.13810569  0.2442018
  0.25730193  0.07344029  0.10805137 -0.19030558  0.4024185  -0.08053929
 -0.090104    0.28182003  0.49532464 -0.03928961  0.5734352   0.2106027
  0.03829022  2.9436882   0.27894974  0.58591497  0.81749815 -0.18575706
  0.00336445  0.18219715  0.13996355  0.26889938  0.5150548  -0.07058168
  0.14287047  0.02768525  0.3822545   0.12296981 -0.10659203 -0.10124943
  0.33155262  0.6759873  -0.2637986   0.5330967   0.10283295  0.01646838
  0.3252213   0.1260298   0.19794358  0.15615746  0.05478663  0.02477241
  0.593806   -0.09436278  0.17958385  0.2061103   0.37