In [1]:
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from collections import Counter
import sys
import os
import time
# sys.path.insert(0, os.path.abspath("../../lazy-qsar/lazyqsar/models"))
# from xgboost_binary_classifier import LazyXGBoostBinaryClassifier as LQ_XGB
# from zsRF_binary_classifier import LazyZSRFBinaryClassifier as LQ_zsRF
import pandas as pd
import numpy as np
import collections
import joblib
from tqdm import tqdm

from flaml.default import RandomForestClassifier as ZeroShotRandomForestClassifier
from flaml.default import XGBClassifier as ZeroShotXGBoostClassifier
from flaml.default import ExtraTreesClassifier as ZeroShotExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Get all pathogens i.e. {pathogen}_{target}
PATHOGENS = sorted(os.listdir(os.path.join("..", "data")))[:1]

# Define some paths
PATH_TO_FEATURES = os.path.join("..", "output", "02_features")
PATH_TO_OUTPUT = os.path.join("..", "output", "03_baseline_models")

# Tunable settings, gathered in one place instead of being scattered as magic numbers
SEED = 42            # seed for NumPy's global RNG and for the CV splitter
MAX_SAMPLES = 10000  # tasks with more rows than this are randomly subsampled
N_SPLITS = 3         # number of stratified CV folds
N_JOBS = 12          # parallel workers handed to every estimator

# Setting a seed
np.random.seed(SEED)


def evaluate_zero_shot(zero_shot_cls, estimator_cls, X_train, y_train, X_test, y_test, n_jobs=N_JOBS):
    """Fit one estimator with FLAML-suggested hyperparameters and score it on a held-out fold.

    Parameters
    ----------
    zero_shot_cls : class
        A ``flaml.default`` estimator class; only its ``suggest_hyperparams`` method is used.
    estimator_cls : class
        The estimator class that is instantiated with the suggested hyperparameters.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.
    n_jobs : int
        Value written into the ``n_jobs`` hyperparameter before the estimator is built.

    Returns
    -------
    tuple
        ``(auroc, elapsed_seconds, fitted_model)``. ``auroc`` is rounded to 3 decimals and
        ``elapsed_seconds`` covers hyperparameter suggestion, fitting and scoring.
    """
    start = time.time()
    # The first element of the returned tuple is the hyperparameter dict
    hyperparams = zero_shot_cls().suggest_hyperparams(X_train, y_train)[0]
    hyperparams['n_jobs'] = n_jobs
    model = estimator_cls(**hyperparams)
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the score of the positive class
    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    auroc = round(auc(fpr, tpr), 3)
    return auroc, time.time() - start, model


for pathogen in PATHOGENS:

    print(f"----------------------- PATHOGEN: {pathogen} ---------------------------")

    # Get list of tasks
    tasks = sorted(os.listdir(os.path.join("..", "data", pathogen)))

    # Get IK to MFP. The features depend only on the pathogen, so they are loaded once
    # here rather than once per task; `with` makes sure both file handles get closed.
    with open(os.path.join(PATH_TO_FEATURES, pathogen, 'IKS.txt')) as fh:
        IKs = fh.read().splitlines()
    with np.load(os.path.join(PATH_TO_FEATURES, pathogen, "X.npz")) as npz:
        MFPs = npz['X']
    IK_TO_MFP = dict(zip(IKs, MFPs))

    # For each task
    for task in tasks:

        # # if task is not done yet
        # if os.path.exists(os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""), "LQ_optuna_CV.csv")) == True:
        #     continue

        print(f"TASK: {task}")

        # Create output_dir
        output_dir = os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""))
        os.makedirs(output_dir, exist_ok=True)

        # Load data; the label is read from the third column of the task file
        df = pd.read_csv(os.path.join("..", "data", pathogen, task))
        cols = df.columns.tolist()
        X, Y = [], []
        for ik, act in zip(df['inchikey'], df[cols[2]]):
            # Rows whose InChIKey has no precomputed feature vector are dropped
            if ik in IK_TO_MFP:
                X.append(IK_TO_MFP[ik])
                Y.append(act)

        # To np.array
        X = np.array(X)
        Y = np.array(Y)

        # Subsample
        if len(X) > MAX_SAMPLES:
            indices = np.random.choice(len(X), size=MAX_SAMPLES, replace=False)
            X = X[indices]
            Y = Y[indices]

        # Stratified CV and AUROC need both classes, with enough members to populate every fold
        class_counts = Counter(Y)
        if len(class_counts) < 2 or min(class_counts.values()) < N_SPLITS:
            print(f"Skipping {task}: not enough samples per class for {N_SPLITS}-fold CV ({dict(class_counts)})")
            continue

        # Cross-validations
        skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
        aurocs_zsRF, aurocs_zsXGB, aurocs_zsET = [], [], []
        time_RF, time_XGB, time_ET = 0, 0, 0
        # `total` gives the progress bar a known length (split() is a generator)
        for train_index, test_index in tqdm(skf.split(X, Y), total=N_SPLITS):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]

            # zsRF
            auroc, elapsed, model_zsRF = evaluate_zero_shot(
                ZeroShotRandomForestClassifier, RandomForestClassifier,
                X_train, y_train, X_test, y_test)
            aurocs_zsRF.append(auroc)
            time_RF += elapsed

            # zsXGB
            auroc, elapsed, model_zsXGB = evaluate_zero_shot(
                ZeroShotXGBoostClassifier, XGBClassifier,
                X_train, y_train, X_test, y_test)
            aurocs_zsXGB.append(auroc)
            time_XGB += elapsed

            # zsET
            auroc, elapsed, model_zsET = evaluate_zero_shot(
                ZeroShotExtraTreesClassifier, ExtraTreesClassifier,
                X_train, y_train, X_test, y_test)
            aurocs_zsET.append(auroc)
            time_ET += elapsed

        print(f"{len(X)} samples; {Counter(Y)[1]} positives; {len(aurocs_zsRF)}-fold CV")
        print(f"AUROCs zsRF: {aurocs_zsRF} -- average: {round(np.mean(aurocs_zsRF), 3)} -- time: {round(time_RF, 2)}s")
        print(f"AUROCs zsXGB: {aurocs_zsXGB} -- average: {round(np.mean(aurocs_zsXGB), 3)} -- time: {round(time_XGB, 2)}s")
        print(f"AUROCs zsET: {aurocs_zsET} -- average: {round(np.mean(aurocs_zsET), 3)} -- time: {round(time_ET, 2)}s")

----------------------- PATHOGEN: abaumannii_organism ---------------------------
TASK: 1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_10_ORGANISM_1.csv


3it [45:32, 910.85s/it]


10000 samples; 967 positives; 3-fold CV
AUROCs zsRF: [0.692, 0.71, 0.703] -- average: 0.702 -- time: 184.8s
AUROCs zsXGB: [0.692, 0.707, 0.686] -- average: 0.695 -- time: 1269.16s
AUROCs zsET: [0.684, 0.695, 0.683] -- average: 0.687 -- time: 1278.56s
TASK: 1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_1_ORGANISM_1.csv


1it [10:05, 605.31s/it]