In [None]:
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from collections import Counter
import sys
import os
import time
# sys.path.insert(0, os.path.abspath("../../lazy-qsar/lazyqsar/models"))
# from xgboost_binary_classifier import LazyXGBoostBinaryClassifier as LQ_XGB
# from zsRF_binary_classifier import LazyZSRFBinaryClassifier as LQ_zsRF
import pandas as pd
import numpy as np
import collections
import joblib

from flaml.default import RandomForestClassifier as ZeroShotRandomForestClassifier
from flaml.default import XGBClassifier as ZeroShotXGBoostClassifier
from flaml.default import ExtraTreesClassifier as ZeroShotExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Baseline benchmark: for each pathogen/task, fit zero-shot (FLAML-suggested)
# RandomForest and XGBoost classifiers on the precomputed features and report
# the test AUROC and wall-clock time of each model.

# Get all pathogens i.e. {pathogen}_{target}
# NOTE: the [:1] slice restricts the run to the first pathogen only.
PATHOGENS = sorted(os.listdir(os.path.join("..", "data")))[:1]

# Define some paths
PATH_TO_FEATURES = os.path.join("..", "output", "02_features")
PATH_TO_OUTPUT = os.path.join("..", "output", "03_baseline_models")

# Maximum number of compounds kept per task when subsampling
MAX_SAMPLES = 1000

# Setting a seed
np.random.seed(42)

for pathogen in PATHOGENS:

    print(f"----------------------- PATHOGEN: {pathogen} ---------------------------")

    # Get list of tasks
    tasks = sorted(os.listdir(os.path.join("..", "data", pathogen)))

    # Get IK to MFP. The feature files depend only on the pathogen, so they are
    # loaded once here instead of once per task. The context manager makes sure
    # the file handle is closed.
    with open(os.path.join(PATH_TO_FEATURES, pathogen, 'IKS.txt')) as f:
        IKs = f.read().splitlines()
    MFPs = np.load(os.path.join(PATH_TO_FEATURES, pathogen, "X.npz"))['X']
    IK_TO_MFP = {i: j for i, j in zip(IKs, MFPs)}

    # For each task
    # NOTE: the [:1] slice restricts the run to the first task only.
    for task in tasks[:1]:

        # # if task is not done yet
        # if os.path.exists(os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""), "LQ_optuna_CV.csv")) == True:
        #     continue

        print(f"TASK: {task}")

        # Create output_dir
        output_dir = os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""))
        os.makedirs(output_dir, exist_ok=True)

        # Load data; the label is read from the third column of the task CSV
        df = pd.read_csv(os.path.join("..", "data", pathogen, task))
        cols = df.columns.tolist()
        X, Y = [], []
        for ik, act in zip(df['inchikey'], df[cols[2]]):
            # Rows whose inchikey has no precomputed feature vector are skipped
            if ik in IK_TO_MFP:
                X.append(IK_TO_MFP[ik])
                Y.append(act)

        # To np.array
        X = np.array(X)
        Y = np.array(Y)

        # Subsample without replacement. The size is capped at the number of
        # available rows: asking for more rows than exist with replace=False
        # raises a ValueError.
        n_samples = min(MAX_SAMPLES, len(X))
        indices = np.random.choice(len(X), size=n_samples, replace=False)
        X = X[indices]
        Y = Y[indices]

        # Cross-validations
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        aurocs_zsRF, aurocs_zsXGB = [], []
        # time_ET is initialised but not accumulated anywhere in this cell
        time_RF, time_XGB, time_ET = 0, 0, 0
        for train_index, test_index in skf.split(X, Y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]

            # zsRF: the timing covers hyperparameter suggestion, fitting and scoring
            init = time.time()
            zero_shot = ZeroShotRandomForestClassifier()
            hyperparams = zero_shot.suggest_hyperparams(X_train, y_train)[0]
            hyperparams['n_jobs'] = 12
            model_zsRF = RandomForestClassifier(**hyperparams)
            model_zsRF.fit(X_train, y_train)
            fpr, tpr, _ = roc_curve(y_test, model_zsRF.predict_proba(X_test)[:, 1])
            auroc = round(auc(fpr, tpr), 3)
            aurocs_zsRF.append(auroc)
            time_RF += time.time() - init

            # zsXGB: same protocol as zsRF
            init = time.time()
            zero_shot = ZeroShotXGBoostClassifier()
            hyperparams = zero_shot.suggest_hyperparams(X_train, y_train)[0]
            hyperparams['n_jobs'] = 12
            model_zsXGB = XGBClassifier(**hyperparams)
            model_zsXGB.fit(X_train, y_train)
            fpr, tpr, _ = roc_curve(y_test, model_zsXGB.predict_proba(X_test)[:, 1])
            auroc = round(auc(fpr, tpr), 3)
            aurocs_zsXGB.append(auroc)
            time_XGB += time.time() - init

            # Only the first fold is evaluated; remove this break to run all folds
            break

        print(f"AUROCs zsRF: {aurocs_zsRF} -- time: {round(time_RF, 2)}s")
        print(f"AUROCs zsXGB: {aurocs_zsXGB} -- time: {round(time_XGB, 2)}s")


        # # Save AUROC CVs zsRF
        # with open(os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""), "LQ_zsRF_CV.csv"), "w") as f:
        #     f.write(",".join([str(round(i, 4)) for i in aurocs_zsRF]))

        # # Save AUROC CVs XGB
        # with open(os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""), "LQ_XGB_CV.csv"), "w") as f:
        #     f.write(",".join([str(round(i, 4)) for i in aurocs_zsXGB
        #                       ]))

----------------------- PATHOGEN: abaumannii_organism ---------------------------
TASK: 1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_10_ORGANISM_1.csv
AUROCs zsRF: [0.612] -- time: 1.31s
AUROCs zsXGB: [0.676] -- time: 63.78s
