In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from flaml.automl import AutoML
from collections import Counter
from lazyqsar.models import LazyXGBoostBinaryClassifier as LQ_XGB
import pandas as pd
import numpy as np
import collections
import joblib
import os

In [None]:
# Baseline models: for every pathogen/task, run a stratified k-fold
# cross-validation of the LazyQSAR XGBoost classifier on precomputed features
# and store the per-fold AUROCs in "<output>/<pathogen>/<task>/LQ_optuna_CV.csv".

# Debug limits: only the first N pathogens / tasks are processed.
# Set to None to process everything (a slice with None keeps the whole list).
MAX_PATHOGENS = 1
MAX_TASKS = 1

# Cross-validation settings
N_SPLITS = 5
RANDOM_STATE = 42

# Get all pathogens i.e. {pathogen}_{target}
PATHOGENS = sorted(os.listdir(os.path.join("..", "data")))[:MAX_PATHOGENS]

# Define some paths
PATH_TO_FEATURES = os.path.join("..", "output", "02_features")
PATH_TO_OUTPUT = os.path.join("..", "output", "03_baseline_models")

for pathogen in PATHOGENS:

    print(f"----------------------- PATHOGEN: {pathogen} ---------------------------")

    # Get list of tasks
    tasks = sorted(os.listdir(os.path.join("..", "data", pathogen)))

    # The feature files live at pathogen level, so the IK -> MFP map is the same
    # for every task. It is loaded lazily (only if some task still needs to be
    # run) and at most once per pathogen.
    IK_TO_MFP = None

    # For each task
    for task in tasks[:MAX_TASKS]:

        # Single source of truth for the output locations of this task
        output_dir = os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""))
        cv_file = os.path.join(output_dir, "LQ_optuna_CV.csv")

        # Skip tasks that are already done
        if os.path.exists(cv_file):
            continue

        print(f"TASK: {task}")

        # Create output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Get IK to MFP (row i of X.npz['X'] is paired with line i of IKS.txt).
        # Context managers make sure both files are closed after reading.
        # NOTE(review): MFP presumably stands for Morgan fingerprint - confirm.
        if IK_TO_MFP is None:
            with open(os.path.join(PATH_TO_FEATURES, pathogen, 'IKS.txt')) as iks_file:
                IKs = iks_file.read().splitlines()
            with np.load(os.path.join(PATH_TO_FEATURES, pathogen, "X.npz")) as npz:
                MFPs = npz['X']
            IK_TO_MFP = dict(zip(IKs, MFPs))

        # Load data, keeping only compounds for which features are available.
        # The label is taken positionally from the third column of the task file.
        df = pd.read_csv(os.path.join("..", "data", pathogen, task))
        cols = df.columns.tolist()
        X, Y = [], []
        for ik, act in zip(df['inchikey'], df[cols[2]]):
            if ik in IK_TO_MFP:
                X.append(IK_TO_MFP[ik])
                Y.append(act)

        # To np.array
        X = np.array(X)
        Y = np.array(Y)

        # Nothing to cross-validate: skip instead of letting StratifiedKFold
        # raise and abort the remaining tasks. No CV file is written, so the
        # task is retried on the next run.
        if len(X) == 0:
            print(f"No compounds with precomputed features for {task}; skipping")
            continue

        # TODO: also fit and save a model on all data (an earlier draft did this
        # with lq.LazyBinaryQSAR(descriptor_type='morgan', model_type="xgboost")
        # saved under "LQ_optuna").

        # Cross-validations
        skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        aurocs = []
        for train_index, test_index in skf.split(X, Y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            model_cv = LQ_XGB(reducer_method='pca')
            model_cv.fit(X=X_train, y=y_train)
            # NOTE(review): AUROC is only meaningful on continuous scores. Confirm
            # that model_cv.predict returns probabilities/scores and not hard 0/1
            # labels; if it returns labels, use the model's probability output.
            fpr, tpr, _ = roc_curve(y_test, model_cv.predict(X_test))
            auroc = auc(fpr, tpr)
            aurocs.append(auroc)

        # Save AUROC CVs (one comma-separated line, rounded to 4 decimals)
        with open(cv_file, "w") as f:
            f.write(",".join([str(round(i, 4)) for i in aurocs]))

In [None]:
# Recompute the ROC curve from the variables the cross-validation loop left in
# the kernel (y_test, model_cv, X_test). This cell fails with a NameError on a
# fresh kernel if the loop did not run any fold.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=model_cv.predict(X_test))

In [None]:
# Area under the ROC curve described by the current fpr / tpr arrays
# (kept as the last expression so the notebook displays the value).
auc(x=fpr, y=tpr)