In [11]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import joblib
import os

# Get all pathogens i.e. {pathogen}_{target}
# Only sub-directories are kept: a stray file in the data folder (e.g. .DS_Store)
# would otherwise be treated as a pathogen and fail when its tasks are listed.
PATHOGENS = sorted(
    entry
    for entry in os.listdir(os.path.join("..", "data"))
    if os.path.isdir(os.path.join("..", "data", entry))
)

# Define some paths
PATH_TO_FEATURES = os.path.join("..", "output", "02_features")
PATH_TO_OUTPUT = os.path.join("..", "output", "03_baseline_models")

In [15]:
def NaiveBayesClassificationModel(X, Y, n_folds=5, random_state=42):
    """Fit a multinomial Naive Bayes model and score it by stratified CV.

    A model is first fitted on the full data set; independently, a fresh model
    is trained and evaluated on each stratified fold to estimate the AUROC.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample. Dense inputs (lists, DataFrames,
        arrays) are converted to a NumPy array so rows can be selected by
        position; objects exposing ``tocsr`` (scipy sparse) are left untouched.
    Y : array-like
        Labels, one per row of ``X``. The second column of ``predict_proba``
        is used as the score, so a binary target is expected.
    n_folds : int, default=5
        Number of stratified folds.
    random_state : int, default=42
        Seed used to shuffle the samples before splitting into folds.

    Returns
    -------
    model_all : MultinomialNB
        Model fitted on all of ``X`` and ``Y``.
    aurocs : list of str
        Per-fold AUROC, rounded to 4 decimals and converted to strings.
    """
    # Positional indexing below (X[idx], Y[idx]) is only reliable on arrays:
    # lists raise, and pandas objects would index by label / column instead.
    Y = np.asarray(Y)
    if not hasattr(X, "tocsr"):
        X = np.asarray(X)

    # Fit model with all data
    model_all = MultinomialNB()
    model_all.fit(X, Y)

    # Cross-validations
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    aurocs = []
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # A fresh model per fold so no information leaks from the held-out part
        model_cv = MultinomialNB()
        model_cv.fit(X_train, y_train)

        # Score of the second class (column 1) is used to build the ROC curve
        fpr, tpr, _ = roc_curve(y_test, model_cv.predict_proba(X_test)[:, 1])
        aurocs.append(auc(fpr, tpr))

    return model_all, [str(round(i, 4)) for i in aurocs]

In [21]:
for pathogen in PATHOGENS:

    # Get list of tasks
    tasks = sorted(os.listdir(os.path.join("..", "data", pathogen)))

    # Get IK to MFP: map each InChIKey to the matching row of the feature matrix.
    # Context managers close the text file and the .npz archive once read,
    # instead of leaving the handles open for the rest of the kernel session.
    with open(os.path.join(PATH_TO_FEATURES, pathogen, 'IKS.txt')) as iks_file:
        IKs = iks_file.read().splitlines()
    with np.load(os.path.join(PATH_TO_FEATURES, pathogen, "X.npz")) as features_npz:
        MFPs = features_npz['X']
    IK_TO_MFP = dict(zip(IKs, MFPs))

    # For each task
    for task in tasks:

        # Create output_dir
        output_dir = os.path.join(PATH_TO_OUTPUT, pathogen, task.replace(".csv", ""))
        os.makedirs(output_dir, exist_ok=True)

        # Load data; the third column of the CSV is used as the label
        df = pd.read_csv(os.path.join("..", "data", pathogen, task))
        cols = df.columns.tolist()
        X, Y = [], []
        for ik, act in zip(df['inchikey'], df[cols[2]]):
            # Rows whose InChIKey has no precomputed features are skipped
            if ik in IK_TO_MFP:
                X.append(IK_TO_MFP[ik])
                Y.append(act)

        # To np.array
        X = np.array(X)
        Y = np.array(Y)

        # Naive Bayes: persist the full-data model and the per-fold AUROCs
        NB, results_NB = NaiveBayesClassificationModel(X, Y)
        joblib.dump(NB, os.path.join(output_dir, "NB.joblib"))
        with open(os.path.join(output_dir, "NB_CV.csv"), "w") as f:
            f.write(",".join(results_NB))

        # Random Forest

        # FLAML

        # NOTE(review): looks like a leftover debugging stop - only the first
        # task is processed. Remove to run every task.
        break

    # NOTE(review): looks like a leftover debugging stop - only the first
    # pathogen is processed. Remove to run every pathogen.
    break

'0.7015,0.7142,0.7052,0.6905,0.7068'