In [44]:
# imports
import os
import csv
import sys
import tqdm
import joblib
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

# root = os.path.abspath(os.path.dirname(__file__))
# sys.path.append(root)
pathogen = "abaumannii_organism"
PATH_TO_MODELS = "../other/models"
PATH_TO_OUTPUT = os.path.join(PATH_TO_MODELS, f"model_{pathogen}")
root = os.path.join(PATH_TO_OUTPUT, "framework", "code")

# current file directory
checkpoints_dir = os.path.join(root, "..", "..", "checkpoints")
tasks = pd.read_csv(os.path.join(root, "..", "columns", "run_columns.csv"))['name'].tolist()

# parse arguments
# input_file = sys.argv[1]
# output_file = sys.argv[2]

input_file = os.path.join(root, "..", "examples", "run_input.csv")
output_file = os.path.join(root, "..", "examples", "run_output.csv")

# Read smiles
with open(input_file, "r") as f:
    smiles = []
    reader = csv.reader(f)
    next(reader)
    for r in reader:
        smiles += [r[0]]

# Get Morgan fingerprints
X = []
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
for smi in smiles:
    mol = Chem.MolFromSmiles(smi)
    mfp = mfpgen.GetCountFingerprint(mol)
    X.append(mfp.ToList())

# Convert to numpy array
X = np.array(X, dtype=np.int16)

# Create output DataFrame
OUTPUT = pd.DataFrame({"smiles": smiles})  # We will remove this column later

# For each task, load the model and make predictions
for task in tasks:

    # Load the model
    model = joblib.load(os.path.join(checkpoints_dir, task + "_RF.joblib"))

    # Save predictions
    preds = model.predict_proba(X)[:, 1]
    OUTPUT[task] = preds
    OUTPUT[task] = OUTPUT[task].astype(float)

# Remove smiles column
OUTPUT.drop(columns=["smiles"], inplace=True)
columns = OUTPUT.columns.tolist()

# Load PowerTransformer
pt = joblib.load(os.path.join(PATH_TO_OUTPUT, "checkpoints", "RF_PowerTransformer.joblib"))

# Power Transform the predictions
OUTPUT_transformed = pt.transform(OUTPUT)
OUTPUT_transformed = pd.DataFrame(OUTPUT_transformed, columns=columns)

# Load dataset report
report = pd.read_csv(os.path.join(root, "..", "..", "checkpoints", "018_selected_tasks_FINAL.csv"))

In [45]:
# Get dataset metadata
task_to_auroc, task_to_priority, task_to_positives, task_to_max_correlation = {}, {}, {}, {}
for task, auroc_avg_MOD, num_pos_samples, auroc_avg_DIS, priority in zip(report["task"], report["auroc_avg_MOD"], report["num_pos_samples"], report["auroc_avg_DIS"], report["priority"]):
    task_to_priority[task] = priority
    task_to_positives[task] = num_pos_samples
    if auroc_avg_MOD > 0.7:
        task_to_auroc[task] = auroc_avg_MOD
    else:
        task_to_auroc[task] = auroc_avg_DIS

In [61]:
# Load correlations
correlations = pd.read_csv(os.path.join("../output/05_correlations/05_correlations.tsv"), sep='\t')
correlations = correlations[(correlations['Pathogen1'] == pathogen) & 
                            (correlations['Pathogen2'] == pathogen) &
                            (correlations['Model1'] == "RF") & 
                            (correlations['Model2'] == "RF") & 
                            (correlations['Same task'] == False)].reset_index(drop=True)

In [70]:
# Dict mapping tasks to correlations
task_to_correlations = {}

for task in tasks:

    # Initialize the task in the dictionary
    task_to_correlations[task] = {}

    # Get the correlations for the task
    corr = correlations[(correlations["Task1"] == task) | (correlations["Task2"] == task)]
    for t1, t2, corr_value in zip(corr["Task1"], corr["Task2"], corr["Spearman statistic"]):
        if t1 == task:
            task_to_correlations[task][t2] = round(corr_value, 3)
        else:
            task_to_correlations[task][t1] = round(corr_value, 3)

In [76]:
[i for i in task_to_correlations if len(task_to_correlations[i]) == 0]

['2_target_CHEMBL614425_inhibition__percentage_activity_50_ORGANISM_1',
 '2_target_CHEMBL614425_inhibition__percentage_activity_percentile_1_ORGANISM_1']

In [74]:
[len(task_to_correlations[i]) for i in task_to_correlations]

[24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 0,
 0,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24]

Unnamed: 0,Pathogen1,Task1,Model1,Pathogen2,Task2,Model2,Spearman statistic,Spearman pvalue,Pearson statistic,Pearson pvalue,Kendall statistic,Kendall pvalue,Same pathogen,Same task,Same model
0,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,0.715976,0.0,0.624602,0.0,0.532965,0.0,True,False,True
1,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,-0.253953,1.0059739999999999e-174,-0.196739,2.831306e-104,-0.169547,1.4703450000000002e-169,True,False,True
2,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,0.918818,0.0,0.91053,0.0,0.763619,0.0,True,False,True
3,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_MIC_pchembl_percentile_1...,RF,0.162216,4.677668e-71,0.133934,8.118881e-49,0.109974,5.36366e-72,True,False,True
4,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_MIC_pchembl_percentile_1...,RF,0.130144,3.60718e-46,0.037962,3.400579e-05,0.101627,1.409649e-46,True,False,True
5,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_MIC_pchembl_percentile_5...,RF,0.314676,4.199114e-272,0.162859,1.296153e-71,0.217745,2.383902e-270,True,False,True
6,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_MIC_pchembl_value_5_ORGA...,RF,0.325813,1.112536e-292,0.184341,1.4563819999999998e-91,0.232418,1.974653e-290,True,False,True
7,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296193_Inhibition_percentage_ac...,RF,-0.180366,1.1262969999999998e-87,-0.243779,9.660983999999999e-161,-0.118794,7.606852e-84,True,False,True
8,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296193_Inhibition_percentage_ac...,RF,-0.197961,1.414375e-105,-0.249244,3.590058e-168,-0.131245,6.051373e-102,True,False,True
9,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,2_target_CHEMBL614425_inhibition_%_percentage_...,RF,0.411012,0.0,0.144244,2.049279e-56,0.310033,0.0,True,False,True


In [64]:
correlations

Unnamed: 0,Pathogen1,Task1,Model1,Pathogen2,Task2,Model2,Spearman statistic,Spearman pvalue,Pearson statistic,Pearson pvalue,Kendall statistic,Kendall pvalue,Same pathogen,Same task,Same model
0,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,0.715976,0.000000e+00,0.624602,0.000000e+00,0.532965,0.000000e+00,True,False,True
1,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,-0.253953,1.005974e-174,-0.196739,2.831306e-104,-0.169547,1.470345e-169,True,False,True
2,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,0.918818,0.000000e+00,0.910530,0.000000e+00,0.763619,0.000000e+00,True,False,True
3,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_MIC_pchembl_percentile_1...,RF,0.162216,4.677668e-71,0.133934,8.118881e-49,0.109974,5.363660e-72,True,False,True
4,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,RF,abaumannii_organism,1_assay_CHEMBL4296188_MIC_pchembl_percentile_1...,RF,0.130144,3.607180e-46,0.037962,3.400579e-05,0.101627,1.409649e-46,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,abaumannii_organism,4_all_percentage_activity_90_ORGANISM_1,RF,abaumannii_organism,4_all_percentage_activity_percentile_5_ORGANISM_1,RF,0.510625,0.000000e+00,0.339751,1.100185e-319,0.393347,0.000000e+00,True,False,True
296,abaumannii_organism,4_all_percentage_activity_90_ORGANISM_1,RF,abaumannii_organism,5_grouped_percentiles_1_ORGANISM_1,RF,0.467544,0.000000e+00,0.217285,2.603083e-127,0.357466,0.000000e+00,True,False,True
297,abaumannii_organism,4_all_percentage_activity_percentile_1_ORGANISM_1,RF,abaumannii_organism,4_all_percentage_activity_percentile_5_ORGANISM_1,RF,0.649607,0.000000e+00,0.560865,0.000000e+00,0.500641,0.000000e+00,True,False,True
298,abaumannii_organism,4_all_percentage_activity_percentile_1_ORGANISM_1,RF,abaumannii_organism,5_grouped_percentiles_1_ORGANISM_1,RF,0.636750,0.000000e+00,0.417454,0.000000e+00,0.484707,0.000000e+00,True,False,True
