In [1]:
import pandas as pd
import sys
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from IPython.display import display, HTML
from scipy.stats import spearmanr
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import zipfile
import random
import gzip
import sys
import h5py
import os

# Pandas display settings: no column limit, no wrapping of wide frames,
# a very wide console width and untruncated cell contents.
for option_name, option_value in [
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
    ("display.width", 2000),
    ("display.max_colwidth", None),
]:
    pd.set_option(option_name, option_value)

# Inject CSS so that rendered DataFrame cells stay on a single line.
nowrap_style = HTML("""
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
""")
display(nowrap_style)


In [12]:
# Root directory of the notebook.
# (Script equivalent: os.path.dirname(os.path.abspath(__file__)))
root = "."

# Make the project's src/ folder importable, then pull the shared paths from it.
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Pathogen to process.
# (Script equivalent: sys.argv[1])
pathogen_code = 'mtuberculosis'

# Look the pathogen code up in the config table; stop if it is not listed.
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
row = df[df["code"] == pathogen_code]
if len(row) == 0:
    raise SystemExit(f"Unknown code: {pathogen_code}")
pathogen = row.iloc[0]["pathogen"]

# Path of the output directory (only the path is built here, nothing is created).
OUTPUT = os.path.join(root, "..", "output")

# Columns that identify one row across the assay tables.
KEYS = ["assay_id", "activity_type", "unit"]

# Columns to take from each table.
COLUMNS_CLEANED = [
    "assay_id", "assay_type", "assay_organism", "doc_chembl_id",
    "target_type", "target_chembl_id", "target_organism",
    "activity_type", "unit", "canonical_unit",
    "activities", "nan_values", "cpds", "direction",
    "activity_comment_counts", "standard_text_count",
]
COLUMNS_CLUSTERS = ["clusters_0.3", "clusters_0.6", "clusters_0.85"]
COLUMNS_DATASETS = [
    "equal", "higher", "lower",
    "dataset_type", "cpds_qt",
    "min_", "p1", "p25", "p50", "p75", "p99", "max_",
    "pos_ql", "ratio_ql", "cpds_ql",
]

In [15]:
# Load the per-pathogen tables written to the output directory.
pathogen_dir = os.path.join(OUTPUT, pathogen_code)
ASSAYS_CLEANED = pd.read_csv(os.path.join(pathogen_dir, "assays_cleaned.csv"))
ASSAYS_CLUSTERS = pd.read_csv(os.path.join(pathogen_dir, "assays_clusters.csv"))
ASSAYS_PARAMETERS = pd.read_csv(os.path.join(pathogen_dir, "assays_parameters.csv"))
ASSAYS_DATASETS = pd.read_csv(os.path.join(pathogen_dir, "assays_datasets.csv"))
INDIVIDUAL_LM = pd.read_csv(os.path.join(pathogen_dir, "individual_LM.csv"))

# Map each (assay_id, activity_type, unit) key to the list of
# [expert_cutoff, ratio_qt] pairs found for it, in file order.
assay_to_qt_info = defaultdict(list)
qt_rows = ASSAYS_DATASETS[KEYS + ['expert_cutoff', 'ratio_qt']].values
for *key_fields, cutoff_value, ratio_value in qt_rows:
    assay_to_qt_info[tuple(key_fields)].append([cutoff_value, ratio_value])

# Keep only the key and dataset columns and drop repeated rows.
ASSAYS_DATASETS = ASSAYS_DATASETS[KEYS + COLUMNS_DATASETS].drop_duplicates().reset_index(drop=True)

# For every remaining row, join its cutoffs and its ratios into ';'-separated strings.
cutoffs, ratios = [], []
for assay_key in map(tuple, ASSAYS_DATASETS[KEYS].values):
    qt_pairs = assay_to_qt_info[assay_key]
    cutoffs.append(";".join(str(pair[0]) for pair in qt_pairs))
    ratios.append(";".join(str(pair[1]) for pair in qt_pairs))

# Store both strings as new columns at positions 8 and 9.
ASSAYS_DATASETS.insert(8, 'cutoffs', cutoffs)
ASSAYS_DATASETS.insert(9, 'ratios', ratios)

In [17]:
# Row count of each of the four assay tables, shown as the cell output.
len(ASSAYS_CLEANED), len(ASSAYS_CLUSTERS), len(ASSAYS_PARAMETERS), len(ASSAYS_DATASETS)

(10532, 10532, 10532, 10532)

In [25]:
# Ordered list of column names.
# NOTE(review): presumably the column order of a merged assay table - confirm against the cell that uses it.
ALL_COLS = [
    "assay_id", "assay_type", "assay_organism", "target_organism", "organism_curated",
    "doc_chembl_id",
    "target_type", "target_type_curated", "target_type_curated_extra",
    "target_chembl_id", "target_chembl_id_curated", "target_name_curated",
    "bao_label", "source_label",
    "strain", "atcc_id", "mutations", "known_drug_resistances", "media",
    "activity_type", "unit",
    "activities", "nan_values", "cpds", "direction",
    "act_flag", "inact_flag",
    "dataset_type", "cpds_qt",
    "pos_ql", "ratio_ql", "cpds_ql",
    "min_", "p1", "p25", "p50", "p75", "p99", "max_",
]

In [24]:
# Display the assay parameters table read from assays_parameters.csv.
ASSAYS_PARAMETERS

Unnamed: 0,assay_id,activity_type,unit,organism_curated,target_type_curated,target_name_curated,target_chembl_id_curated,strain,atcc_id,mutations,known_drug_resistances,media
0,CHEMBL4649948,PERCENTEFFECT,%,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis,,,,,,DPPC; cholesterol; tyloxapol based media
1,CHEMBL4649949,PERCENTEFFECT,%,Mycobacterium tuberculosis,ORGANISM,,,,,,,7H9; glucose tyloxapol based media
2,CHEMBL4649971,PERCENTEFFECT,%,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis,,,,,,
3,CHEMBL4649972,PERCENTEFFECT,%,Mycobacterium tuberculosis,PROTEIN COMPLEX,ClpP1P2,,H37Rv,ATCC 25618,,,
4,CHEMBL4649941,PERCENTEFFECT,%,Mycobacterium tuberculosis,SINGLE PROTEIN,MtCoaBC,,H37Rv,ATCC 25618,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
10527,CHEMBL4153752,INHIBITION,%,Mycobacterium tuberculosis,SINGLE PROTEIN,InhA,,H37Rv,,,,
10528,CHEMBL4153751,INHIBITION,%,Mycobacterium tuberculosis,SINGLE PROTEIN,InhA,,H37Rv,,,isoniazid,
10529,CHEMBL4153750,INHIBITION,%,Mycobacterium tuberculosis,SINGLE PROTEIN,InhA,,H37Rv,,,INH,
10530,CHEMBL5226835,MIC50,umol.L-1,Mycobacterium tuberculosis variant bovis,ORGANISM,,,BCG,,,,


In [3]:
def get_all_results_from_individual_modeling(LABELS, individual_lm=None, auroc_threshold=0.7):
    """Collect, per label, the assays that were evaluated and those that scored well.

    Parameters
    ----------
    LABELS : iterable of str
        Label names. For each label the table must contain a column named
        after the label (used as a boolean row mask) and a score column
        named ``f"{LABEL}_AVG"`` (called ``auroc`` in this code).
    individual_lm : pandas.DataFrame, optional
        Table with the columns ``assay_id``, ``activity_type``, ``unit`` and
        ``expert_cutoff`` plus the per-label columns above. When omitted, the
        notebook-level ``INDIVIDUAL_LM`` table is used.
    auroc_threshold : float, optional
        A row is accepted only when its score is strictly greater than this
        value. Defaults to 0.7.

    Returns
    -------
    RESULTS : dict
        ``RESULTS[label][(assay_id, activity_type, unit)]`` is the
        ``[expert_cutoff, score]`` pair of the best-scoring accepted row for
        that key. On ties the first row encountered is kept.
    CONSIDERED_ASSAYS : dict
        ``CONSIDERED_ASSAYS[label]`` is the set of every key whose row is
        selected by the label's mask, whatever its score.
    """
    if individual_lm is None:
        # Fall back to the table loaded at notebook level.
        individual_lm = INDIVIDUAL_LM

    RESULTS, CONSIDERED_ASSAYS = {}, {}
    for LABEL in LABELS:
        accepted, considered = {}, set()
        RESULTS[LABEL] = accepted
        CONSIDERED_ASSAYS[LABEL] = considered
        columns = ["assay_id", "activity_type", "unit", "expert_cutoff", f"{LABEL}_AVG"]
        rows = individual_lm[individual_lm[LABEL]][columns].values
        for assay_id, activity_type, unit, expert_cutoff, auroc in rows:
            key = (assay_id, activity_type, unit)
            considered.add(key)
            # A NaN score fails this comparison, so such rows are never accepted.
            if auroc > auroc_threshold:
                # Keep only the best-scoring cutoff per key (strict '>' keeps the first on ties).
                if key not in accepted or auroc > accepted[key][1]:
                    accepted[key] = [expert_cutoff, auroc]
    return RESULTS, CONSIDERED_ASSAYS

# Get results from individual modeling ABCD
# ACCEPTED_ASSAYS[label]: assay key -> [expert_cutoff, score] for keys with a score above 0.7
# CONSIDERED_ASSAYS[label]: set of every assay key selected for that label
LABELS = ['A', 'B', 'C', 'D']
ACCEPTED_ASSAYS, CONSIDERED_ASSAYS = get_all_results_from_individual_modeling(LABELS)

In [4]:
def where_considered(key, LABELS, CONSIDERED_ASSAYS):
    """Return the labels whose considered-assay collection contains `key`.

    The matching labels are joined with ';' in the order given by `LABELS`.
    If no label matches, `np.nan` is returned instead of an empty string.
    """
    matching = [label for label in LABELS if key in CONSIDERED_ASSAYS[label]]
    return ";".join(matching) if matching else np.nan
    
def where_accepted(key, LABELS, ACCEPTED_ASSAYS):
    """Return the labels whose accepted-assay collection contains `key`.

    The matching labels are joined with ';' in the order given by `LABELS`.
    If no label matches, `np.nan` is returned instead of an empty string.
    """
    matching = [label for label in LABELS if key in ACCEPTED_ASSAYS[label]]
    return ";".join(matching) if matching else np.nan

In [7]:
# One (assay_id, activity_type, unit) key per row of the cleaned assay table.
assay_keys = list(map(tuple, ASSAYS_CLEANED[["assay_id", "activity_type", "unit"]].values))

# Labels in which each assay was considered / accepted (NaN when there are none).
col_considered = [where_considered(assay_key, LABELS, CONSIDERED_ASSAYS) for assay_key in assay_keys]
col_accepted = [where_accepted(assay_key, LABELS, ACCEPTED_ASSAYS) for assay_key in assay_keys]

ASSAYS_CLEANED['Accepted'] = col_accepted
ASSAYS_CLEANED['Considered'] = col_considered

In [11]:
# Display the cleaned assay table, which now carries the 'Accepted' and 'Considered' columns.
ASSAYS_CLEANED

Unnamed: 0,assay_id,assay_type,assay_organism,doc_chembl_id,target_type,target_chembl_id,target_organism,bao_label,source_label,activity_type,unit,activities,nan_values,cpds,act_flag,inact_flag,frac_cs,direction,Accepted,Considered
0,CHEMBL4649948,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,organism-based format,GATES_LIBRARY,PERCENTEFFECT,%,93555,0,86589,0,0,0.62500,1.0,A,A
1,CHEMBL4649949,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,organism-based format,GATES_LIBRARY,PERCENTEFFECT,%,101515,0,86575,0,0,0.62490,1.0,A,A
2,CHEMBL4649971,F,Mycobacterium tuberculosis,CHEMBL3988442,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,organism-based format,GATES_LIBRARY,PERCENTEFFECT,%,68619,0,68613,0,0,0.49525,1.0,A,A
3,CHEMBL4649972,F,Mycobacterium tuberculosis,CHEMBL3988442,PROTEIN COMPLEX,CHEMBL4662931,Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv),assay format,GATES_LIBRARY,PERCENTEFFECT,%,68616,0,68610,0,0,0.49523,1.0,,A
4,CHEMBL4649941,F,Mycobacterium tuberculosis,CHEMBL3988442,SINGLE PROTEIN,CHEMBL4662928,Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv),assay format,GATES_LIBRARY,PERCENTEFFECT,%,67381,0,66941,0,0,0.48318,1.0,,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,CHEMBL4153752,B,Mycobacterium tuberculosis H37Rv,CHEMBL4152223,SINGLE PROTEIN,CHEMBL1849,Mycobacterium tuberculosis,single protein format,LITERATURE,INHIBITION,%,1,1,1,1,0,0.00001,1.0,,
10528,CHEMBL4153751,B,Mycobacterium tuberculosis H37Rv,CHEMBL4152223,SINGLE PROTEIN,CHEMBL1849,Mycobacterium tuberculosis,single protein format,LITERATURE,INHIBITION,%,1,1,1,1,0,0.00001,1.0,,
10529,CHEMBL4153750,B,Mycobacterium tuberculosis H37Rv,CHEMBL4152223,SINGLE PROTEIN,CHEMBL1849,Mycobacterium tuberculosis,single protein format,LITERATURE,INHIBITION,%,1,1,1,1,0,0.00001,1.0,,
10530,CHEMBL5226835,F,Mycobacterium tuberculosis variant bovis,CHEMBL5226287,ORGANISM,CHEMBL613086,Mycobacterium tuberculosis variant bovis,organism-based format,LITERATURE,MIC50,umol.L-1,1,0,1,0,0,0.00001,-1.0,,
