In [14]:
from collections import defaultdict
from scipy.stats import spearmanr
from collections import Counter
import numpy as np
import pandas as pd
import sys
import os

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_colwidth", 50)
pd.set_option("display.width", None)

In [3]:
# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Load pathogen info
# pathogen_code = sys.argv[1]
pathogen_code = 'mtuberculosis'
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
row = df.loc[df["code"].eq(pathogen_code)]
if row.empty: 
    raise SystemExit(f"Unknown code: {pathogen_code}")
pathogen = row.iloc[0]["pathogen"]

# Define output directory
OUTPUT = os.path.join(root, "..", "output")

# Path to correlations
PATH_TO_CORRELATIONS = os.path.join(OUTPUT, "mtuberculosis", "correlations")
STRATEGIES = ['A', 'B', "M"]

# Load probs ref data
PROBS_REF = {}
for strategy in STRATEGIES:
    PROBS_REF[strategy] = {}
    if os.path.exists(os.path.join(PATH_TO_CORRELATIONS, strategy)):
        for dataset in sorted(os.listdir(os.path.join(PATH_TO_CORRELATIONS, strategy))):
            name = dataset.replace("_ref_probs.npz", "")
            probs = np.load(os.path.join(PATH_TO_CORRELATIONS, strategy, dataset))['y_prob_ref']
            PROBS_REF[strategy][name] = probs

# Load ChEMBL data for pathogen
ChEMBL_pathogen = pd.read_csv(os.path.join(OUTPUT, pathogen_code, f"{pathogen_code}_ChEMBL_cleaned_data.csv.gz"), low_memory=False)

# Dict mapping assay_id, activity_type and unit to a set of compound ChEMBL IDs
ASSAY_TO_COMPOUNDS = defaultdict(set)
for assay_id, activity_type, unit, compound_chembl_id in ChEMBL_pathogen[["assay_chembl_id", "activity_type", "unit", "compound_chembl_id"]].values:
    ASSAY_TO_COMPOUNDS[(assay_id, activity_type, unit)].add(compound_chembl_id)
del ChEMBL_pathogen

# Get all compounds from pathogen
compounds = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "compound_counts.csv.gz"))
compounds = set(compounds['compound_chembl_id'])

In [4]:
def calculate_spearman(probs1, probs2):
    return spearmanr(probs1, probs2)

def hit_overlap_chance(probs1, probs2, TOP=100):
    N = len(probs1)
    ind1 = set(np.argsort(probs1)[::-1][:TOP])
    ind2 = set(np.argsort(probs2)[::-1][:TOP])
    m = len(ind1.intersection(ind2))
    expected = TOP * TOP/N  # expected intersection size under chance
    return (m - expected) / (TOP - expected)

In [5]:
# Load data
INDIVIDUAL_SELECTED_LM = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "individual_selected_LM.csv"))
MERGED_SELECTED_LM = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "merged_selected_LM.csv"))
print(f"Total number of datasets: {len(INDIVIDUAL_SELECTED_LM) + len(MERGED_SELECTED_LM)}")

# Filtering for ORGANISM
INDIVIDUAL_SELECTED_LM = INDIVIDUAL_SELECTED_LM[INDIVIDUAL_SELECTED_LM['target_type'] == 'ORGANISM'].reset_index(drop=True)
MERGED_SELECTED_LM = MERGED_SELECTED_LM[MERGED_SELECTED_LM['target_type'] == 'ORGANISM'].reset_index(drop=True)
print(f"Total number of ORGANISM datasets: {len(INDIVIDUAL_SELECTED_LM) + len(MERGED_SELECTED_LM)}")

# Defining final number of compounds in A & B datasets (max among qt and mx)
INDIVIDUAL_SELECTED_LM['cpds'] = INDIVIDUAL_SELECTED_LM[['cpds_qt', 'cpds_mx']].max(axis=1)
INDIVIDUAL_SELECTED_LM['positives'] = INDIVIDUAL_SELECTED_LM[['pos_qt', 'pos_mx']].max(axis=1)

# Define names in A & B datasets
names_AB = []
for assay_id, activity_type, unit, dataset_type, cutoff in INDIVIDUAL_SELECTED_LM[["assay_id", "activity_type", "unit", "dataset_type", "cutoff"]].values:
    dty = "qt" if dataset_type == "quantitative" else "mx"
    names_AB.append(f"{assay_id}_{activity_type}_{unit}_{dty}_{cutoff}")
INDIVIDUAL_SELECTED_LM['name'] = names_AB

# Join tables
INDIVIDUAL_SELECTED_LM_tmp = INDIVIDUAL_SELECTED_LM.copy()
INDIVIDUAL_SELECTED_LM_tmp['assay_keys'] = [str(tuple(i)) for i in INDIVIDUAL_SELECTED_LM_tmp[['assay_id', 'activity_type', 'unit']].values]
INDIVIDUAL_SELECTED_LM_tmp = INDIVIDUAL_SELECTED_LM_tmp.drop(columns=['assay_id', 'is_mid_cutoff', 'pos_qt', 'ratio_qt', 'cpds_qt', 'pos_ql', 'ratio_ql', 'cpds_ql', 'pos_mx', 'ratio_mx', 'cpds_mx', 'overlap_mx'])
INDIVIDUAL_SELECTED_LM_tmp['n_assays'] = 1
MERGED_SELECTED_LM_tmp = MERGED_SELECTED_LM.copy()
MERGED_SELECTED_LM_tmp = MERGED_SELECTED_LM_tmp.drop(columns=['direction', 'assay_type', 'bao_label', 'is_mid_cutoff', 'ratio', 'avg', 'std', 'strain', 'target_chembl_id'])
MERGED_SELECTED_LM_tmp = MERGED_SELECTED_LM_tmp.rename(columns={'n_cpds_union': 'cpds'})
MERGED_SELECTED_LM_tmp['label'] = 'M'
MERGED_SELECTED_LM_tmp['dataset_type'] = np.nan
FINAL_DATASETS = pd.concat([INDIVIDUAL_SELECTED_LM_tmp, MERGED_SELECTED_LM_tmp], ignore_index=True)

# Sorting by compounds
FINAL_DATASETS = FINAL_DATASETS.sort_values(by=["label", "cpds"], ascending=[True, False])

# Mapping names to assay keys
name_to_assaykeys = {i: [eval(k) for k in j.split(";")] for i,j in zip(FINAL_DATASETS['name'], FINAL_DATASETS['assay_keys'])}

# Get names
NAMES = FINAL_DATASETS[['label', 'name']].values.tolist()

Total number of datasets: 36
Total number of ORGANISM datasets: 29


In [6]:
# Map name to compounds
NAME_TO_COMPOUNDS = {}
for st, name in NAMES:
    assays = name_to_assaykeys[name]
    cpds = set([cpd for assay in assays for cpd in ASSAY_TO_COMPOUNDS[assay]])
    NAME_TO_COMPOUNDS[name] = cpds

In [7]:
RESULTS = []
RESULTS_DICT = dict()
for n1 in NAMES:
    for n2 in NAMES:
        st1, name1 = n1
        st2, name2 = n2
        probs1 = PROBS_REF[st1][name1]
        probs2 = PROBS_REF[st2][name2]
        cpds1 = NAME_TO_COMPOUNDS[name1]
        cpds2 = NAME_TO_COMPOUNDS[name2]
        a = round(calculate_spearman(probs1, probs2).statistic, 4)
        b = round(hit_overlap_chance(probs1, probs2, TOP=1000), 4)
        c = round(hit_overlap_chance(probs1, probs2, TOP=100), 4)
        d = round(len(cpds1.intersection(cpds2)) / min(len(cpds1), len(cpds2)), 4)
        RESULTS.append([st1, name1, st2, name2, a, b, c, d])
        RESULTS_DICT[(st1, name1, st2, name2)] = (a, b, c, d)

RESULTS = pd.DataFrame(RESULTS, columns=['strategy_1', 'name_1', 'strategy_2', 'name_2', 'spearman', 'hit_overlap_1000', 'hit_overlap_100', 'compound_overlap'])
RESULTS.to_csv(os.path.join(OUTPUT, pathogen_code, 'dataset_correlations.csv'), index=False)

In [18]:
SELECTED = []

for label, name in FINAL_DATASETS[['label', 'name']].values:

    select = True
    for previously_selected in SELECTED:
        st1, name1 = label, name
        st2, name2 = previously_selected
        a, b, c, d = RESULTS_DICT[(st1, name1, st2, name2)]
        if a+b+c/3 > 0.5 and d > 0.5:
            select = False
            break
    
    if select == True:
        SELECTED.append([label, name])

FINAL_DATASETS['selected'] = [i in SELECTED for i in FINAL_DATASETS[['label', 'name']].values.tolist()]
FINAL_DATASETS.to_csv(os.path.join(OUTPUT, pathogen_code, 'final_datasets.csv'), index=False)
FINAL_DATASETS

Unnamed: 0,label,activity_type,unit,target_type,cutoff,AUROC,dataset_type,cpds,positives,name,assay_keys,n_assays,selected
4,A,PERCENTEFFECT,%,ORGANISM,50.0,0.706,quantitative,86589.0,1268.0,CHEMBL4649948_PERCENTEFFECT_%_qt_50.0,"('CHEMBL4649948', 'PERCENTEFFECT', '%')",1,True
0,A,PERCENTEFFECT,%,ORGANISM,50.0,0.74,quantitative,86575.0,2181.0,CHEMBL4649949_PERCENTEFFECT_%_qt_50.0,"('CHEMBL4649949', 'PERCENTEFFECT', '%')",1,False
6,A,PERCENTEFFECT,%,ORGANISM,50.0,0.779,quantitative,68613.0,934.0,CHEMBL4649971_PERCENTEFFECT_%_qt_50.0,"('CHEMBL4649971', 'PERCENTEFFECT', '%')",1,False
5,A,PERCENTEFFECT,%,ORGANISM,50.0,0.754,quantitative,53165.0,898.0,CHEMBL4649961_PERCENTEFFECT_%_qt_50.0,"('CHEMBL4649961', 'PERCENTEFFECT', '%')",1,False
1,A,IC50,umol.L-1,ORGANISM,20.0,0.759,quantitative,2468.0,209.0,CHEMBL4649949_IC50_umol.L-1_qt_20.0,"('CHEMBL4649949', 'IC50', 'umol.L-1')",1,False
3,A,IC50,umol.L-1,ORGANISM,20.0,0.761,quantitative,2466.0,207.0,CHEMBL4649948_IC50_umol.L-1_qt_20.0,"('CHEMBL4649948', 'IC50', 'umol.L-1')",1,False
7,A,IC50,umol.L-1,ORGANISM,5.0,0.704,mixed,2148.0,758.0,CHEMBL1738598_IC50_umol.L-1_mx_5.0,"('CHEMBL1738598', 'IC50', 'umol.L-1')",1,True
8,A,IC50,umol.L-1,ORGANISM,10.0,0.734,mixed,2041.0,945.0,CHEMBL1614183_IC50_umol.L-1_mx_10.0,"('CHEMBL1614183', 'IC50', 'umol.L-1')",1,True
2,A,IC50,umol.L-1,ORGANISM,10.0,0.762,mixed,1453.0,667.0,CHEMBL1738696_IC50_umol.L-1_mx_10.0,"('CHEMBL1738696', 'IC50', 'umol.L-1')",1,False
18,B,IC50,umol.L-1,ORGANISM,20.0,0.74,mixed,1933.0,1069.0,CHEMBL1614289_IC50_umol.L-1_mx_20.0,"('CHEMBL1614289', 'IC50', 'umol.L-1')",1,True


In [19]:
Counter(FINAL_DATASETS[FINAL_DATASETS['selected']]['label'])

Counter({'B': 8, 'M': 8, 'A': 3})

In [20]:
FINAL_COVERAGE = {i: set() for i in 'ABM'}
for label, assay_keys in FINAL_DATASETS[['label', 'assay_keys']].values:
    for assay_key in assay_keys.split(";"):
        assay_key = eval(assay_key)
        FINAL_COVERAGE[label].update(ASSAY_TO_COMPOUNDS[assay_key])

print(f"Final coverage A: {round(100* len(FINAL_COVERAGE['A']) / len(compounds), 1)}%")
print(f"Final coverage B: {round(100* len(FINAL_COVERAGE['B']) / len(compounds), 1)}%")
print(f"Final coverage M: {round(100* len(FINAL_COVERAGE['M']) / len(compounds), 1)}%")
print(f"Final coverage ALL: {round(100* len(FINAL_COVERAGE['A'].union(FINAL_COVERAGE['B']).union(FINAL_COVERAGE['M'])) / len(compounds), 1)}%")

Final coverage A: 65.3%
Final coverage B: 2.5%
Final coverage M: 20.0%
Final coverage ALL: 87.4%
