In [90]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Location of the pipeline outputs for the pathogen under study.
data_dir = "../scripts/dest"
pathogen_code = "abaumannii"
tasks_dir = os.path.join(data_dir, pathogen_code, "02_raw_tasks")

# A task counts as modelable when its average AUROC reaches MIN_AUROC and
# positives make up at most MAX_POSITIVE_RATIO of its samples.
MIN_AUROC = 0.7
MAX_POSITIVE_RATIO = 0.5

dm = pd.read_csv(os.path.join(data_dir, pathogen_code, "04_modelability.csv"))

# Fraction of positive samples per task, computed column-wise. A task with
# zero samples yields inf/NaN here and is excluded by the ratio filter below.
dm["ratio"] = dm["num_pos_samples"] / dm["num_samples"]

# The priority is encoded as the first character of the task name.
dm["priority"] = dm["task"].str[0].astype(int)

is_modelable = (dm["auroc_avg"] >= MIN_AUROC) & (dm["ratio"] <= MAX_POSITIVE_RATIO)
modelable_tasks = set(dm.loc[is_modelable, "task"])


In [91]:
# For every modelable task, collect the set of InChIKeys labelled positive.
positive_sets = {}
for task in sorted(os.listdir(tasks_dir)):
    # Split off the real extension instead of blindly dropping four
    # characters, so that only genuine CSV files are ever parsed.
    fname, extension = os.path.splitext(task)
    if extension.lower() != ".csv" or fname not in modelable_tasks:
        continue
    df = pd.read_csv(os.path.join(tasks_dir, task))
    # The last column is used as the label; rows equal to 1 are positives.
    c = df.columns[-1]
    positive_sets[fname] = set(df.loc[df[c] == 1, "inchikey"])


In [92]:
def positive_overlaps(positive_sets, modelability=None):
    """Compare the positive sets of every unordered pair of tasks.

    Parameters
    ----------
    positive_sets : dict
        Maps a task name to the set of identifiers labelled positive for it.
    modelability : pandas.DataFrame, optional
        Table with the columns ``task``, ``auroc_avg`` and ``num_samples``.
        Defaults to the notebook-level ``dm`` table. When a task appears
        several times, its first row is used.

    Returns
    -------
    pandas.DataFrame
        One row per pair (``task1`` sorts before ``task2``) with the columns
        ``task1``, ``task2``, ``n1``, ``n2``, ``overlap``, ``overlap_index``
        (overlap / size of the smaller set), ``jaccard_index``
        (overlap / size of the union), ``priority1``, ``priority2``,
        ``auroc1``, ``auroc2``, ``n_total1`` and ``n_total2``.

    Notes
    -----
    Both indices are reported as 0.0 when their denominator is zero (an empty
    positive set), instead of raising ``ZeroDivisionError``. A task that is
    absent from ``modelability`` gets NaN for its AUROC and sample count
    rather than silently inheriting the values of a previous pair.
    """
    if modelability is None:
        # Backward-compatible default: the notebook's modelability table.
        modelability = dm

    # Build the per-task lookup once, by column name rather than position.
    # setdefault keeps the first row seen for a task.
    stats = {}
    for name, auroc, n_total in zip(
        modelability["task"], modelability["auroc_avg"], modelability["num_samples"]
    ):
        stats.setdefault(name, (auroc, n_total))
    missing = (float("nan"), float("nan"))

    columns = [
        "task1", "task2", "n1", "n2", "overlap", "overlap_index",
        "jaccard_index", "priority1", "priority2", "auroc1", "auroc2",
        "n_total1", "n_total2",
    ]

    # Task names are unique dict keys, so after sorting every later entry is
    # strictly greater than the current one.
    tasks = sorted(positive_sets)
    rows = []
    for position, task1 in enumerate(tqdm(tasks)):
        set1 = positive_sets[task1]
        n1 = len(set1)
        p1 = int(task1[0])  # priority = first character of the task name
        auroc1, n_total1 = stats.get(task1, missing)
        for task2 in tasks[position + 1:]:
            set2 = positive_sets[task2]
            n2 = len(set2)
            o = len(set1.intersection(set2))
            smaller = min(n1, n2)
            union_size = n1 + n2 - o
            oi = o / smaller if smaller else 0.0
            t = o / union_size if union_size else 0.0
            p2 = int(task2[0])
            auroc2, n_total2 = stats.get(task2, missing)
            rows.append(
                [task1, task2, n1, n2, o, oi, t, p1, p2, auroc1, auroc2, n_total1, n_total2]
            )
    return pd.DataFrame(rows, columns=columns)

# Pairwise overlap between the positive sets, most similar pairs first.
dp = positive_overlaps(positive_sets)
dp = dp.sort_values("jaccard_index", ascending=False)

# Pairs whose Jaccard index exceeds this value are treated as redundant.
JACCARD_REDUNDANCY_CUTOFF = 0.8
# A task wins on size only if it has more than this factor times the samples.
SAMPLE_SIZE_MARGIN = 1.25


def _redundant_task(pair):
    """Return the task of a near-duplicate pair that should be dropped.

    The task that is kept is, in order of precedence: the one with the lower
    priority number, the one with clearly more samples (by
    SAMPLE_SIZE_MARGIN), and finally the one with the higher AUROC. On an
    exact AUROC tie, task1 is dropped.
    """
    if pair.priority1 < pair.priority2:
        return pair.task2
    if pair.priority1 > pair.priority2:
        return pair.task1
    if pair.n_total1 > pair.n_total2 * SAMPLE_SIZE_MARGIN:
        return pair.task2
    if pair.n_total2 > pair.n_total1 * SAMPLE_SIZE_MARGIN:
        return pair.task1
    # Keep the better-performing task. The comparison used to be inverted,
    # which discarded the task with the higher AUROC.
    if pair.auroc1 > pair.auroc2:
        return pair.task2
    return pair.task1


redundant_pairs = dp[dp["jaccard_index"] > JACCARD_REDUNDANCY_CUTOFF]
to_remove = {_redundant_task(pair) for pair in redundant_pairs.itertuples(index=False)}
to_remove = list(to_remove)

valid_tasks = sorted(modelable_tasks - set(to_remove))

# Bounds on the number of positives: 10th and 90th percentile, computed over
# the tasks in dm before the filters below are applied.
lb = np.percentile(dm["num_pos_samples"], 10)
ub = np.percentile(dm["num_pos_samples"], 90)

dm = dm[dm["task"].isin(valid_tasks)]

dm = dm[dm["num_pos_samples"] >= lb]
dm = dm[dm["num_pos_samples"] <= ub]

# Display only: the sorted view is not assigned back to dm.
dm.sort_values(by=["priority", "auroc_avg"], ascending=[True, False])

100%|██████████| 49/49 [00:00<00:00, 142.96it/s]


Unnamed: 0,task,auroc_avg,auroc_std,num_samples,num_pos_samples,ratio,priority
12,1_assay_CHEMBL4296188_MIC_pchembl_value_5,0.923613,0.083607,783,21,0.02682,1
40,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,0.817319,0.029191,21482,214,0.009962,1
35,1_assay_CHEMBL4296188_MIC_pchembl_percentile_5,0.779588,0.062568,1032,50,0.04845,1
47,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,0.775595,0.018438,21482,1072,0.049902,1
63,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,0.722768,0.010079,21482,2144,0.099804,1
42,2_target_CHEMBL614425_activity_%_percentage_ac...,0.962245,0.041273,106,36,0.339623,2
23,2_target_CHEMBL614425_activity_%_percentage_ac...,0.933333,0.070229,100,46,0.46,2
38,2_target_CHEMBL614425_mic_um_pchembl_percentil...,0.926135,0.009381,6139,583,0.094967,2
28,2_target_CHEMBL614425_mic_um_pchembl_value_5,0.924645,0.008123,5924,1416,0.239028,2
4,2_target_CHEMBL614425_mic_um_pchembl_percentil...,0.924007,0.007402,6139,2726,0.444046,2


In [96]:
# Drop the 50th-percentile variant of a task family whenever that family
# also has at least one other percentile cut-off among the remaining tasks.
import collections

# family name (text before "_percentile_") -> [(task name, percentile), ...]
percentile_names = collections.defaultdict(list)
for task_name in dm["task"].tolist():
    if "_percentile_" not in task_name:
        continue
    pieces = task_name.split("_percentile_")
    cutoff = int(pieces[1])
    percentile_names[pieces[0]].append((task_name, cutoff))

to_remove = set()
for variants in percentile_names.values():
    if len(variants) <= 1:
        # A family with a single variant keeps it, even if it is the 50th.
        continue
    to_remove.update(name for name, cutoff in variants if cutoff == 50)

dm = dm[~dm["task"].isin(to_remove)]



In [104]:
# Drop the "_percentage_activity_50" variant of a task family whenever the
# family has other percentage-activity cut-offs. Names containing
# "_percentile_" are left out of this pass.
to_remove = []

# family name (text before "_percentage_activity_") -> [(task name, cut-off), ...]
percentage_activity_names = collections.defaultdict(list)
marker = "_percentage_activity_"
candidates = [
    name
    for name in dm["task"].tolist()
    if "_percentile_" not in name and marker in name
]
for name in candidates:
    parts = name.split(marker)
    cutoff = int(parts[1])
    percentage_activity_names[parts[0]].append((name, cutoff))

for group in percentage_activity_names.values():
    if len(group) > 1:
        to_remove.extend(name for name, cutoff in group if cutoff == 50)

to_remove = set(to_remove)
dm = dm[~dm["task"].isin(to_remove)]

In [105]:
# Display the current contents of the task table `dm`.
dm

Unnamed: 0,task,auroc_avg,auroc_std,num_samples,num_pos_samples,ratio,priority
0,4_all_percentage_activity_percentile_5,0.807591,0.016325,23392,1059,0.045272,4
11,2_target_CHEMBL614425_inhibition_%_percentage_...,0.898196,0.061765,23282,32,0.001374,2
12,1_assay_CHEMBL4296188_MIC_pchembl_value_5,0.923613,0.083607,783,21,0.02682,1
13,4_all_percentage_activity_percentile_1,0.873007,0.007251,23392,143,0.006113,4
15,4_all_percentage_activity_90,0.87817,0.062886,23393,44,0.001881,4
16,5_grouped_percentiles_1,0.783655,0.046867,26886,526,0.019564,5
18,2_target_CHEMBL614425_inhibition_%_percentage_...,0.802013,0.035747,23280,211,0.009064,2
19,2_target_CHEMBL614425_mic_um_pchembl_percentile_5,0.907628,0.016619,6139,292,0.047565,2
24,2_target_CHEMBL614425_mic_um_pchembl_percentile_1,0.891568,0.070901,6139,62,0.010099,2
25,2_target_CHEMBL614425_mic_um_pchembl_value_7,0.8597,0.049562,6143,86,0.014,2


In [108]:
# Rank the remaining tasks (lowest priority number first, then highest
# average AUROC), keep the first 25 and write them to the selection file.
ranked = dm.sort_values(by=["priority", "auroc_avg"], ascending=[True, False])
dm = ranked.head(25)
selected_tasks_path = os.path.join(data_dir, pathogen_code, "05_selected_tasks.csv")
dm.to_csv(selected_tasks_path, index=False)

Unnamed: 0,task,auroc_avg,auroc_std,num_samples,num_pos_samples,ratio,priority
12,1_assay_CHEMBL4296188_MIC_pchembl_value_5,0.923613,0.083607,783,21,0.02682,1
40,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,0.817319,0.029191,21482,214,0.009962,1
35,1_assay_CHEMBL4296188_MIC_pchembl_percentile_5,0.779588,0.062568,1032,50,0.04845,1
47,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,0.775595,0.018438,21482,1072,0.049902,1
63,1_assay_CHEMBL4296188_Inhibition_percentage_ac...,0.722768,0.010079,21482,2144,0.099804,1
42,2_target_CHEMBL614425_activity_%_percentage_ac...,0.962245,0.041273,106,36,0.339623,2
38,2_target_CHEMBL614425_mic_um_pchembl_percentil...,0.926135,0.009381,6139,583,0.094967,2
28,2_target_CHEMBL614425_mic_um_pchembl_value_5,0.924645,0.008123,5924,1416,0.239028,2
39,2_target_CHEMBL614425_mic_um_pchembl_value_8,0.919047,0.067513,6139,35,0.005701,2
19,2_target_CHEMBL614425_mic_um_pchembl_percentile_5,0.907628,0.016619,6139,292,0.047565,2


In [95]:
# Display the percentile groups collected in `percentile_names`.
percentile_names


defaultdict(list,
            {'4_all_percentage_activity': [('4_all_percentage_activity_percentile_5',
               5),
              ('4_all_percentage_activity_percentile_1', 1)],
             '2_target_CHEMBL614425_mic_um_pchembl': [('2_target_CHEMBL614425_mic_um_pchembl_percentile_50',
               50),
              ('2_target_CHEMBL614425_mic_um_pchembl_percentile_5', 5),
              ('2_target_CHEMBL614425_mic_um_pchembl_percentile_1', 1),
              ('2_target_CHEMBL614425_mic_um_pchembl_percentile_10', 10)],
             '2_target_CHEMBL614425_inhibition_%_percentage_activity': [('2_target_CHEMBL614425_inhibition_%_percentage_activity_percentile_1',
               1)],
             '1_assay_CHEMBL4296188_MIC_pchembl': [('1_assay_CHEMBL4296188_MIC_pchembl_percentile_5',
               5)],
             '1_assay_CHEMBL4296188_Inhibition_percentage_activity': [('1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_1',
               1),
              ('1_assa