In [2]:
import pandas as pd
import os

In [3]:
# Get all pathogens i.e. {pathogen}_{target}
PATHOGENS = sorted(os.listdir(os.path.join("..", "data")))

# Load all correlations
correlations = pd.read_csv(os.path.join("../output/05_correlations/05_correlations.tsv"), sep='\t')

In [31]:
def keep_minimum_exps(report):

    pchembl_value, percentile, percentage_activity, grouped_percentiles = {}, {}, {}, {}

    for task in report['task'].tolist():

        if "_percentile_" in task:
            lab = task.split("_percentile_")[0]
            if lab not in percentile:
                percentile[lab] = []
            percentile[lab].append(int(task.split("_percentile_")[1].split("_")[0]))

        elif "_pchembl_value_" in task:
            lab = task.split("_pchembl_value_")[0]
            if lab not in pchembl_value:
                pchembl_value[lab] = []
            pchembl_value[lab].append(int(task.split("_pchembl_value_")[1].split("_")[0]))

        elif "_percentage_activity_" in task:
            lab = task.split("_percentage_activity_")[0]
            if lab not in percentage_activity:
                percentage_activity[lab] = []
            percentage_activity[lab].append(int(task.split("_percentage_activity_")[1].split("_")[0]))

        elif "_grouped_percentiles_" in task:
            lab = task.split("_grouped_percentiles_")[0]
            if lab not in grouped_percentiles:
                grouped_percentiles[lab] = []
            grouped_percentiles[lab].append(int(task.split("_grouped_percentiles_")[1].split("_")[0]))

        else:
            print("Warning! Missing assignment for task:", task)

    keep = []

    for task in report['task'].tolist():
        
        if "_percentile_" in task:
            lab = task.split("_percentile_")[0]
            motif = lab + "_percentile_" + str(int(min(percentile[lab]))) + "_"
        
        elif "_pchembl_value_" in task:
            lab = task.split("_pchembl_value_")[0]
            motif = lab + "_pchembl_value_" + str(int(min(pchembl_value[lab]))) + "_"

        elif "_percentage_activity_" in task:
            lab = task.split("_percentage_activity_")[0]
            motif = lab + "_percentage_activity_" + str(int(min(percentage_activity[lab]))) + "_"

        elif "_grouped_percentiles_" in task:
            lab = task.split("_grouped_percentiles_")[0]
            motif = lab + "_grouped_percentiles_" + str(int(min(grouped_percentiles[lab]))) + "_"

        else:
            print("Warning! Missing assignment for task:", task)
            motif = "Not found"

        if motif in task or motif == 'Not found':
            keep.append(True)
        else:
            keep.append(False)

    return keep

In [32]:
def get_label(report):
    
    labels = []

    for task in report['task'].tolist():
        if "percentile" in task:
            labels.append(3)
        elif "percentage" in task:
            labels.append(2)
        elif "pchembl_value" in task:
            labels.append(1)
        else:
            labels.append(4)

    return labels

In [33]:
# Prioritized datasets
prioritized = []

# For each pathogen
for pathogen in PATHOGENS:

    # Load dataset report
    PATH_TO_CAMT = f"/home/acomajuncosa/Documents/chembl-antimicrobial-tasks/output/{pathogen}/018_selected_tasks_FINAL.csv"
    report = pd.read_csv(PATH_TO_CAMT)

    # Only those having MOD > 0.7
    report = report[report["auroc_avg_MOD"] > 0.7].reset_index(drop=True)

    # Only those having > 1000 samples
    report = report[report['num_samples_MOD'] > 1000].reset_index(drop=True)

    # Only those having > 100 positives
    report = report[report['num_pos_samples'] > 100].reset_index(drop=True)

    # Keeping min percentile/perc_activity/pchembl_value/grouped_percentiles per assay/target
    keep = keep_minimum_exps(report)
    report['keep'] = keep
    report = report[report['keep'] == True].reset_index(drop=True)

    # Only those having at least 1 correlation > 0.5
    correlations_pathogen = correlations[(correlations['Pathogen1'] == pathogen) & 
                                         (correlations['Pathogen2'] == pathogen) &
                                         (correlations['Model1'] == "RF") & 
                                         (correlations['Model2'] == "RF") & 
                                         (correlations['Same task'] == False)].reset_index(drop=True)
    correlations_pathogen[correlations_pathogen['Spearman statistic'] > 0.5].reset_index(drop=True)
    corr = set(correlations_pathogen['Task1'].tolist() + correlations_pathogen['Task2'].tolist())
    corr = set(["_".join(i.split("_")[:-1]) for i in corr])
    report = report[report['task'].isin(corr)].reset_index(drop=True)

    # Give priority to each label
    labels = get_label(report)
    report['labels'] = labels

    # Sort by priority first, then by number of positive samples
    report = report.sort_values(by=['priority', 'labels', 'num_pos_samples'], ascending=[True, True, False]).reset_index(drop=True)

    # Store results
    TOP = 3
    prioritized.extend([[pathogen, i] for i in report['task'].tolist()[:TOP]])

prioritized = pd.DataFrame(prioritized, columns=['Pathogen', 'Task'])



In [41]:
pd.set_option('display.max_colwidth', 100)
prioritized

Unnamed: 0,Pathogen,Task
0,abaumannii_organism,1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_1_ORGANISM
1,abaumannii_organism,2_target_CHEMBL614425_mic_um_pchembl_value_5_ORGANISM
2,abaumannii_organism,2_target_CHEMBL614425_mic_um_pchembl_percentile_5_ORGANISM
3,calbicans_organism,1_assay_CHEMBL4296189_Inhibition_percentage_activity_percentile_5_ORGANISM
4,calbicans_organism,2_target_CHEMBL366_mic80_um_pchembl_value_6_ORGANISM
5,calbicans_organism,2_target_CHEMBL366_mic_um_pchembl_value_7_ORGANISM
6,ecoli_organism,1_assay_CHEMBL4296185_Inhibition_percentage_activity_percentile_5_ORGANISM
7,ecoli_organism,2_target_CHEMBL354_mic_um_pchembl_value_7_ORGANISM
8,ecoli_organism,2_target_CHEMBL354_ic50_um_pchembl_value_5_ORGANISM
9,efaecium_organism,2_target_CHEMBL357_mic_um_pchembl_value_6_ORGANISM


In [42]:
len(prioritized)

35

In [44]:
PATH_TO_CAMT = f"/home/acomajuncosa/Documents/chembl-antimicrobial-tasks/output/abaumannii_organism/018_selected_tasks_FINAL.csv"
report = pd.read_csv(PATH_TO_CAMT)
report

Unnamed: 0,task,auroc_avg_MOD,auroc_std_MOD,num_samples_MOD,num_pos_samples,pos:neg_MOD,auroc_avg_DIS,auroc_std_DIS,num_samples_DIS,pos:neg_DIS,SELECTED,RED,priority
0,1_assay_CHEMBL4296188_MIC_pchembl_percentile_1_ORGANISM,0.9324,0.1254,1032,10,0.0098,0.95,0.1,50,0.25,1,1,1
1,1_assay_CHEMBL4296188_MIC_pchembl_value_5_ORGANISM,0.8546,0.211,783,21,0.0276,0.8544,0.1823,105,0.25,1,1,1
2,1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_1_ORGANISM,0.8235,0.0511,21482,214,0.0101,0.7715,0.0255,1070,0.25,1,1,1
3,1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_5_ORGANISM,0.7834,0.0066,21482,1072,0.0525,0.8176,0.0066,5360,0.25,1,1,1
4,1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_10_ORGANISM,0.7168,0.0171,21482,2144,0.1109,0.8573,0.0137,10720,0.25,1,1,1
5,1_assay_CHEMBL4296188_MIC_pchembl_percentile_5_ORGANISM,0.7135,0.0603,1032,50,0.0509,0.688,0.0566,250,0.25,1,1,1
6,1_assay_CHEMBL4296188_Inhibition_percentage_activity_percentile_50_ORGANISM,0.6334,0.0038,21482,10744,1.0006,0.9305,0.0052,53720,0.25,2,1,1
7,1_assay_CHEMBL4296188_MIC_pchembl_percentile_10_ORGANISM,0.6023,0.0545,1032,103,0.1109,0.7157,0.0187,515,0.25,3,1,1
8,1_assay_CHEMBL4296193_Inhibition_percentage_activity_percentile_10_ORGANISM,0.5797,0.0351,1565,156,0.1107,0.787,0.0385,780,0.25,3,1,1
9,1_assay_CHEMBL4296193_Inhibition_percentage_activity_percentile_25_ORGANISM,0.5712,0.0242,1565,391,0.333,0.7913,0.0194,1955,0.25,3,1,1
