In [22]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [3]:
pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
             "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
             "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]

pathogens = ["Mycobacterium tuberculosis"]
root = "."

def get_pathogen_code(pathogen):
    return str(pathogen.split()[0][0] + pathogen.split()[1]).lower() if len(pathogen.split()) > 1 else pathogen.lower()

In [17]:
MIN, PERC_1, PERC_25, PERC_50, PERC_75, PERC_99, MAX = [], [], [], [], [], [], []

In [None]:
# For each pathogen
for pathogen in pathogens:

    # Get pathogen code
    pathogen_code = get_pathogen_code(pathogen)

    # Get assay info
    ASSAYS_INFO = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, 'assays.csv'))

    # Load ChEMBL bioactivity data for that pathogen
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, f"{pathogen_code}_ChEMBL_data.csv"), low_memory=False)
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL['compound_chembl_id']))}")

    # For each assay
    for assay_id, activity_type, unit in tqdm(zip(ASSAYS_INFO['assay_id'], ASSAYS_INFO['activity_type'], ASSAYS_INFO['unit'])):

        # Getting ChEMBL bioactivities
        if type(unit) == str:
            assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'] == unit)]["value"].astype(float).tolist()
        else:
            assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'].isna())]["value"].astype(float).tolist()
        
        # Calculate data
        min_ = round(np.min(assay_activities), 3)
        p1 = round(np.percentile(assay_activities, 1), 3)
        p25 = round(np.percentile(assay_activities, 25), 3)
        p50 = round(np.percentile(assay_activities, 50), 3)
        p75 = round(np.percentile(assay_activities, 75), 3)
        p99 = round(np.percentile(assay_activities, 99), 3)
        max_ = round(np.max(assay_activities), 3)

        # Store results
        MIN.append(min_)
        PERC_1.append(p1)
        PERC_25.append(p25)
        PERC_50.append(p50)
        PERC_75.append(p75)
        PERC_99.append(p99)
        MAX.append(max_)


Loading ChEMBL preprocessed data for mtuberculosis...
Number of activities for mtuberculosis: 714221
Number of compounds for mtuberculosis: 132378


11571it [14:50, 13.71it/s]

In [10]:
Counter([(i,j) for i,j in zip(ASSAYS_INFO['activity_type'], ASSAYS_INFO['unit'])])

Counter({('MIC', 'umol.L-1'): 4916,
         ('ACTIVITY', nan): 1193,
         ('INHIBITION', '%'): 1068,
         ('IC50', 'umol.L-1'): 877,
         ('MIC90', 'umol.L-1'): 610,
         ('RATIO', nan): 401,
         ('ACTIVITY', '%'): 381,
         ('GI', '%'): 351,
         ('MIC', nan): 303,
         ('ACTIVITY', 'log10[CFU]'): 241,
         ('MIC99', 'umol.L-1'): 240,
         ('KI', 'umol.L-1'): 225,
         ('LOG10CFU', nan): 205,
         ('FC', nan): 200,
         ('MIC50', 'umol.L-1'): 170,
         ('KD', 'umol.L-1'): 123,
         ('IC50', nan): 116,
         ('IC90', 'umol.L-1'): 101,
         ('LOG10CFUML', nan): 98,
         ('MIC>99', 'umol.L-1'): 90,
         ('GI', nan): 73,
         ('FICI', nan): 65,
         ('MIC90', nan): 63,
         ('MIC=>90', 'umol.L-1'): 54,
         ('MBC', 'umol.L-1'): 54,
         ('DELTATM', 'Cel'): 54,
         ('LOG(ACTIVITY)', nan): 52,
         ('RATIOIC50', nan): 50,
         ('FT>MIC', '%'): 48,
         ('MIC>90', 'umol.L-1'): 41

In [12]:
ASSAYS_INFO

Unnamed: 0,assay_id,assay_type,assay_organism,doc_chembl_id,target_type,target_chembl_id,target_organism,activity_type,unit,activities,nan_values,cpds
0,CHEMBL4649948,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,93555,0,86589
1,CHEMBL4649949,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,101515,0,86575
2,CHEMBL4649971,F,Mycobacterium tuberculosis,CHEMBL3988442,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,PERCENTEFFECT,%,68619,0,68613
3,CHEMBL4649972,F,Mycobacterium tuberculosis,CHEMBL3988442,PROTEIN COMPLEX,CHEMBL4662931,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,68616,0,68610
4,CHEMBL4649941,F,Mycobacterium tuberculosis,CHEMBL3988442,SINGLE PROTEIN,CHEMBL4662928,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,67381,0,66941
...,...,...,...,...,...,...,...,...,...,...,...,...
13582,CHEMBL5126010,B,Mycobacterium tuberculosis,CHEMBL5120950,SINGLE PROTEIN,CHEMBL5169231,Mycobacterium tuberculosis,INHIBITION,%,1,1,1
13583,CHEMBL5126009,B,Mycobacterium tuberculosis,CHEMBL5120950,SINGLE PROTEIN,CHEMBL5169231,Mycobacterium tuberculosis,INHIBITION,%,1,1,1
13584,CHEMBL5126008,B,Mycobacterium tuberculosis,CHEMBL5120950,SINGLE PROTEIN,CHEMBL5169231,Mycobacterium tuberculosis,INHIBITION,%,1,1,1
13585,CHEMBL5126007,B,Mycobacterium tuberculosis,CHEMBL5120950,SINGLE PROTEIN,CHEMBL5169231,Mycobacterium tuberculosis,INHIBITION,%,1,1,1
