In [1]:
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Full list of candidate pathogens, one entry per line
pathogens = [
    "Acinetobacter baumannii",
    "Candida albicans",
    "Campylobacter",
    "Escherichia coli",
    "Enterococcus faecium",
    "Enterobacter",
    "Helicobacter pylori",
    "Klebsiella pneumoniae",
    "Mycobacterium tuberculosis",
    "Neisseria gonorrhoeae",
    "Pseudomonas aeruginosa",
    "Plasmodium falciparum",
    "Staphylococcus aureus",
    "Schistosoma mansoni",
    "Streptococcus pneumoniae",
]

# Restrict this run to a single pathogen (this rebinding replaces the full list above)
pathogens = ["Mycobacterium tuberculosis"]

# Base directory from which the config/output paths in this notebook are built
root = "."

def get_pathogen_code(pathogen):
    """Build the lowercase short code used to name a pathogen's folder and files.

    For a name with two or more whitespace-separated words, the code is the
    first letter of the first word followed by the whole second word
    (e.g. "Escherichia coli" -> "ecoli"); any further words are ignored.
    For a name with fewer than two words, the name itself is returned lowercased.
    """
    words = pathogen.split()
    if len(words) <= 1:
        return pathogen.lower()
    first_word, second_word = words[0], words[1]
    return f"{first_word[0]}{second_word}".lower()

# Get directions: map each (activity_type, unit) pair to its manually curated value
curation_path = os.path.join(root, "..", "config", 'manual_curation', "activity_std_units_curated_manual_curation.csv")
curation_table = pd.read_csv(curation_path)
direction_keys = zip(curation_table['activity_type'], curation_table['unit'])
# If a pair appears more than once, the last row wins
directions = dict(zip(direction_keys, curation_table['manual_curation']))

In [None]:
# Per-assay activity ranges.
# For every pathogen: load its assay table and ChEMBL activities, compute
# min / percentiles / max of the activity values of each assay with >= 100
# compounds, attach the curated direction and write
# <output>/<pathogen_code>/assays_activity_ranges.csv.
# After the loop, PERCENTILES and pathogen_code refer to the last pathogen processed.

# Percentile levels reported for every assay (min and max are added separately)
PERCENTILE_LEVELS = [1, 5, 10, 25, 50, 75, 90, 95, 99]
BIN_COLUMNS = ["min"] + [f"p{level}" for level in PERCENTILE_LEVELS] + ["max", "direction"]

# For each pathogen
for pathogen in pathogens:

    # Get pathogen code
    pathogen_code = get_pathogen_code(pathogen)

    # Get assay info
    ASSAYS_INFO = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, 'assays.csv'))
    ASSAYS_INFO = ASSAYS_INFO[["assay_id", "target_type", "activity_type", "unit", "activities", "nan_values", "cpds"]]
    # Reset the index after filtering: the column-wise concat below aligns on index
    # labels, so leftover gaps would pair assays with the wrong statistics
    ASSAYS_INFO = ASSAYS_INFO[ASSAYS_INFO['cpds'] >= 100].reset_index(drop=True)

    # Load ChEMBL bioactivity data for that pathogen
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, f"{pathogen_code}_ChEMBL_data.csv"), low_memory=False)
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL['compound_chembl_id']))}")

    # Start from an empty accumulator for each pathogen so rows never leak between pathogens
    BIN = []

    # For each assay
    assay_keys = zip(ASSAYS_INFO['assay_id'], ASSAYS_INFO['activity_type'], ASSAYS_INFO['unit'])
    for assay_id, activity_type, unit in tqdm(assay_keys, total=len(ASSAYS_INFO)):

        # Getting ChEMBL bioactivities (a non-string unit means the unit is missing)
        assay_mask = (ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type)
        unit_mask = (ChEMBL['unit'] == unit) if isinstance(unit, str) else ChEMBL['unit'].isna()
        assay_activities = ChEMBL.loc[assay_mask & unit_mask, "value"].astype(float).to_numpy()

        # Calculate data
        min_ = round(np.min(assay_activities), 3)
        percentiles = [round(value, 3) for value in np.percentile(assay_activities, PERCENTILE_LEVELS)]
        max_ = round(np.max(assay_activities), 3)

        # Get direction
        # NOTE(review): for a missing unit the key contains NaN; confirm this lookup resolves as intended
        direction = directions[(activity_type, unit)]

        # Store results
        BIN.append([min_, *percentiles, max_, direction])

    # To pd df (both frames now share a 0..n-1 index, so rows pair up one-to-one)
    BIN = pd.DataFrame(BIN, columns=BIN_COLUMNS)
    PERCENTILES = pd.concat([ASSAYS_INFO, BIN], axis=1)

    # Save results (inside the loop so every pathogen gets its own file)
    PERCENTILES.to_csv(os.path.join(root, "..", "output", pathogen_code, 'assays_activity_ranges.csv'), index=False)

Loading ChEMBL preprocessed data for mtuberculosis...


In [None]:
# Summary of activity ranges per (activity_type, unit) pair.
# For each pair, every threshold column of PERCENTILES is condensed into the string
# "q10 | q50 | q90": the 10th, 50th and 90th percentile of that threshold across
# the assays of the pair. The result is written to
# <output>/<pathogen_code>/stats_activity_ranges.csv.

THRESHOLD_COLUMNS = ["min", "p1", "p5", "p10", "p90", "p95", "p99", "max"]
SUMMARY_QUANTILES = [0.1, 0.5, 0.9]


def _direction_for(activity_type, unit):
    """Return the curated direction for a pair, treating any non-string unit as missing.

    Raises KeyError when the pair is not present in `directions`.
    """
    if isinstance(unit, str):
        return directions[(activity_type, unit)]
    # A NaN inside a tuple key is not guaranteed to match itself, so search explicitly
    for (known_type, known_unit), curated in directions.items():
        if known_type == activity_type and pd.isna(known_unit):
            return curated
    raise KeyError((activity_type, unit))


# Count repetitions of activity_type, unit
# dropna=False keeps the pairs whose unit is missing; groupby drops NaN keys by default,
# which silently removed those pairs from the summary
COUNTS = (
    PERCENTILES
    .groupby(["activity_type", "unit"], dropna=False)
    .size()
    .reset_index(name="count")
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
SUMMARY = []

# For each pair activity_type, unit
for activity_type, unit in zip(COUNTS['activity_type'], COUNTS['unit']):

    # Get direction
    direction = _direction_for(activity_type, unit)

    # Get data per assay, keeping only the specific thresholds
    same_type = PERCENTILES['activity_type'] == activity_type
    same_unit = (PERCENTILES['unit'] == unit) if isinstance(unit, str) else PERCENTILES['unit'].isna()
    df = PERCENTILES.loc[same_type & same_unit, THRESHOLD_COLUMNS]

    # One quantile call per group: rows are the quantile levels, columns the thresholds
    spread = df.quantile(SUMMARY_QUANTILES, axis=0)
    summary = [
        " | ".join(str(round(value, 3)) for value in spread[column].tolist())
        for column in THRESHOLD_COLUMNS
    ]

    # Append direction and store summary
    summary.append(direction)
    SUMMARY.append(summary)

# Concatenate with counts (both frames share a 0..n-1 index)
SUMMARY = pd.DataFrame(SUMMARY, columns=THRESHOLD_COLUMNS + ["direction"])
COUNTS = pd.concat([COUNTS, SUMMARY], axis=1)

# Save results
COUNTS.to_csv(os.path.join(root, "..", "output", pathogen_code, 'stats_activity_ranges.csv'), index=False)

In [7]:
# Inspect the per-assay table: assay metadata plus min / percentiles / max and direction
PERCENTILES

Unnamed: 0,assay_id,target_type,activity_type,unit,activities,nan_values,cpds,min,p1,p5,p10,p25,p50,p75,p90,p95,p99,max,direction
0,CHEMBL4649948,UNCHECKED,PERCENTEFFECT,%,93555,0,86589,-1122.89,-40.220,-25.983,-19.980,-10.670,-1.728,7.254,16.650,24.430,57.080,120.27,1.0
1,CHEMBL4649949,UNCHECKED,PERCENTEFFECT,%,101515,0,86575,-1111.40,-48.489,-29.920,-21.220,-8.404,1.521,12.740,25.580,35.500,64.117,133.09,1.0
2,CHEMBL4649971,ORGANISM,PERCENTEFFECT,%,68619,0,68613,-303.60,-47.290,-30.830,-24.070,-13.540,-2.847,6.803,16.180,23.931,62.017,176.21,1.0
3,CHEMBL4649972,PROTEIN COMPLEX,PERCENTEFFECT,%,68616,0,68610,-4329.36,-46.064,-22.760,-16.560,-9.429,-3.116,2.930,9.035,12.840,20.767,97.51,1.0
4,CHEMBL4649941,SINGLE PROTEIN,PERCENTEFFECT,%,67381,0,66941,-1352.38,-28.038,-9.375,-5.263,-1.196,1.873,4.893,8.418,11.020,17.760,101.82,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,CHEMBL4011042,ORGANISM,GI,%,103,0,103,99.00,99.000,99.000,99.000,99.000,99.000,99.000,99.000,99.000,99.000,99.00,1.0
66,CHEMBL4011010,ORGANISM,MIC50,umol.L-1,103,0,103,2.00,2.000,3.000,3.000,4.000,8.000,14.750,50.000,50.000,50.000,50.00,-1.0
67,CHEMBL4384698,UNCHECKED,IC50,umol.L-1,104,0,102,0.04,0.050,0.052,0.079,0.200,1.995,39.811,100.000,100.000,100.000,100.00,-1.0
68,CHEMBL1948971,SINGLE PROTEIN,INHIBITION,%,100,0,100,0.00,0.000,0.000,0.000,7.000,12.500,24.500,39.100,41.050,49.140,63.00,1.0


In [38]:
# Keep only the assays whose activity_type is 'INHIBITION' (any unit).
# NOTE: this rebinds `df`, a name the summary loop earlier in the notebook also assigns.
df = PERCENTILES[PERCENTILES['activity_type'] == 'INHIBITION']

In [39]:
# Show the current contents of `df`
df

Unnamed: 0,assay_id,target_type,activity_type,unit,activities,nan_values,cpds,min,p1,p5,p10,p25,p50,p75,p90,p95,p99,max,direction
18,CHEMBL2094261,ORGANISM,INHIBITION,%,399,0,399,-14.0,-6.706,-1.592,0.746,6.81,12.0,18.85,28.56,38.65,99.804,102.0,1.0
19,CHEMBL2094262,ORGANISM,INHIBITION,%,399,0,399,-15.4,-4.78,-0.392,2.54,7.625,14.8,31.25,79.18,96.2,98.812,101.0,1.0
29,CHEMBL4388638,ORGANISM,INHIBITION,%,146,0,146,0.0,0.0,0.0,0.0,0.0,23.0,60.5,78.5,84.5,92.65,95.0,1.0
68,CHEMBL1948971,SINGLE PROTEIN,INHIBITION,%,100,0,100,0.0,0.0,0.0,0.0,7.0,12.5,24.5,39.1,41.05,49.14,63.0,1.0
69,CHEMBL1948972,SINGLE PROTEIN,INHIBITION,%,100,0,100,0.0,0.0,0.0,0.0,0.0,0.0,19.0,32.2,39.15,49.21,70.0,1.0


In [37]:
# Number of rows (assays) in `df`.
# NOTE(review): the execution count shows this cell ran before `df` was last reassigned,
# so the saved output may not match the frame displayed above; re-run in order.
len(df)

12

In [36]:
# Count the assays in `df` whose 5th-percentile activity value (p5) is at most 10.
# NOTE(review): the threshold 10 is hard-coded; confirm it is the intended cut-off.
len(df[df['p5'] <= 10])

5

In [26]:
# Quantile of the per-assay minimum activity across the rows of `df`.
# NOTE(review): the level 0.5111... looks like an ad-hoc exploratory value; confirm its
# meaning or replace it with a named constant.
np.quantile(df['min'], 0.5111111111111111)

-776.6397777777779

In [17]:
# 10th percentile of every numeric column, taken across the rows (assays) of `df`.
# numeric_only=True skips the text columns such as assay_id and unit.
df.quantile(0.1, axis=0, numeric_only=True)

activities    44304.2000
nan_values        0.0000
cpds          44300.2000
min           -1947.7760
p1              -54.6720
p5              -36.8580
p10             -27.3320
p25             -14.7300
p50              -4.3580
p75               1.5024
p90               5.2566
p95               8.3372
p99              16.7900
max              91.0720
direction         1.0000
Name: 0.1, dtype: float64

In [10]:
# Inspect the summary table: one row per (activity_type, unit) pair, where each threshold
# column holds "q10 | q50 | q90" of that threshold across the pair's assays
COUNTS

Unnamed: 0,activity_type,unit,count,min,p1,p5,p10,p90,p95,p99,max,direction
0,MIC,umol.L-1,26,0.016 | 0.02 | 0.425,0.02 | 0.03 | 0.655,0.036 | 0.275 | 3.28,0.068 | 0.6 | 6.56,1.64 | 50.0 | 164.362,2.9 | 50.0 | 164.486,7.78 | 50.0 | 167.786,15.5 | 50.0 | 167.969,-1.0
1,IC50,umol.L-1,12,0.054 | 0.796 | 6.942,0.078 | 2.484 | 89.223,0.084 | 11.468 | 93.02,0.137 | 18.527 | 93.02,12.01 | 100.0 | 470.567,12.089 | 100.0 | 470.723,14.01 | 100.0 | 501.187,14.02 | 100.0 | 501.187,-1.0
2,PERCENTEFFECT,%,9,-1947.776 | -822.79 | -51.31,-54.672 | -46.064 | -9.491,-36.858 | -25.983 | -6.547,-27.332 | -19.98 | -5.114,5.257 | 9.035 | 25.89,8.337 | 14.25 | 35.205,16.79 | 35.676 | 62.437,91.072 | 120.27 | 198.592,1.0
3,MIC90,umol.L-1,6,0.04 | 0.07 | 1.6,0.04 | 0.132 | 2.16,0.29 | 0.59 | 2.987,0.814 | 1.55 | 3.7,8.93 | 50.0 | 50.0,9.413 | 50.0 | 50.0,9.836 | 50.0 | 50.0,9.886 | 50.0 | 75.0,-1.0
4,INHIBITION,%,5,-14.84 | 0.0 | 0.0,-5.936 | 0.0 | 0.0,-1.112 | 0.0 | 0.0,0.0 | 0.0 | 1.822,30.016 | 39.1 | 78.908,38.85 | 41.05 | 91.52,49.168 | 92.65 | 99.407,65.8 | 95.0 | 101.6,1.0
5,AC50,umol.L-1,4,0.03 | 0.067 | 0.094,0.131 | 0.383 | 0.843,0.372 | 1.654 | 4.304,0.959 | 3.13 | 9.322,38.593 | 240.0 | 380.0,39.729 | 240.0 | 380.0,42.753 | 240.0 | 380.0,43.328 | 240.0 | 380.0,-1.0
6,GI,%,2,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,99.0 | 99.0 | 99.0,1.0
7,MIC>99,umol.L-1,2,0.03 | 0.03 | 0.03,0.106 | 0.106 | 0.106,0.55 | 0.75 | 0.95,1.0 | 1.0 | 1.0,250.0 | 250.0 | 250.0,250.0 | 250.0 | 250.0,250.0 | 250.0 | 250.0,250.0 | 250.0 | 250.0,-1.0
8,%CONTROL,%,1,62.5 | 62.5 | 62.5,64.071 | 64.071 | 64.071,77.0 | 77.0 | 77.0,78.0 | 78.0 | 78.0,100.0 | 100.0 | 100.0,100.0 | 100.0 | 100.0,100.0 | 100.0 | 100.0,100.0 | 100.0 | 100.0,
9,EC50,umol.L-1,1,0.018 | 0.018 | 0.018,0.074 | 0.074 | 0.074,0.297 | 0.297 | 0.297,0.618 | 0.618 | 0.618,208.1 | 208.1 | 208.1,380.0 | 380.0 | 380.0,380.0 | 380.0 | 380.0,380.0 | 380.0 | 380.0,-1.0
