In [1]:
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import re

In [2]:
pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
             "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
             "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]

pathogens = ["Mycobacterium tuberculosis"]
pathogens = ["Plasmodium falciparum"]
root = "."

def get_pathogen_code(pathogen):
    return str(pathogen.split()[0][0] + pathogen.split()[1]).lower() if len(pathogen.split()) > 1 else pathogen.lower()

# Get directions
directions = pd.read_csv(os.path.join(root, "..", "config", 'manual_curation', "activity_std_units_curated_manual_curation.csv"))
directions = {(i,j): k for i,j,k in zip(directions['activity_type'], directions['unit'], directions['manual_curation'])}

In [14]:
BIN = []

# For each pathogen
for pathogen in pathogens:

    # Get pathogen code
    pathogen_code = get_pathogen_code(pathogen)

    # Get assay info
    ASSAYS_INFO = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, 'assays.csv'))
    ASSAYS_INFO = ASSAYS_INFO[["assay_id", "target_type", "activity_type", "unit", "activities", "nan_values", "cpds"]]
    # ASSAYS_INFO = ASSAYS_INFO[ASSAYS_INFO['cpds'] >= 0]

    # Load ChEMBL bioactivity data for that pathogen
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, f"{pathogen_code}_ChEMBL_data.csv"), low_memory=False)
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL['compound_chembl_id']))}")

    # For each assay
    for assay_id, activity_type, unit in tqdm(zip(ASSAYS_INFO['assay_id'], ASSAYS_INFO['activity_type'], ASSAYS_INFO['unit'])):

        # Getting ChEMBL bioactivities
        if type(unit) == str:
            assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'] == unit)]["value"].astype(float).tolist()
        else:
            assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'].isna())]["value"].astype(float).tolist()

        # Remove nans
        assay_activities = [i for i in assay_activities if np.isnan(i) == False]
        if len(assay_activities) == 0:
            assay_activities = [np.nan]
        
        # Calculate data
        min_ = round(np.min(assay_activities), 3)
        p1 = round(np.percentile(assay_activities, 1), 3)
        p5 = round(np.percentile(assay_activities, 5), 3)
        p10 = round(np.percentile(assay_activities, 10), 3)
        p25 = round(np.percentile(assay_activities, 25), 3)
        p50 = round(np.percentile(assay_activities, 50), 3)
        p75 = round(np.percentile(assay_activities, 75), 3)
        p90 = round(np.percentile(assay_activities, 90), 3)
        p95 = round(np.percentile(assay_activities, 95), 3)
        p99 = round(np.percentile(assay_activities, 99), 3)
        max_ = round(np.max(assay_activities), 3)

        # Get direction
        direction = directions[(activity_type, unit)]

        # Store results
        BIN.append([min_, p1, p5, p10, p25, p50, p75, p90, p95, p99, max_, direction])

# To pd df
BIN = pd.DataFrame(BIN, columns=["min", "p1", "p5", "p10", "p25", "p50", "p75", "p90", "p95", "p99", "max", "direction"])
PERCENTILES = pd.concat([ASSAYS_INFO, BIN], axis=1)

# Save results
PERCENTILES.to_csv(os.path.join(root, "..", "output", pathogen_code, 'assays_activity_ranges.csv'), index=False)

Loading ChEMBL preprocessed data for pfalciparum...
Number of activities for pfalciparum: 1079517
Number of compounds for pfalciparum: 498660


7184it [13:39,  8.77it/s]


KeyboardInterrupt: 

In [12]:
assay_activities

[]

In [8]:
PERCENTILES

Unnamed: 0,assay_id,target_type,activity_type,unit,activities,nan_values,cpds,min,p1,p5,p10,p25,p50,p75,p90,p95,p99,max,direction
0,CHEMBL1794345,ORGANISM,POTENCY,umol.L-1,170312,0,169986,0.0,0.003,0.131,0.738,3.294,10.418,14.716,18.526,18.526,20.786,36.964,-1.0
1,CHEMBL4888485,ORGANISM,ZSCORE,,147589,0,147429,-21.25,-4.4,-1.81,-1.03,-0.14,0.66,1.36,1.98,2.38,3.47,16.34,0.0
2,CHEMBL4888485,ORGANISM,INHIBITION,%,147589,0,147429,-87.0,-26.0,-17.0,-14.0,-10.0,-5.0,1.0,9.0,22.0,39.0,105.0,1.0
3,CHEMBL1794580,ORGANISM,POTENCY,umol.L-1,131037,0,130745,0.0,0.004,0.294,1.169,4.148,10.418,13.115,18.526,18.526,20.786,41.474,-1.0
4,CHEMBL4649943,SINGLE PROTEIN,PERCENTEFFECT,%,68619,0,68613,-52.1,-6.561,-3.338,-1.944,-0.039,1.995,4.327,6.626,8.163,12.25,100.0,1.0
5,CHEMBL4513221,ORGANISM,INHIBITION,%,68570,0,68570,0.04,0.89,1.0,1.03,1.07,1.12,1.17,1.23,1.27,1.39,2.91,1.0
6,CHEMBL4513220,ORGANISM,INHIBITION,%,64767,0,64767,-617.0,-208.0,-153.0,-129.0,-91.9,-52.5,-17.9,9.834,35.2,76.4,95.8,1.0
7,CHEMBL4649964,ORGANISM,PERCENTEFFECT,%,37614,0,37531,-657.4,-210.505,-154.894,-131.647,-96.85,-58.435,-23.73,2.565,23.83,74.125,94.9,1.0
8,CHEMBL4649945,ORGANISM,PERCENTEFFECT,%,37095,0,33697,-3952.56,-41.811,-21.203,-15.2,-7.01,0.32,6.684,12.79,18.16,63.951,118.25,1.0
9,CHEMBL1054502,ORGANISM,INHIBITION,%,13533,0,13467,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,6.0,7.0,10.0,1.0


In [6]:
# Count repetitions of activity_type, unit
COUNTS = PERCENTILES.groupby(["activity_type", "unit"]).size().reset_index(name="count").sort_values('count', ascending=False).reset_index(drop=True)
SUMMARY = []

# For each pair activity_type, unit
for activity_type, unit in zip(COUNTS['activity_type'], COUNTS['unit']):

    # Get direction
    direction = directions[(activity_type, unit)]

    # Get data per assay
    if type(unit) == str:
        df = PERCENTILES[(PERCENTILES['activity_type'] == activity_type) & (PERCENTILES['unit'] == unit)]
    else:
        df = PERCENTILES[(PERCENTILES['activity_type'] == activity_type) & (PERCENTILES['unit'].isna())]

    # Get only specific thresholds
    df = df[["min", "p1", "p5", "p10", "p90", "p95", "p99", "max"]]
    summary = []
    for i,j,k in zip(df.quantile(0.1, axis=0), df.quantile(0.5, axis=0), df.quantile(0.9, axis=0)):
        i,j,k = str(round(i, 3)), str(round(j, 3)), str(round(k, 3))
        summary.append(" | ".join([i,j,k]))
    
    # Append direction and store summary
    summary.append(direction)
    SUMMARY.append(summary)

# Concatenate with counts
SUMMARY = pd.DataFrame(SUMMARY, columns=["min", "p1", "p5", "p10", "p90", "p95", "p99", "max", "direction"])
COUNTS = pd.concat([COUNTS, SUMMARY], axis=1)

# Save results
COUNTS.to_csv(os.path.join(root, "..", "output", pathogen_code, 'stats_activity_ranges.csv'), index=False)

In [56]:
SMILES = ['NNC(=O)c1ccncc1' , # Isonaizid
          "CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(O)c(c(/C=N/N5CCN(C)CC5)c(O)c4c3C2=O)NC(=O)/C(C)=C\C=C\[C@H](C)[C@H](O)[C@@H](C)[C@@H](O)[C@@H](C)[C@H](OC(C)=O)[C@@H]1C",  # Rifampin
          "NC(=O)c1cnccn1",  # Pyrazinamide
          "CC[C@@H](CO)NCCN[C@@H](CC)CO"]  # Ethambutol

a, b = "INHIBITION", "%"

for smi in SMILES:

        values = ChEMBL[(ChEMBL['canonical_smiles'] == smi) & (ChEMBL['activity_type'] == a) & 
                        (ChEMBL['unit'] == b) & (ChEMBL['value'].isna() == False) & ((ChEMBL['relation'] == "=") | (ChEMBL["relation"] == '>'))]['value'].tolist()

        print(f"Number of activities: {len(values)}")
        print(np.percentile(values, 10))
        print(np.percentile(values, 50))
        print(np.percentile(values, 90))

Number of activities: 72
64.47
99.0
100.0
Number of activities: 76
65.5
98.0
100.0
Number of activities: 9
30.8
99.0
100.0
Number of activities: 14
18.6
89.5
99.0
