In [None]:
from collections import defaultdict
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Loading ChEMBL preprocessed data
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv(os.path.join(DATAPATH, "chembl_processed", "activities_preprocessed.csv"), low_memory=False)
print(f"Original size: {len(ChEMBL)}")

# List of pathogens
# pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
#              "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
#              "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]
pathogens = ["Acinetobacter baumannii", "Mycobacterium tuberculosis", "Klebsiella pneumoniae"][1:2]

def get_pathogen_code(pathogen):
    return str(pathogen.split()[0][0] + pathogen.split()[1]).lower() if len(pathogen.split()) > 1 else pathogen.lower()

Loading ChEMBL preprocessed data...
Original size: 24040987


In [18]:
def get_pathogen_data(ChEMBL, pathogen, manual_assays):
    ChEMBL_pathogen = ChEMBL[ChEMBL['target_organism'].str.contains(pathogen, case=False, na=False) | 
                             ChEMBL['assay_organism'].str.contains(pathogen, case=False, na=False) |
                             ChEMBL['assay_chembl_id'].isin(manual_assays)].reset_index(drop=True)
    return ChEMBL_pathogen

def only_one(values, name):
    if len(values) != 1:
        raise ValueError(f"Expected exactly one {name}, found {values}")
    return values[0]

In [24]:
# For each pathogen
for pathogen in pathogens[:1]:

    # Define variables
    print(f"Filtering for pathogen: {pathogen}...")
    pathogen_code = get_pathogen_code(pathogen)
    PATH_TO_OUTPUT = os.path.join(root, "..", "output", pathogen_code)
    os.makedirs(PATH_TO_OUTPUT, exist_ok=True)

    # Read manual assays provided by the user
    manual_assays = open(os.path.join(CONFIGPATH, 'assays', f"{pathogen_code}.csv"), "r").read()
    manual_assays = set([j for i in manual_assays.split("\n") for j in i.split(",")])

    # Get pathogen data
    ChEMBL_pathogen = get_pathogen_data(ChEMBL, pathogen, manual_assays)
    ChEMBL_pathogen.to_csv(os.path.join(PATH_TO_OUTPUT, f"{pathogen_code}_ChEMBL_raw_data.csv.gz"), index=False)
    print(f"Number of activities: {len(ChEMBL_pathogen)}")
    print(f"Number of unique compounds: {len(set(ChEMBL_pathogen['compound_chembl_id']))}")

    # Get organism metadata
    df = dict(Counter(ChEMBL_pathogen['target_organism']))
    df = pd.DataFrame([[i, df[i]] for i in sorted(df, key = lambda x: df[x], reverse=True)], columns=['organism', 'count'])
    df.to_csv(os.path.join(PATH_TO_OUTPUT, "target_organism_counts.csv"), index=False)

    # Get compound metadata and counts
    compound_info = pd.read_csv(os.path.join(DATAPATH, "chembl_processed", "compound_info.csv"), low_memory=False)
    ik_dict = dict(zip(compound_info["chembl_id"], compound_info["standard_inchi_key"]))
    pair_counts = ChEMBL_pathogen[['compound_chembl_id', 'canonical_smiles']].value_counts().reset_index(name='count')
    pair_counts["InChIKey"] = pair_counts["compound_chembl_id"].map(ik_dict)
    cols = ["compound_chembl_id", 'InChIKey', 'canonical_smiles', 'count']
    pair_counts[cols].to_csv(os.path.join(PATH_TO_OUTPUT, "compound_counts.csv.gz"), index=False)

    # Get unique assays
    assays = sorted(set(ChEMBL_pathogen['assay_chembl_id']))

Filtering for pathogen: Mycobacterium tuberculosis...
Number of activities: 727897
Number of unique compounds: 138543


In [25]:
# Get assay to index mapping
assay_to_idx = defaultdict(list)
for i, assay_id in enumerate(ChEMBL_pathogen["assay_chembl_id"].to_numpy()):
    assay_to_idx[assay_id].append(i)

ASSAYS_INFO = []
print("Collecting individual assay information...")

Collecting individual assay information...


In [26]:
# For each assay
for assay in tqdm(assays):

    # Get subset of assay data
    df_ = ChEMBL_pathogen.iloc[assay_to_idx[assay]]
    
    # Get values
    assay_type = list(set(df_['assay_type']))
    target_type = list(set(df_['target_type']))
    target_chembl_id = list(set(df_['target_chembl_id']))
    activity_types = list(set(df_['activity_type']))  # may be more than one
    target_organism = list(set(df_['target_organism']))
    assay_organism = list(set(df_['assay_organism']))
    doc_chembl_id = list(set(df_['doc_chembl_id']))

    # Check coherence
    assay_type = only_one(assay_type, "assay_type")
    target_type = only_one(target_type, "target_type")
    target_chembl_id = only_one(target_chembl_id, "target_chembl_id")
    target_organism = only_one(target_organism, "target_organism")
    assay_organism = only_one(assay_organism, "assay_organism")
    doc_chembl_id = only_one(doc_chembl_id, "doc_chembl_id")

    # For each activity type
    for act_type in activity_types:

        df__ = df_[df_["activity_type"] == act_type]
        activity_type = list(set(df__['activity_type']))
        activity_type = only_one(activity_type, 'activity_type')
        units = list(set(df__['unit']))

        for u in units:

            # If unit is nan
            if pd.isna(u):
                df___ = df__[df__["unit"].isna()]
            else:
                df___ = df__[df__["unit"] == u]

            # Get metadata for that assay
            unit = list(set(df___['unit']))
            unit = only_one(unit, "unit")
            activities = len(df___)
            cpds = len(set(df___['compound_chembl_id']))
            nan_values = len(df___[df___['value'].isna()])
            ASSAYS_INFO.append([assay, assay_type, assay_organism, doc_chembl_id, target_type, target_chembl_id, target_organism, activity_type, unit, activities, nan_values, cpds])

    break

ASSAYS_INFO = pd.DataFrame(ASSAYS_INFO, columns=["assay_id", "assay_type", "assay_organism", "doc_chembl_id", "target_type", "target_chembl_id", "target_organism", 
                                                    "activity_type", "unit", "activities", 'nan_values', "cpds"])

# Sort assays by compound count
ASSAYS_INFO = ASSAYS_INFO.sort_values('cpds', ascending=False).reset_index(drop=True)


# Save assays info
ASSAYS_INFO.to_csv(os.path.join(PATH_TO_OUTPUT, 'assays_raw.csv'), index=False)

  0%|          | 0/12433 [00:00<?, ?it/s]


In [28]:
df___

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,assay_organism,doc_chembl_id,tid,target_type,target_organism,...,MW,standardized_MW,pchembl,bao_endpoint,value,unit,activity_type,relation,pchembl_calculated,text_flag
11398,2444329,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,580.773,,,BAO_0002146,0.688737,umol.L-1,MIC,=,6.161946,0.0
11399,2444330,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,822.953,,,BAO_0002146,0.607568,umol.L-1,MIC,=,6.216405,0.0
11400,2444331,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,564.774,,,BAO_0002146,1.593558,umol.L-1,MIC,=,5.797632,0.0
11401,2444332,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,548.775,,,BAO_0002146,2.733361,umol.L-1,MIC,=,5.563303,0.0
11402,2444333,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,550.791,,,BAO_0002146,3.08647,umol.L-1,MIC,=,5.510538,0.0
11403,2444334,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,564.774,,,BAO_0002146,3.187116,umol.L-1,MIC,=,5.496602,0.0
11404,2444335,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,398.591,,,BAO_0002146,4.766791,umol.L-1,MIC,=,5.321774,0.0
11405,2444336,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,1163.562,,,BAO_0002146,1.71886,umol.L-1,MIC,=,5.76476,0.0
11406,2444337,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,580.773,,,BAO_0002146,4.476792,umol.L-1,MIC,=,5.349033,0.0
11407,2444338,481098,CHEMBL1000111,F,1,Mycobacterium tuberculosis,CHEMBL1148197,50309,ORGANISM,Mycobacterium tuberculosis,...,564.774,,,BAO_0002146,6.728355,umol.L-1,MIC,=,5.172091,0.0
