In [1]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Read the preprocessed ChEMBL activities table, then discard every record
# that has no measured value.
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv(
    "../config/chembl_processed/activities_preprocessed.csv",
    low_memory=False,
)
print(f"Original size: {len(ChEMBL)}")

print("Filtering out nan values...")
ChEMBL = ChEMBL.loc[ChEMBL['value'].notna()].reset_index(drop=True)
print(f"Size after filtering nan values: {len(ChEMBL)}")

Loading ChEMBL preprocessed data...
Original size: 24267312
Filtering out nan values...
Size after filtering nan values: 20911360


In [4]:
# Restrict the table to M. tuberculosis: a record is kept when the pathogen
# name appears (case-insensitively) in either the target or the assay organism.
pathogen = "Mycobacterium tuberculosis"
in_target_organism = ChEMBL['target_organism'].str.contains(pathogen, case=False, na=False)
in_assay_organism = ChEMBL['assay_organism'].str.contains(pathogen, case=False, na=False)
ChEMBL = ChEMBL[in_target_organism | in_assay_organism].reset_index(drop=True)

print(f"Number of activities: {len(ChEMBL)}")

# Activities per target organism, most frequent first (missing organisms
# are tallied under their NaN key).
df = pd.DataFrame(
    Counter(ChEMBL['target_organism']).most_common(),
    columns=['organism', 'count'],
)
df

Number of activities: 706206


Unnamed: 0,organism,count
0,Mycobacterium tuberculosis (strain ATCC 25618 ...,280931
1,Mycobacterium tuberculosis,215854
2,,205593
3,Mycobacterium tuberculosis variant bovis,1754
4,Mycobacterium tuberculosis variant bovis BCG,1564
5,Mycobacterium tuberculosis H37Rv,378
6,Mycobacterium tuberculosis (strain CDC 1551 / ...,99
7,Mycobacterium tuberculosis variant microti,21
8,Mycobacterium bovis (strain BCG / Pasteur 1173P2),8
9,Mycobacterium bovis,2


In [5]:
# Tally the remaining activities by their assay type code
Counter(ChEMBL['assay_type'].tolist())

Counter({'F': 692556, 'B': 13552, 'A': 89, 'U': 8, 'T': 1})

In [6]:
# # Get activities data
# df = pd.read_csv("../config/chembl_activities/activities.csv", low_memory=False)

# # Assay id to doc id
# assayid_to_docid = {i: j for i,j in zip(df['assay_id'], df['doc_id'])}

# # Load dict target
# df = pd.read_csv("../config/chembl_activities/target_dictionary.csv", low_memory=False)

# # ChEMBL ID to name
# target_chemblid_to_name = {i: j for i,j in zip(df['chembl_id'], df['pref_name'])}

In [7]:
# Helper function - is there only a single value?
def only_one(values, name):
    """Return the sole element of ``values``, or raise if there is not exactly one.

    Parameters
    ----------
    values : iterable
        Candidate values. Any iterable is accepted (list, tuple, set,
        generator, pandas Series, ...); it is materialised into a list so that
        the result does not depend on the container supporting ``values[0]``.
    name : str
        Human-readable name of the field, used only in the error message.

    Returns
    -------
    The single element contained in ``values``.

    Raises
    ------
    ValueError
        If ``values`` is empty or holds more than one element.
    """
    items = list(values)
    if len(items) != 1:
        raise ValueError(f"Expected exactly one {name}, found {items}")
    return items[0]

In [8]:
# Sorted list of the distinct assay ChEMBL ids present in the filtered table
assays = sorted(ChEMBL['assay_chembl_id'].unique())

In [9]:
# Build ASSAYS_INFO: one row per (assay, activity type, unit) combination with
# the assay-level annotations plus the number of activities and of distinct
# compounds in that combination.
assay_info_columns = ["assay_id", "assay_type", "assay_organism", "target_type", "target_chembl_id",
                      "target_organism", "activity_type", "unit", "activities", "cpds"]
assay_info_rows = []

# Split the table once by assay instead of re-scanning the whole frame with a
# boolean mask for every assay id. sort=True visits the assays in sorted id
# order; grouping the table directly also means this cell does not depend on
# any separately computed list of assay ids.
assay_groups = ChEMBL.groupby("assay_chembl_id", sort=True)

# For each assay
for assay, df_ in tqdm(assay_groups, total=assay_groups.ngroups):

    # Check coherence: every record of an assay must agree on these
    # annotations. only_one raises ValueError otherwise. unique() collapses
    # repeated missing values into a single entry.
    assay_type = only_one(df_['assay_type'].unique().tolist(), "assay_type")
    target_type = only_one(df_['target_type'].unique().tolist(), "target_type")
    target_chembl_id = only_one(df_['target_chembl_id'].unique().tolist(), "target_chembl_id")
    target_organism = only_one(df_['target_organism'].unique().tolist(), "target_organism")
    assay_organism = only_one(df_['assay_organism'].unique().tolist(), "assay_organism")

    # For each activity type / unit pair. dropna=False keeps records whose
    # unit is missing as a group of their own (unit is then NaN in the row).
    # NOTE(review): a missing activity_type would also form its own group here
    # rather than raising - confirm that column is never missing.
    for (activity_type, unit), df___ in df_.groupby(["activity_type", "unit"], dropna=False, sort=False):
        activities = len(df___)
        # dropna=False counts a missing compound id as one distinct value,
        # matching len(set(...)) semantics.
        cpds = df___['compound_chembl_id'].nunique(dropna=False)
        assay_info_rows.append([assay, assay_type, assay_organism, target_type, target_chembl_id,
                                target_organism, activity_type, unit, activities, cpds])

# Largest groups (by number of distinct compounds) first
ASSAYS_INFO = pd.DataFrame(assay_info_rows, columns=assay_info_columns)
ASSAYS_INFO = ASSAYS_INFO.sort_values('cpds', ascending=False).reset_index(drop=True)

# # Load assay descriptions
# assay_descriptions = pd.read_csv("../config/chembl_activities/assay_descriptions.csv", low_memory=False)
# assay_descriptions = {i: j for i,j in zip(assay_descriptions['chembl_id'], assay_descriptions['description'])}
# ASSAYS_INFO['description'] = [assay_descriptions[i] for i in ASSAYS_INFO['assay_id']]

100%|██████████| 10901/10901 [04:58<00:00, 36.58it/s]


In [10]:
# Keep only the assay / activity type / unit groups that cover more than 100 distinct compounds
ASSAYS_INFO = ASSAYS_INFO.loc[ASSAYS_INFO['cpds'].gt(100)].reset_index(drop=True)

In [11]:
# Display the assay summary table
ASSAYS_INFO

Unnamed: 0,assay_id,assay_type,assay_organism,target_type,target_chembl_id,target_organism,activity_type,unit,activities,cpds
0,CHEMBL4649948,F,Mycobacterium tuberculosis,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,93556,86590
1,CHEMBL4649949,F,Mycobacterium tuberculosis,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,101516,86576
2,CHEMBL4649971,F,Mycobacterium tuberculosis,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,PERCENTEFFECT,%,68620,68614
3,CHEMBL4649972,F,Mycobacterium tuberculosis,PROTEIN COMPLEX,CHEMBL4662931,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,68617,68611
4,CHEMBL4649941,F,Mycobacterium tuberculosis,SINGLE PROTEIN,CHEMBL4662928,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,67382,66942
...,...,...,...,...,...,...,...,...,...,...
62,CHEMBL4011046,F,Mycobacterium tuberculosis,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,GI,%,103,103
63,CHEMBL1115630,F,Mycobacterium tuberculosis,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,MIC,umol.L-1,103,103
64,CHEMBL4011010,F,Mycobacterium tuberculosis variant bovis BCG,ORGANISM,CHEMBL613086,Mycobacterium tuberculosis variant bovis,MIC90,umol.L-1,103,103
65,CHEMBL1115629,F,Mycobacterium tuberculosis,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,MIC,umol.L-1,103,103


In [42]:
# Load the ChEMBL assay and document tables used to annotate each assay.
# NOTE(review): this rebinds `assays`, which an earlier cell bound to a sorted
# list of assay ChEMBL ids; any code expecting that list must run before this cell.
assays = pd.read_csv("../config/chembl_activities/assays.csv", low_memory=False)
docs = pd.read_csv("../config/chembl_activities/docs.csv", low_memory=False)

In [43]:
# Inspect the column names of the assay and document tables
assays.columns, docs.columns

(Index(['assay_id', 'doc_id', 'description', 'assay_type', 'assay_test_type',
        'assay_category', 'assay_organism', 'assay_tax_id', 'assay_strain',
        'assay_tissue', 'assay_cell_type', 'assay_subcellular_fraction', 'tid',
        'relationship_type', 'confidence_score', 'curated_by', 'src_id',
        'src_assay_id', 'chembl_id', 'cell_id', 'bao_format', 'tissue_id',
        'variant_id', 'aidx', 'assay_group'],
       dtype='object'),
 Index(['doc_id', 'journal', 'year', 'volume', 'issue', 'first_page',
        'last_page', 'pubmed_id', 'doi', 'chembl_id', 'title', 'doc_type',
        'authors', 'abstract', 'patent_id', 'ridx', 'src_id',
        'chembl_release_id', 'contact'],
       dtype='object'))

In [33]:
# Readable label for each single-letter assay type code
assay_type_map = {
    "F": "Functional",
    "B": "Binding",
    "T": "Toxicity",
    "A": "ADME",
    "P": "Physicochemical",
    "U": "Uncategorized",
}

In [54]:
# List the column names of the document table
docs.columns

Index(['doc_id', 'journal', 'year', 'volume', 'issue', 'first_page',
       'last_page', 'pubmed_id', 'doi', 'chembl_id', 'title', 'doc_type',
       'authors', 'abstract', 'patent_id', 'ridx', 'src_id',
       'chembl_release_id', 'contact'],
      dtype='object')

In [72]:
# Print a text report for an assay summary row: assay and source-document
# metadata, target information, and the distribution of its activity values.
report_columns = ['assay_type', 'assay_organism', 'target_type', 'target_organism',
                  'activity_type', 'unit', 'activities', 'cpds', 'assay_id']

for (assay_type, assay_organism, target_type, target_organism,
     activity_type, unit, n_activities, n_cpds, assay_id) in ASSAYS_INFO[report_columns].values:

    # Look the assay and its source document up once and reuse the matches.
    assay_record = assays[assays['chembl_id'] == assay_id]
    doc_id = assay_record['doc_id'].tolist()[0]
    doc_record = docs[docs['doc_id'] == doc_id]

    print(f"Assay ID: {assay_id}")
    # Fall back to the raw code when it has no entry in assay_type_map
    print(f"Assay type: {assay_type_map.get(assay_type, assay_type)}")
    print(f"Assay Organism: {assay_organism}")
    print(f"Assay description: {assay_record['description'].tolist()[0]}")

    print(f"Document title: {doc_record['title'].tolist()[0]}")
    print(f"Document abstract: {doc_record['abstract'].tolist()[0]}")
    print(f"Document journal: {doc_record['journal'].tolist()[0]}")
    print(f"Document PubMed ID: {doc_record['pubmed_id'].tolist()[0]}")
    print(f"Document DOI: {doc_record['doi'].tolist()[0]}")

    print(f"Target type: {target_type}")
    print(f"Target Organism: {target_organism}")

    print(f"Activity Type: {activity_type}")
    print(f"Unit: {unit}")
    print(f"Number of activities: {n_activities}")
    print(f"Number of compounds: {n_cpds}")

    # A missing unit never compares equal to itself, so select those records
    # with isna() instead of ==.
    if pd.isna(unit):
        unit_mask = ChEMBL['unit'].isna()
    else:
        unit_mask = ChEMBL['unit'] == unit
    selection = (ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & unit_mask

    # Only read from ChEMBL here: the full table must stay intact for later
    # iterations and for the other cells.
    assay_activities = ChEMBL.loc[selection, 'value'].astype(float).tolist()

    p1, p25, p50, p75, p99 = np.percentile(assay_activities, [1, 25, 50, 75, 99])
    print(f"Percentile 1: {round(p1, 3)}")
    print(f"Percentile 25: {round(p25, 3)}")
    print(f"Mean: {round(np.mean(assay_activities), 3)}")
    print(f"Median: {round(p50, 3)}")
    print(f"Percentile 75: {round(p75, 3)}")
    print(f"Percentile 99: {round(p99, 3)}")

    # Only the first row of ASSAYS_INFO is reported; remove this break to
    # print the report for every row.
    break

Assay ID: CHEMBL4649948
Assay type: Functional
Assay Organism: Mycobacterium tuberculosis
Assay description: Phenotypic growth assay for Mycobacterium tuberculosis grown for 4 days on DPPC, cholesterol, tyloxapol based media
Document title: University of Dundee, Small-Polar-MMV Screening Library
Document abstract: nan
Document journal: nan
Document PubMed ID: nan
Document DOI: 10.6019/CHEMBL3988442
Target type: UNCHECKED
Target Organism: nan
Activity Type: PERCENTEFFECT
Unit: %
Number of activities: 93556
Number of compounds: 86590
Percentile 1: -40.22
Percentile 25: -10.67
Mean: -1.267
Median: -1.728
Percentile 75: 7.254
Percentile 99: 57.08


In [73]:
# Show the doc_id left behind by the assay report loop (document of the last assay it processed)
doc_id

103486

In [None]:
# # Assay chembl id to assay id
# assay_chemblid_to_assayid = {i: j for i,j in zip(ChEMBL['assay_chembl_id'], ChEMBL['assay_id'])}
# ASSAYS_INFO['target_pref_name'] = [target_chemblid_to_name[i] for i in ASSAYS_INFO['target_chembl_id']]
# ASSAYS_INFO['doc_id'] = [assayid_to_docid[assay_chemblid_to_assayid[i]] for i in ASSAYS_INFO['assay_id']]

In [None]:
# Headline numbers for the retained assay summary table
n_target_organisms = len(set(ASSAYS_INFO['target_organism']))
n_assays = len(set(ASSAYS_INFO['assay_id']))
assay_type_counts = Counter(ASSAYS_INFO['assay_type'])
target_type_counts = Counter(ASSAYS_INFO['target_type'])
# Distinct (activity type, unit) combinations
n_act_unit_pairs = len(set(zip(ASSAYS_INFO['activity_type'], ASSAYS_INFO['unit'])))

print("Number of target organisms:", n_target_organisms)
print("Number of assays:", n_assays)
print("Assay types:", assay_type_counts)
print("Target types:", target_type_counts)
print(f"Act type - unit pairs: {n_act_unit_pairs}")

In [None]:
# Number of distinct target ChEMBL ids among the retained assays
len(set(ASSAYS_INFO['target_chembl_id']))