In [1]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed ChEMBL activities table from disk.
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv("../config/chembl_processed/activities_preprocessed.csv", low_memory=False)
print(f"Original size: {len(ChEMBL)}")

# Keep only the rows whose 'value' is present. `notna()` expresses this directly
# instead of comparing the boolean mask against False.
print("Filtering out nan values...")
ChEMBL = ChEMBL[ChEMBL['value'].notna()].reset_index(drop=True)
print(f"Size after filtering nan values: {len(ChEMBL)}")

Loading ChEMBL preprocessed data...
Original size: 24267312
Filtering out nan values...
Size after filtering nan values: 20911360


In [14]:
# Get mtb data
pathogen = "Mycobacterium tuberculosis"

# Case-insensitive *literal* substring match on either organism column.
# regex=False stops the name from being interpreted as a regular expression
# (organism names may contain characters such as parentheses); na=False makes
# rows with a missing organism count as non-matching.
in_target_organism = ChEMBL['target_organism'].str.contains(pathogen, case=False, na=False, regex=False)
in_assay_organism = ChEMBL['assay_organism'].str.contains(pathogen, case=False, na=False, regex=False)
ChEMBL = ChEMBL[in_target_organism | in_assay_organism].reset_index(drop=True)

print(f"Number of activities: {len(ChEMBL)}")

# Number of activities per target organism, most frequent first.
# most_common() sorts by count in descending order and keeps first-seen order for ties.
organism_counts = Counter(ChEMBL['target_organism'])
df = pd.DataFrame(organism_counts.most_common(), columns=['organism', 'count'])
df

Number of activities: 706206


Unnamed: 0,organism,count
0,Mycobacterium tuberculosis (strain ATCC 25618 ...,280931
1,Mycobacterium tuberculosis,215854
2,,205593
3,Mycobacterium tuberculosis variant bovis,1754
4,Mycobacterium tuberculosis variant bovis BCG,1564
5,Mycobacterium tuberculosis H37Rv,378
6,Mycobacterium tuberculosis (strain CDC 1551 / ...,99
7,Mycobacterium tuberculosis variant microti,21
8,Mycobacterium bovis (strain BCG / Pasteur 1173P2),8
9,Mycobacterium bovis,2


In [15]:
# # Get activities data
# df = pd.read_csv("../config/chembl_activities/activities.csv", low_memory=False)

# # Assay id to doc id
# assayid_to_docid = {i: j for i,j in zip(df['assay_id'], df['doc_id'])}

# # Load dict target
# df = pd.read_csv("../config/chembl_activities/target_dictionary.csv", low_memory=False)

# # ChEMBL ID to name
# target_chemblid_to_name = {i: j for i,j in zip(df['chembl_id'], df['pref_name'])}

In [16]:
# Helper function - is there only a single value?
def only_one(values, name):
    """Return the single element of `values`.

    Parameters
    ----------
    values : sized iterable (list, set, tuple, ...)
        Collection expected to hold exactly one element.
    name : str
        Label of the field being checked; only used in the error message.

    Returns
    -------
    The sole element of `values`.

    Raises
    ------
    ValueError
        If `values` does not contain exactly one element.
    """
    if len(values) != 1:
        raise ValueError(f"Expected exactly one {name}, found {values}")
    # next(iter(...)) rather than values[0]: also works for collections that
    # support len() but not positional indexing with 0 (e.g. sets).
    return next(iter(values))

In [54]:
# Distinct assay ChEMBL identifiers, in sorted order.
assays = sorted(ChEMBL['assay_chembl_id'].unique())

In [60]:
# Columns that must hold one single value within an assay, in the order they are checked.
ASSAY_LEVEL_COLUMNS = ["assay_type", "target_type", "target_chembl_id", "target_organism", "assay_organism"]

# Column layout of the resulting summary table.
ASSAYS_INFO_COLUMNS = ["assay_id", "assay_type", "assay_organism", "target_type", "target_chembl_id",
                       "target_organism", "activity_type", "unit", "activities", "cpds"]

# One record per (assay, activity type, unit) combination.
assay_records = []

# For each assay. groupby splits the table in a single pass (keys visited in sorted
# order) instead of building a boolean mask over the whole frame for every assay.
for assay, df_ in tqdm(ChEMBL.groupby("assay_chembl_id", sort=True)):

    # Check coherence: each assay-level field must take exactly one value
    # (a missing value counts as a value); only_one raises ValueError otherwise.
    assay_level = {
        column: only_one(df_[column].unique().tolist(), column)
        for column in ASSAY_LEVEL_COLUMNS
    }

    # For each activity type, then for each unit within it.
    # dropna=False keeps missing activity types / units as their own group
    # (key is NaN) instead of silently dropping those rows.
    for activity_type, df__ in df_.groupby("activity_type", sort=True, dropna=False):
        for unit, df___ in df__.groupby("unit", sort=True, dropna=False):
            activities = len(df___)
            cpds = len(set(df___['compound_chembl_id']))
            assay_records.append([
                assay,
                assay_level["assay_type"],
                assay_level["assay_organism"],
                assay_level["target_type"],
                assay_level["target_chembl_id"],
                assay_level["target_organism"],
                activity_type,
                unit,
                activities,
                cpds,
            ])

# Largest compound sets first. A stable sort keeps the (assay, activity type, unit)
# order among rows with the same number of compounds, so reruns give the same table.
ASSAYS_INFO = (
    pd.DataFrame(assay_records, columns=ASSAYS_INFO_COLUMNS)
    .sort_values('cpds', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)

# # Load assay descriptions
# assay_descriptions = pd.read_csv("../config/chembl_activities/assay_descriptions.csv", low_memory=False)
# assay_descriptions = {i: j for i,j in zip(assay_descriptions['chembl_id'], assay_descriptions['description'])}
# ASSAYS_INFO['description'] = [assay_descriptions[i] for i in ASSAYS_INFO['assay_id']]

100%|██████████| 10901/10901 [04:55<00:00, 36.84it/s]


In [62]:
# Preview the first 20 rows of the summary table.
ASSAYS_INFO.head(20)

Unnamed: 0,assay_id,assay_type,assay_organism,target_type,target_chembl_id,target_organism,activity_type,unit,activities,cpds
0,CHEMBL4649948,F,Mycobacterium tuberculosis,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,93556,86590
1,CHEMBL4649949,F,Mycobacterium tuberculosis,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,101516,86576
2,CHEMBL4649971,F,Mycobacterium tuberculosis,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,PERCENTEFFECT,%,68620,68614
3,CHEMBL4649972,F,Mycobacterium tuberculosis,PROTEIN COMPLEX,CHEMBL4662931,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,68617,68611
4,CHEMBL4649941,F,Mycobacterium tuberculosis,SINGLE PROTEIN,CHEMBL4662928,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,67382,66942
5,CHEMBL4649965,F,Mycobacterium tuberculosis,SINGLE PROTEIN,CHEMBL4105939,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,66598,66592
6,CHEMBL4649957,F,Mycobacterium tuberculosis,SINGLE PROTEIN,CHEMBL4662922,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,65034,65028
7,CHEMBL4649961,F,Mycobacterium tuberculosis,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,PERCENTEFFECT,%,53171,53166
8,CHEMBL4649947,F,Mycobacterium tuberculosis,SINGLE PROTEIN,CHEMBL4662921,Mycobacterium tuberculosis (strain ATCC 25618 ...,PERCENTEFFECT,%,8841,8841
9,CHEMBL4649949,F,Mycobacterium tuberculosis,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,2468,2468


In [None]:
# # Assay chembl id to assay id
# assay_chemblid_to_assayid = {i: j for i,j in zip(ChEMBL['assay_chembl_id'], ChEMBL['assay_id'])}
# ASSAYS_INFO['target_pref_name'] = [target_chemblid_to_name[i] for i in ASSAYS_INFO['target_chembl_id']]
# ASSAYS_INFO['doc_id'] = [assayid_to_docid[assay_chemblid_to_assayid[i]] for i in ASSAYS_INFO['assay_id']]

In [46]:
# Summary figures over the per-assay table.
n_target_organisms = len(set(ASSAYS_INFO['target_organism']))
n_assays = len(set(ASSAYS_INFO['assay_id']))
assay_type_counts = Counter(ASSAYS_INFO['assay_type'])
target_type_counts = Counter(ASSAYS_INFO['target_type'])
# Distinct (activity type, unit) combinations.
type_unit_pairs = set(zip(ASSAYS_INFO['activity_type'], ASSAYS_INFO['unit']))

print("Number of target organisms:", n_target_organisms)
print("Number of assays:", n_assays)
print("Assay types:", assay_type_counts)
print("Target types:", target_type_counts)
print(f"Act type - unit pairs: {len(type_unit_pairs)}")

Number of target organisms: 1
Number of assays: 1
Assay types: Counter({'F': 1})
Target types: Counter({'ORGANISM': 1})
Act type - unit pairs: 1


In [None]:
# Number of distinct target ChEMBL identifiers in the summary table.
len(set(ASSAYS_INFO['target_chembl_id']))