In [16]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed ChEMBL activities table and keep only the rows
# whose 'value' column is populated.
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv("../config/chembl_processed/activities_preprocessed.csv", low_memory=False)
print(f"Original size: {len(ChEMBL)}")
print("Filtering out nan values...")
# notna() yields the keep-mask directly; comparing isna() against False hides the intent.
ChEMBL = ChEMBL[ChEMBL['value'].notna()].reset_index(drop=True)
print(f"Size after filtering nan values: {len(ChEMBL)}")

Loading ChEMBL preprocessed data...
Original size: 24267312
Filtering out nan values...
Size after filtering nan values: 20911360


In [3]:
# Count rows per target organism, most frequent first (ties keep first-seen order),
# then display only the organisms whose name mentions M. tuberculosis.
organism_counts = Counter(ChEMBL['target_organism']).most_common()
df = pd.DataFrame(organism_counts, columns=['organism', 'count'])
mentions_mtb = df['organism'].str.contains("mycobacterium tuberculosis", case=False, na=False)
df[mentions_mtb]

Unnamed: 0,organism,count
5,Mycobacterium tuberculosis (strain ATCC 25618 ...,280931
7,Mycobacterium tuberculosis,215854
144,Mycobacterium tuberculosis variant bovis,1754
153,Mycobacterium tuberculosis variant bovis BCG,1564
386,Mycobacterium tuberculosis H37Rv,378
817,Mycobacterium tuberculosis (strain CDC 1551 / ...,99
1531,Mycobacterium tuberculosis variant microti,21
2593,Mycobacterium tuberculosis (strain ATCC 25177 ...,1


In [4]:
# Restrict the table to M. tuberculosis targets: case-insensitive substring match
# on the organism name; rows with a missing organism are treated as non-matching.
is_mtb = ChEMBL['target_organism'].str.contains("mycobacterium tuberculosis", case=False, na=False)
ChEMBL = ChEMBL.loc[is_mtb].reset_index(drop=True)

In [5]:
# Helper function - is there only a single value?
def only_one(values, name):
    """Return the single element of ``values`` or fail loudly.

    Parameters
    ----------
    values : iterable
        Candidate values. Lists and tuples are used as-is; any other iterable
        (set, iterator, pandas Series, numpy array, ...) is materialised into a
        list first, so positional access never depends on the container's own
        indexing rules.
    name : str
        Human-readable label of the field, used only in the error message.

    Returns
    -------
    object
        The one and only element of ``values``.

    Raises
    ------
    ValueError
        If ``values`` holds zero elements or more than one element.
    """
    # Sets and iterators are not subscriptable, and a Series indexes by label,
    # so ``values[0]`` is only reliable on plain sequences.
    items = values if isinstance(values, (list, tuple)) else list(values)
    if len(items) != 1:
        raise ValueError(f"Expected exactly one {name}, found {items}")
    return items[0]

In [6]:
# Distinct target-organism labels present in the table, in ascending order
strains = list(set(ChEMBL['target_organism']))
strains.sort()

In [7]:
# Number of distinct assays recorded under the unqualified "Mycobacterium tuberculosis" label
is_unqualified_mtb = ChEMBL['target_organism'] == "Mycobacterium tuberculosis"
len(set(ChEMBL.loc[is_unqualified_mtb, 'assay_chembl_id']))

9563

In [29]:
# Build one summary row per (strain, assay, activity_type, unit) combination.
# Each row carries the assay-level annotations (assay_type, target_type,
# target_chembl_id) plus "cpds", the number of activity records in that combination.
assay_rows = []

for strain in strains:

    print(strain)
    # Get subset of strain data
    strain_df = ChEMBL[ChEMBL['target_organism'] == strain]

    # groupby drops NaN keys silently; fail loudly instead of losing records.
    if strain_df['assay_chembl_id'].isna().any():
        raise ValueError(f"Missing assay_chembl_id for strain {strain}")

    # Split the strain subset by assay in a single pass rather than re-masking
    # the whole subset once per assay. Groups are yielded in sorted assay order.
    by_assay = strain_df.groupby('assay_chembl_id')

    # For each assay
    for assay, assay_df in tqdm(by_assay, total=by_assay.ngroups):

        # Check coherence: these annotations must be constant within an assay.
        # Series.unique() collapses repeated NaNs, which set() does not guarantee.
        assay_type = only_one(assay_df['assay_type'].unique().tolist(), "assay_type")
        target_type = only_one(assay_df['target_type'].unique().tolist(), "target_type")
        target_chembl_id = only_one(assay_df['target_chembl_id'].unique().tolist(), "target_chembl_id")

        # For each activity type (a missing activity type forms its own group)
        activity_col = assay_df['activity_type']
        for activity_type in activity_col.unique():

            # NaN never compares equal to itself, so missing values need isna()
            if pd.isna(activity_type):
                activity_df = assay_df[activity_col.isna()]
            else:
                activity_df = assay_df[activity_col == activity_type]

            # For each unit within this activity type (missing unit is its own group)
            unit_col = activity_df['unit']
            for unit in unit_col.unique():
                unit_mask = unit_col.isna() if pd.isna(unit) else unit_col == unit
                cpds = int(unit_mask.sum())
                assay_rows.append([strain, assay, assay_type, target_type, target_chembl_id, activity_type, unit, cpds])

ASSAYS_INFO = pd.DataFrame(assay_rows, columns=["strain", "assay", "assay_type", "target_type", "target_chembl_id", "activity_type", "unit", "cpds"])
# Largest groups first; a stable sort keeps ties in the deterministic order built
# above, so the table is reproducible from run to run.
ASSAYS_INFO = ASSAYS_INFO.sort_values('cpds', ascending=False, kind='mergesort').reset_index(drop=True)

Mycobacterium tuberculosis


100%|██████████| 9563/9563 [01:23<00:00, 114.24it/s]


Mycobacterium tuberculosis (strain ATCC 25177 / H37Ra)


100%|██████████| 1/1 [00:00<00:00, 1457.37it/s]


Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)


100%|██████████| 192/192 [00:02<00:00, 82.94it/s]


Mycobacterium tuberculosis (strain CDC 1551 / Oshkosh)


100%|██████████| 11/11 [00:00<00:00, 1857.38it/s]


Mycobacterium tuberculosis H37Rv


100%|██████████| 15/15 [00:00<00:00, 2066.36it/s]


Mycobacterium tuberculosis variant bovis


100%|██████████| 225/225 [00:00<00:00, 2052.24it/s]


Mycobacterium tuberculosis variant bovis BCG


100%|██████████| 97/97 [00:00<00:00, 2000.04it/s]


Mycobacterium tuberculosis variant microti


100%|██████████| 1/1 [00:00<00:00, 1371.58it/s]


In [68]:
# Show the 50 largest (strain, assay, activity_type, unit) groups
ASSAYS_INFO.head(50)

Unnamed: 0,strain,assay,assay_type,target_type,target_chembl_id,activity_type,unit,cpds
0,Mycobacterium tuberculosis,CHEMBL4649971,F,ORGANISM,CHEMBL360,PERCENTEFFECT,%,68620
1,Mycobacterium tuberculosis (strain ATCC 25618 ...,CHEMBL4649972,F,PROTEIN COMPLEX,CHEMBL4662931,PERCENTEFFECT,%,68617
2,Mycobacterium tuberculosis (strain ATCC 25618 ...,CHEMBL4649941,F,SINGLE PROTEIN,CHEMBL4662928,PERCENTEFFECT,%,67382
3,Mycobacterium tuberculosis (strain ATCC 25618 ...,CHEMBL4649965,F,SINGLE PROTEIN,CHEMBL4105939,PERCENTEFFECT,%,66598
4,Mycobacterium tuberculosis (strain ATCC 25618 ...,CHEMBL4649957,F,SINGLE PROTEIN,CHEMBL4662922,PERCENTEFFECT,%,65034
5,Mycobacterium tuberculosis,CHEMBL4649961,F,ORGANISM,CHEMBL360,PERCENTEFFECT,%,53171
6,Mycobacterium tuberculosis (strain ATCC 25618 ...,CHEMBL4649947,F,SINGLE PROTEIN,CHEMBL4662921,PERCENTEFFECT,%,8841
7,Mycobacterium tuberculosis,CHEMBL1794349,F,SINGLE PROTEIN,CHEMBL1741192,AC50,umol.L-1,2128
8,Mycobacterium tuberculosis,CHEMBL1794426,F,SINGLE PROTEIN,CHEMBL1741171,EC50,umol.L-1,2123
9,Mycobacterium tuberculosis,CHEMBL1794324,F,SINGLE PROTEIN,CHEMBL1741192,AC50,umol.L-1,2071


In [65]:
# Collect, for the first 10 rows of ASSAYS_INFO, the tuple of fields that
# identifies assays measuring the same thing (everything except the assay id).
merge_key_columns = ["strain", "assay_type", "target_chembl_id", "activity_type", "unit"]
MERGING = [tuple(key_row) for key_row in ASSAYS_INFO[:10][merge_key_columns].values]

In [67]:
# Tally how often each (strain, assay_type, target_chembl_id, activity_type, unit)
# key occurs in MERGING; a count above 1 means several assays share that key.
Counter(MERGING)

Counter({('Mycobacterium tuberculosis',
          'F',
          'CHEMBL360',
          'PERCENTEFFECT',
          '%'): 2,
         ('Mycobacterium tuberculosis',
          'F',
          'CHEMBL1741192',
          'AC50',
          'umol.L-1'): 2,
         ('Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)',
          'F',
          'CHEMBL4662931',
          'PERCENTEFFECT',
          '%'): 1,
         ('Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)',
          'F',
          'CHEMBL4662928',
          'PERCENTEFFECT',
          '%'): 1,
         ('Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)',
          'F',
          'CHEMBL4105939',
          'PERCENTEFFECT',
          '%'): 1,
         ('Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)',
          'F',
          'CHEMBL4662922',
          'PERCENTEFFECT',
          '%'): 1,
         ('Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)',
          'F',
          'CHEMBL4662921',
          'PERCE

In [50]:
# Key columns of the first 10 rows of ASSAYS_INFO, for comparison with the tally of MERGING
ASSAYS_INFO.head(10)[["strain", "assay_type", "target_chembl_id", "activity_type", "unit"]]

Unnamed: 0,strain,assay_type,target_chembl_id,activity_type,unit
0,Mycobacterium tuberculosis,F,CHEMBL360,PERCENTEFFECT,%
1,Mycobacterium tuberculosis (strain ATCC 25618 ...,F,CHEMBL4662931,PERCENTEFFECT,%
2,Mycobacterium tuberculosis (strain ATCC 25618 ...,F,CHEMBL4662928,PERCENTEFFECT,%
3,Mycobacterium tuberculosis (strain ATCC 25618 ...,F,CHEMBL4105939,PERCENTEFFECT,%
4,Mycobacterium tuberculosis (strain ATCC 25618 ...,F,CHEMBL4662922,PERCENTEFFECT,%
5,Mycobacterium tuberculosis,F,CHEMBL360,PERCENTEFFECT,%
6,Mycobacterium tuberculosis (strain ATCC 25618 ...,F,CHEMBL4662921,PERCENTEFFECT,%
7,Mycobacterium tuberculosis,F,CHEMBL1741192,AC50,umol.L-1
8,Mycobacterium tuberculosis,F,CHEMBL1741171,EC50,umol.L-1
9,Mycobacterium tuberculosis,F,CHEMBL1741192,AC50,umol.L-1
