In [3]:
from rdkit.Chem import Descriptors
from collections import Counter
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [4]:
# Load the ChEMBL tables used by this notebook and print basic size statistics.
root = "."
chembl_dir = os.path.join(root, "..", "data", "chembl_activities")

# Columns of activities.csv that are kept for the analysis
columns = ['activity_id', 'assay_id', 'molregno','standard_relation', 'standard_value', 'standard_units', 'standard_type', 'activity_comment',
           'data_validity_comment', 'pchembl_value','standard_upper_value','standard_text_value', 'action_type']

# Load ChEMBL activities.
# usecols lets the parser skip every other column, so they are never materialised in memory.
# The trailing [columns] restores the order given above (usecols ignores element order).
activities = pd.read_csv(os.path.join(chembl_dir, "activities.csv"), usecols=columns, low_memory=False)[columns]

# Load assays
assays = pd.read_csv(os.path.join(chembl_dir, "assays.csv"), low_memory=False)

# Load targets
targets = pd.read_csv(os.path.join(chembl_dir, "target_dictionary.csv"), low_memory=True)

# Load compounds
compounds = pd.read_csv(os.path.join(root, "..", "data", "chembl_processed", "compound_info.csv"), low_memory=True)

# nunique(dropna=False) counts distinct values without a Python-level pass over every row;
# a missing value, if present, is counted once.
print(f"Number of activities: {activities['activity_id'].nunique(dropna=False)}")
print(f"Number of unique compounds in activities: {activities['molregno'].nunique(dropna=False)}")
print(f"Number of unique assays in activities: {activities['assay_id'].nunique(dropna=False)}")

Number of activities: 24267312
Number of unique compounds in activities: 2774266
Number of unique assays in activities: 1890531


In [16]:
# Tally how often each relation symbol ('=', '>', '<', ...) occurs; missing entries appear under the NaN key.
Counter(activities.loc[:, 'standard_relation'])

Counter({'=': 14873384,
         nan: 7405968,
         '>': 1576805,
         '<': 341490,
         '<=': 33282,
         '>=': 31999,
         '~': 3925,
         '>>': 435,
         '<<': 24})

In [5]:
# Row count of the assays table, followed by the number of assays per assay_type code.
print(assays.shape[0], Counter(assays.loc[:, 'assay_type']))

1890749 Counter({'F': 884978, 'B': 595857, 'A': 311430, 'T': 68178, 'P': 26618, 'U': 3688})


In [6]:
# Row count of the targets table, followed by the number of targets per target_type.
print(targets.shape[0], Counter(targets.loc[:, 'target_type']))

17803 Counter({'SINGLE PROTEIN': 10962, 'ORGANISM': 2383, 'CELL-LINE': 1997, 'PROTEIN COMPLEX': 645, 'PROTEIN-PROTEIN INTERACTION': 596, 'PROTEIN FAMILY': 428, 'TISSUE': 294, 'SELECTIVITY GROUP': 123, 'NUCLEIC-ACID': 112, 'PROTEIN COMPLEX GROUP': 66, 'SMALL MOLECULE': 41, 'CHIMERIC PROTEIN': 34, 'OLIGOSACCHARIDE': 22, 'UNKNOWN': 20, 'SUBCELLULAR': 18, 'MACROMOLECULE': 18, 'PROTEIN NUCLEIC-ACID COMPLEX': 11, 'LIPID': 11, 'METAL': 10, '3D CELL CULTURE': 6, 'PHENOTYPE': 2, 'NON-MOLECULAR': 1, 'ADMET': 1, 'UNCHECKED': 1, 'NO TARGET': 1})


In [9]:
# Number of rows in the compounds table.
print(compounds.shape[0])

2854815


In [10]:
# Frequency table of standard_type: one row per value, most frequent first,
# with the running share of all activity rows covered so far.
LABEL = 'standard_type'
print(len(set(activities[LABEL])))  # size of the set of values found in the column

# Cast to the pandas string dtype and replace missing entries with "" before counting
s = activities[[LABEL]].astype("string").fillna("")
out = (
    s.value_counts(dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)

total_count = out['count'].sum()
out['cumulative_prop'] = out['count'].cumsum().div(total_count).round(3)
out.head(100)

6449


Unnamed: 0,standard_type,count,cumulative_prop
0,Potency,4473542,0.184
1,IC50,3552865,0.331
2,GI50,2629061,0.439
3,Inhibition,1593108,0.505
4,Activity,1357448,0.561
...,...,...,...
95,CLH,13593,0.945
96,Cell Viability,13000,0.946
97,pKa,12890,0.946
98,%Inhib (Mean),12829,0.947


In [11]:
# Frequency table of standard_units: one row per unit, most frequent first,
# with the running share of all activity rows covered so far.
LABEL = 'standard_units'
print(len(set(activities[LABEL])))  # size of the set of values found in the column

# Missing units become "" so they are counted as their own (blank) category
s = activities[[LABEL]].astype("string").fillna("")
out = (
    s.value_counts(dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)

total_count = out['count'].sum()
out['cumulative_prop'] = out['count'].cumsum().div(total_count).round(3)
out.head(100)

2700


Unnamed: 0,standard_units,count,cumulative_prop
0,nM,12587576,0.519
1,%,5200597,0.733
2,,3354717,0.871
3,ug.mL-1,952582,0.911
4,s-1,828026,0.945
...,...,...,...
95,p.p.m.,769,0.996
96,M-1 min-1,742,0.996
97,%ID/g,739,0.996
98,m equiv,719,0.996


In [12]:
# Frequency table of (standard_type, standard_units) pairs, each pair encoded as the
# single string "<type> -- <units>"; a missing value is rendered as the text "nan".
LABEL = ['standard_type', 'standard_units']

# Series.map(str) applies str() to every element (NaN -> "nan") and the concatenation is
# element-wise, replacing a Python-level zip loop over every activity row.
# to_frame(0) keeps the column label 0 that the IC50 filter cell uses via out[0].
s = (activities[LABEL[0]].map(str) + " -- " + activities[LABEL[1]].map(str)).to_frame(0)
print(len(s))

out = (
    s.value_counts(dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(len(out))  # number of distinct type/unit combinations
out[:100]

24267312
14594


Unnamed: 0,0,count,cumulative_prop
0,Potency -- nM,4471741,0.184
1,IC50 -- nM,3276327,0.319
2,GI50 -- nM,2617582,0.427
3,Inhibition -- %,1589749,0.493
4,Percent Effect -- %,1328366,0.547
...,...,...,...
95,Control DMSO Apoptotic Cells (%) -- %,14544,0.924
96,PPB -- %,14176,0.925
97,Tmax -- hr,14107,0.926
98,Ratio CC50/IC50 -- nan,13826,0.926


In [14]:
# First 50 rows of the pair table whose "type -- units" text contains the substring "IC50".
out.loc[out[0].str.contains('IC50')].head(50)

Unnamed: 0,0,count,cumulative_prop
1,IC50 -- nM,3276327,0.319
15,IC50 -- nan,196043,0.788
17,Ratio IC50 -- nan,142161,0.8
26,IC50 -- ug.mL-1,78221,0.839
67,MIC50 -- ug.mL-1,25477,0.902
76,LUCIFERASE EXPRESSION CONTROL - IC50 -- uM,19978,0.911
77,LUCIFERASE INFECTION ASSAY - IC50 -- uM,19978,0.911
98,Ratio CC50/IC50 -- nan,13826,0.926
231,MIC50 -- nM,3359,0.962
288,pIC50 -- nan,2092,0.969
