In [1]:
from rdkit.Chem import Descriptors
from collections import Counter
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [2]:
# Load the ChEMBL tables used throughout this notebook.
# All paths are resolved relative to `root`, one directory up, under config/.
root = "."
chembl_activities_dir = os.path.join(root, "..", "config", "chembl_activities")
chembl_processed_dir = os.path.join(root, "..", "config", "chembl_processed")

# Columns of the activities table that are kept
columns = ['activity_id', 'assay_id', 'molregno','standard_relation', 'standard_value', 'standard_units', 'standard_type', 'activity_comment',
           'data_validity_comment', 'pchembl_value','standard_upper_value','standard_text_value', 'action_type']

# Load ChEMBL activities.
# `usecols` makes the parser skip every column that is not needed, instead of
# loading the whole table and discarding most of it afterwards. `usecols` keeps
# the file's own column order, so the trailing `[columns]` restores the order
# given in the list above.
activities = pd.read_csv(
    os.path.join(chembl_activities_dir, "activities.csv"),
    usecols=columns,
    low_memory=False,
)[columns]

# Load assays
assays = pd.read_csv(os.path.join(chembl_activities_dir, "assays.csv"), low_memory=False)

# Load targets
targets = pd.read_csv(os.path.join(chembl_activities_dir, "target_dictionary.csv"), low_memory=True)

# Load compounds
compounds = pd.read_csv(os.path.join(chembl_processed_dir, "compound_info.csv"), low_memory=True)


# Sanity-check the size of the activities table.
# `nunique()` counts distinct non-missing identifiers without building a Python
# set over every row (where each NaN of a numeric column would count separately).
print(f"Number of activities: {activities['activity_id'].nunique()}")
print(f"Number of unique compounds in activities: {activities['molregno'].nunique()}")
print(f"Number of unique assays in activities: {activities['assay_id'].nunique()}")

Number of activities: 24267312
Number of unique compounds in activities: 2774266
Number of unique assays in activities: 1890531


In [3]:
# Total number of assay rows, followed by the number of rows per assay_type value
assay_type_counts = Counter(assays['assay_type'])
print(len(assays), assay_type_counts)

1890749 Counter({'F': 884978, 'B': 595857, 'A': 311430, 'T': 68178, 'P': 26618, 'U': 3688})


In [4]:
# Total number of target rows, followed by the number of rows per target_type value
target_type_counts = Counter(targets['target_type'])
print(len(targets), target_type_counts)

17803 Counter({'SINGLE PROTEIN': 10962, 'ORGANISM': 2383, 'CELL-LINE': 1997, 'PROTEIN COMPLEX': 645, 'PROTEIN-PROTEIN INTERACTION': 596, 'PROTEIN FAMILY': 428, 'TISSUE': 294, 'SELECTIVITY GROUP': 123, 'NUCLEIC-ACID': 112, 'PROTEIN COMPLEX GROUP': 66, 'SMALL MOLECULE': 41, 'CHIMERIC PROTEIN': 34, 'OLIGOSACCHARIDE': 22, 'UNKNOWN': 20, 'SUBCELLULAR': 18, 'MACROMOLECULE': 18, 'PROTEIN NUCLEIC-ACID COMPLEX': 11, 'LIPID': 11, 'METAL': 10, '3D CELL CULTURE': 6, 'PHENOTYPE': 2, 'NON-MOLECULAR': 1, 'ADMET': 1, 'UNCHECKED': 1, 'NO TARGET': 1})


In [5]:
# Total number of compound rows, followed by the number of rows per molecule_type
# value (missing values show up under their own key)
molecule_type_counts = Counter(compounds['molecule_type'])
print(len(compounds), molecule_type_counts)

2854815 Counter({'Small molecule': 1915414, nan: 526709, 'Unknown': 390341, 'Protein': 22241, 'Oligonucleotide': 57, 'Oligosaccharide': 53})


In [9]:
# Frequency table of the activity column named by LABEL
LABEL = 'standard_type'

# Number of distinct raw values in that column
print(len(set(activities[LABEL])))

# Cast to pandas' string dtype and turn missing entries into "" so that they
# get a row of their own in the table
s = activities[[LABEL]].astype("string").fillna("")

# One row per distinct value, most frequent first
out = (
    s.value_counts(dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)

# Running share of all rows covered by the values listed so far
total_count = out['count'].sum()
out['cumulative_prop'] = out['count'].cumsum().div(total_count).round(3)

# Show the 70 most frequent values
out.head(70)

6449


Unnamed: 0,standard_type,count,cumulative_prop
0,Potency,4473542,0.184
1,IC50,3552865,0.331
2,GI50,2629061,0.439
3,Inhibition,1593108,0.505
4,Activity,1357448,0.561
...,...,...,...
65,Selectivity ratio,22218,0.923
66,Ratio EC50,22151,0.924
67,Ratio CC50/EC50,21436,0.925
68,LogP,20536,0.926


In [7]:
# Frequency table of the activity column named by LABEL
LABEL = 'standard_units'

# Number of distinct raw values in that column
print(len(set(activities[LABEL])))

# Cast to pandas' string dtype and turn missing entries into "" so that they
# get a row of their own in the table
s = activities[[LABEL]].astype("string").fillna("")

# One row per distinct value, most frequent first
out = (
    s.value_counts(dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)

# Running share of all rows covered by the values listed so far
total_count = out['count'].sum()
out['cumulative_prop'] = out['count'].cumsum().div(total_count).round(3)

# Show the 70 most frequent values
out.head(70)

2700


Unnamed: 0,standard_units,count,cumulative_prop
0,nM,12587576,0.519
1,%,5200597,0.733
2,,3354717,0.871
3,ug.mL-1,952582,0.911
4,s-1,828026,0.945
...,...,...,...
65,kcal/mol,1586,0.994
66,min-1,1578,0.994
67,equiv,1528,0.994
68,pmol,1469,0.994


In [8]:
# Frequency table of the (standard_type, standard_units) combinations
LABEL = ['standard_type', 'standard_units']
type_col, units_col = LABEL

# Build one "<type> -- <units>" label per activity row.
# Series.map(str) stringifies every element (a missing value becomes the text
# "nan") and the pieces are joined with Series-level concatenation, so no
# explicit Python loop over the rows is needed.
# The single column is named 0, which is what a DataFrame built from a plain
# list of strings would call it.
s = (activities[type_col].map(str) + " -- " + activities[units_col].map(str)).to_frame(name=0)

# Number of labelled rows (one per activity)
print(len(s))

# One row per distinct label, most frequent first
out = (
    s.value_counts(dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)

# Running share of all rows covered by the labels listed so far
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)

# Show the 70 most frequent combinations
out[:70]

24267312


Unnamed: 0,0,count,cumulative_prop
0,Potency -- nM,4471741,0.184
1,IC50 -- nM,3276327,0.319
2,GI50 -- nM,2617582,0.427
3,Inhibition -- %,1589749,0.493
4,Percent Effect -- %,1328366,0.547
...,...,...,...
65,GI -- nan,26480,0.900
66,Cmax -- nM,25644,0.901
67,MIC50 -- ug.mL-1,25477,0.902
68,Delta TM -- C,24914,0.903
