In [43]:
from collections import Counter
import pandas as pd
import numpy as np
import os

In [7]:
# Base directory against which the data paths below are resolved.
root = "."
activities_path = os.path.join(root, "..", "config", "chembl_activities", "activities.csv")
# low_memory=False: parse the file in a single pass instead of chunk by chunk.
activities = pd.read_csv(activities_path, low_memory=False)

In [8]:
# Report how many distinct activity, compound and assay identifiers are present.
# Series.nunique() counts distinct values in vectorised code, so no Python set of
# every cell has to be built; missing values (if any) are not counted.
print(f"Number of activities: {activities['activity_id'].nunique()}")
print(f"Number of unique compounds: {activities['molregno'].nunique()}")
print(f"Number of unique assays: {activities['assay_id'].nunique()}")

Number of activities: 24267312
Number of unique compounds: 2774266
Number of unique assays: 1890531


In [9]:
# Show the column labels of the loaded activities table.
activities.columns

Index(['activity_id', 'assay_id', 'doc_id', 'record_id', 'molregno',
       'standard_relation', 'standard_value', 'standard_units',
       'standard_flag', 'standard_type', 'activity_comment',
       'data_validity_comment', 'potential_duplicate', 'pchembl_value',
       'bao_endpoint', 'uo_units', 'qudt_units', 'toid', 'upper_value',
       'standard_upper_value', 'src_id', 'type', 'relation', 'value', 'units',
       'text_value', 'standard_text_value', 'action_type'],
      dtype='object')

In [10]:
# Restrict the activities table to the columns of interest, in this order.
columns = [
    'activity_id',
    'assay_id',
    'molregno',
    'standard_relation',
    'standard_value',
    'standard_units',
    'standard_type',
    'activity_comment',
    'data_validity_comment',
    'pchembl_value',
    'standard_upper_value',
    'standard_text_value',
    'action_type',
]
activities = activities[columns]

In [56]:
# Display the activities table; the notebook rendering is truncated to the
# first and last rows.
activities

Unnamed: 0,activity_id,assay_id,molregno,standard_relation,standard_value,standard_units,standard_type,activity_comment,data_validity_comment,pchembl_value,standard_upper_value,standard_text_value,action_type
0,31863,54505,180094,>,100000.0,nM,IC50,,,,,,
1,31864,83907,182268,=,2500.0,nM,IC50,,,5.60,,,
2,31865,88152,182268,>,50000.0,nM,IC50,,,,,,
3,31866,83907,182855,=,9000.0,nM,IC50,,,5.05,,,
4,31867,88153,182855,,,nM,IC50,Not Determined,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24267307,29054631,2596842,3238772,=,,,kon,1297011,,,,,
24267308,29054632,2596842,3238772,=,,s-1,k_off,1297011,,,,,
24267309,29054633,2596842,3195979,=,503.0,nM,IC50,1297012,,6.30,,,
24267310,29054634,2596842,3195979,=,,,kon,1297012,,,,,


In [57]:
# Count the rows that have no standard_value. Summing the boolean mask avoids
# copying every matching row into a new frame just to measure its length;
# int() keeps the displayed result a plain Python integer.
int(activities['standard_value'].isna().sum())

3345340

In [58]:
# Tally every (standard_type, standard_units) pair, keeping IC50 records only.
type_unit = Counter(
    (std_type, std_unit)
    for std_type, std_unit in zip(activities['standard_type'], activities['standard_units'])
    if std_type == 'IC50'
)

In [60]:
# Number of distinct (standard_type, standard_units) pairs recorded for IC50.
len(type_unit)

101

In [61]:
# Frequency of every standard_units value across all activity rows.
unit_counts = Counter(activities['standard_units'].tolist())

# Order the units from most to least frequent. The ascending stable sort is
# reversed (rather than sorted descending) on purpose: that is what determines
# the relative order of units whose counts are tied.
ranked_units = list(reversed(sorted(unit_counts, key=unit_counts.get)))
units = pd.DataFrame(
    [[unit, unit_counts[unit]] for unit in ranked_units],
    columns=['unit', 'count'],
)

# Running share of all rows covered by the units listed so far, to 3 decimals.
total_count = units['count'].sum()
units['cumulative_prop'] = (units['count'].cumsum() / total_count).round(3)

In [48]:
# Persist the unit labels in the order held by `units` (most frequent first),
# one label per line with no header row. str() renders a missing unit as the
# text "nan".
u = "\n".join(str(unit) for unit in units['unit'])
units_path = os.path.join(root, "..", "config", "chembl_activities", 'units.csv')
# Explicit UTF-8 so labels containing non-ASCII characters are written the same
# way on every platform instead of depending on the locale's default encoding.
with open(units_path, "w", encoding="utf-8") as out:
    out.write(u)

In [62]:
# The ten most frequent units with their cumulative share of all rows.
units.head(10)

Unnamed: 0,unit,count,cumulative_prop
0,nM,12587576,0.519
1,%,5200597,0.733
2,,3354717,0.871
3,ug.mL-1,952582,0.911
4,s-1,828026,0.945
5,uM,257898,0.955
6,hr,128555,0.961
7,mm,95027,0.964
8,ug ml-1,77448,0.968
9,mg.kg-1,65710,0.97


In [63]:
# The fifty most frequent units with their cumulative share of all rows.
units.head(50)

Unnamed: 0,unit,count,cumulative_prop
0,nM,12587576,0.519
1,%,5200597,0.733
2,,3354717,0.871
3,ug.mL-1,952582,0.911
4,s-1,828026,0.945
5,uM,257898,0.955
6,hr,128555,0.961
7,mm,95027,0.964
8,ug ml-1,77448,0.968
9,mg.kg-1,65710,0.97
