In [1]:
from rdkit.Chem import Descriptors
from collections import Counter
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [2]:
# Load the ChEMBL activities table from the config directory that sits one
# level above `root`. low_memory=False makes pandas read the file in a single
# pass rather than in chunks when inferring column dtypes.
root = "."
activities_path = os.path.join(root, "..", "config", "chembl_activities", "activities.csv")
activities = pd.read_csv(activities_path, low_memory=False)

# Report distinct identifier counts. Series.nunique() counts inside pandas
# instead of materialising a Python set of every value; it ignores missing
# values rather than counting them as identifiers.
print(f"Number of activities: {activities['activity_id'].nunique()}")
print(f"Number of unique compounds: {activities['molregno'].nunique()}")
print(f"Number of unique assays: {activities['assay_id'].nunique()}")

Number of activities: 24267312
Number of unique compounds: 2774266
Number of unique assays: 1890531


In [3]:
# Display the column index of the loaded activities table.
activities.columns

Index(['activity_id', 'assay_id', 'doc_id', 'record_id', 'molregno',
       'standard_relation', 'standard_value', 'standard_units',
       'standard_flag', 'standard_type', 'activity_comment',
       'data_validity_comment', 'potential_duplicate', 'pchembl_value',
       'bao_endpoint', 'uo_units', 'qudt_units', 'toid', 'upper_value',
       'standard_upper_value', 'src_id', 'type', 'relation', 'value', 'units',
       'text_value', 'standard_text_value', 'action_type'],
      dtype='object')

In [4]:
# Narrow the activities table to the columns of interest, in this order.
columns = [
    'activity_id',
    'assay_id',
    'molregno',
    'standard_relation',
    'standard_value',
    'standard_units',
    'standard_type',
    'activity_comment',
    'data_validity_comment',
    'pchembl_value',
    'standard_upper_value',
    'standard_text_value',
    'action_type',
]
activities = activities.loc[:, columns]

In [None]:
# Count how often each unit appears among records whose standard_type is
# exactly 'IC50'. The result stays a Counter keyed by ('IC50', unit).
# A boolean mask plus value_counts keeps the counting inside pandas instead of
# looping over every row in Python. dropna=False keeps records with a missing
# unit, grouped under a single NaN key.
ic50_unit_counts = activities.loc[
    activities['standard_type'] == 'IC50', 'standard_units'
].value_counts(dropna=False)
type_unit = Counter({('IC50', unit): int(count) for unit, count in ic50_unit_counts.items()})

In [None]:
# Number of distinct (standard_type, standard_units) pairs counted in type_unit.
len(type_unit)

In [None]:
# Tally every value of standard_units across the whole table.
unit_counts = Counter(activities['standard_units'].tolist())

# Order units by ascending count (stable sort), then flip to get the most
# frequent first, and lay the result out as a two-column table.
ascending_units = sorted(unit_counts, key=unit_counts.get)
units = pd.DataFrame(
    [[name, unit_counts[name]] for name in reversed(ascending_units)],
    columns=['unit', 'count'],
)

# Running share of all counted records covered by the units so far, rounded
# to three decimals.
total_count = units['count'].sum()
units['cumulative_prop'] = (units['count'].cumsum() / total_count).round(3)

In [None]:
# Write the unit names to units.csv, one per line, in the row order of `units`.
# Converting through a NumPy string array turns every entry into text before
# joining, so non-string entries do not break the join.
u = "\n".join(np.array(units['unit'].tolist()).astype(str))
units_path = os.path.join(root, "..", "config", "chembl_activities", 'units.csv')
# Use an explicit UTF-8 encoding so the file does not depend on the platform's
# default encoding; unit names may contain non-ASCII characters (e.g. a micro
# sign) — NOTE(review): confirm downstream readers also open it as UTF-8.
with open(units_path, "w", encoding="utf-8") as out:
    out.write(u)

In [None]:
# Show the first ten rows of the units table.
units.head(10)

In [None]:
# Show the first fifty rows of the units table.
units.head(50)