In [1]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [2]:
# Load activities
# Read the raw activity table from the processed-ChEMBL config directory,
# resolved relative to `root`.
root = '.'
activities_csv_path = os.path.join(root, "..", "config", "chembl_processed", "activities_all_raw.csv")
activities_all_raw = pd.read_csv(activities_csv_path, low_memory=False)

In [3]:
# 1. Flagging activity comments
# Comments whose manual_curation value is 1 go into the `_act` set, those
# curated as -1 into the `_inact` set.
activity_comments_csv_path = os.path.join(root, "..", "config", "manual_curation", "activity_comments_manual_curation.csv")
activity_comments_bin = pd.read_csv(activity_comments_csv_path, low_memory=False)
comment_curation = activity_comments_bin['manual_curation']
activity_comments_act = set(activity_comments_bin.loc[comment_curation == 1, 'activity_comment'])
activity_comments_inact = set(activity_comments_bin.loc[comment_curation == -1, 'activity_comment'])

# 2. Flagging standard text
# Standard-text values whose manual_curation value is 1 go into the `_act`
# set, those curated as -1 into the `_inact` set.
standard_text_csv_path = os.path.join(root, "..", "config", "manual_curation", "standard_text_manual_curation.csv")
standard_text_bin = pd.read_csv(standard_text_csv_path, low_memory=False)
text_curation = standard_text_bin['manual_curation']
standard_text_act = set(standard_text_bin.loc[text_curation == 1, 'standard_text_value'])
standard_text_inact = set(standard_text_bin.loc[text_curation == -1, 'standard_text_value'])

# 3. Unit conversion
# Two lookups keyed by the `standard_units` column: one to the final unit,
# one to the conversion formula stored in the table.
unit_conversion_csv_path = os.path.join(root, "..", "config", "chembl_processed", "unit_conversion.csv")
unit_conversion = pd.read_csv(unit_conversion_csv_path)
source_units = unit_conversion['standard_units']
standard_unit_to_final_unit = dict(zip(source_units, unit_conversion['final_unit']))
standard_unit_to_conversion_formula = dict(zip(source_units, unit_conversion['conversion_formula']))

# 4. pChEMBL calculation
def calculate_pchembl(uM):
    """Turn a micromolar value into a -log10(molar) score limited to [1, 9].

    Args:
        uM: Value in micromolar (a number or a NumPy array); it is scaled by
            1e-6 before the logarithm is taken.

    Returns:
        ``-log10(uM * 1e-6)`` clipped to the closed interval [1, 9].
    """
    molar = uM * 1e-6
    neg_log_molar = np.negative(np.log10(molar))
    return np.clip(neg_log_molar, 1, 9)

# 5. Relations py dict
# Maps every accepted raw relation symbol onto one of "=", ">" or "<".
RELATIONS = {
    # already canonical
    "=": "=",
    ">": ">",
    "<": "<",
    # "greater than" variants collapse to ">"
    ">>": ">",
    ">=": ">",
    # "less than" variants collapse to "<"
    "<<": "<",
    "<=": "<",
    # approximate equality is treated as equality
    "~": "=",
}

def convert_relation(i, RELATIONS):
    """Map a raw relation symbol to its canonical form.

    Args:
        i: Raw relation value to look up (e.g. ">=").
        RELATIONS: Mapping from raw relation symbols to canonical ones.

    Returns:
        ``RELATIONS[i]`` when the lookup succeeds, otherwise ``np.nan``.
    """
    try:
        return RELATIONS[i]
    except (KeyError, TypeError):
        # KeyError: the symbol is not in the mapping (this includes NaN).
        # TypeError: the value is unhashable and cannot be a key.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate
        # instead of being silently turned into NaN.
        return np.nan

In [27]:
# 1. Cleaning activity comments
# Replace each free-text comment with a flag: 1 if it is in
# activity_comments_act, -1 if it is in activity_comments_inact, and 0 when it
# is missing or not in either curated set.
print("Cleaning activity comments...")
raw_activity_comments = activities_all_raw['activity_comment']
if pd.api.types.is_integer_dtype(raw_activity_comments):
    # The column already holds the integer flags written by a previous run of
    # this cell. Looking those flags up as if they were comments would reset
    # them, so keep them as they are; this makes the cell safe to re-run.
    NEW_ACTIVITY_COMMENT = raw_activity_comments.tolist()
else:
    NEW_ACTIVITY_COMMENT = []
    for act_comment in tqdm(raw_activity_comments):
        if str(act_comment) == 'nan':
            # Missing comment.
            NEW_ACTIVITY_COMMENT.append(0)
        elif act_comment in activity_comments_act:
            NEW_ACTIVITY_COMMENT.append(1)
        elif act_comment in activity_comments_inact:
            NEW_ACTIVITY_COMMENT.append(-1)
        else:
            # Comment present but not in either curated set.
            NEW_ACTIVITY_COMMENT.append(0)

activities_all_raw['activity_comment'] = NEW_ACTIVITY_COMMENT
print(f"New activity comments: {dict(Counter(activities_all_raw['activity_comment']))}")

Cleaning activity comments...


100%|██████████| 24267312/24267312 [00:05<00:00, 4175197.99it/s]


New activity comments: {0: 20359186, -1: 3267587, 1: 640539}


In [28]:
# 2. Cleaning standard text
# Replace each standard-text value with a flag: 1 if it is in
# standard_text_act, -1 if it is in standard_text_inact, and 0 when it is
# missing or not in either curated set.
print("Cleaning standard text...")
raw_standard_text = activities_all_raw['standard_text_value']
if pd.api.types.is_integer_dtype(raw_standard_text):
    # The column already holds the integer flags written by a previous run of
    # this cell. Looking those flags up as if they were text values would reset
    # them, so keep them as they are; this makes the cell safe to re-run.
    NEW_STANDARD_TEXT = raw_standard_text.tolist()
else:
    NEW_STANDARD_TEXT = []
    for std_text_value in tqdm(raw_standard_text):
        if str(std_text_value) == 'nan':
            # Missing value.
            NEW_STANDARD_TEXT.append(0)
        elif std_text_value in standard_text_act:
            NEW_STANDARD_TEXT.append(1)
        elif std_text_value in standard_text_inact:
            NEW_STANDARD_TEXT.append(-1)
        else:
            # Value present but not in either curated set.
            NEW_STANDARD_TEXT.append(0)

activities_all_raw['standard_text_value'] = NEW_STANDARD_TEXT
print(f"New standard text: {dict(Counter(activities_all_raw['standard_text_value']))}")

Cleaning standard text...


100%|██████████| 24267312/24267312 [00:05<00:00, 4522295.84it/s]


New standard text: {0: 23963616, 1: 31764, -1: 271932}


In [32]:
# Show the DataFrame as the cell output (the notebook renders a truncated view).
activities_all_raw

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,tid,target_type,target_organism,target_chembl_id,target_tax_id,compound_chembl_id,canonical_smiles,MW,standard_relation,standard_value,standard_units,standard_type,activity_comment,pchembl_value,standard_text_value
0,400689,159498,CHEMBL768385,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,0.2,ug ml-1,ED99,0,,0
1,400690,196969,CHEMBL882153,F,1,50085,ORGANISM,Human rhinovirus sp.,CHEMBL612470,169066.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,20.0,ug ml-1,CyD50,0,,0
2,400691,196970,CHEMBL799318,F,1,50085,ORGANISM,Human rhinovirus sp.,CHEMBL612470,169066.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,2.0,ug ml-1,ED99,0,,0
3,400692,232617,CHEMBL851081,A,0,22224,ADMET,,CHEMBL612558,,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,25.0,,Therapeutic index 99,0,,0
4,400693,232618,CHEMBL851082,A,0,22224,ADMET,,CHEMBL612558,,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,10.0,,Therapeutic index 99,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24267307,400684,232617,CHEMBL851081,A,0,22224,ADMET,,CHEMBL612558,,CHEMBL164861,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1,330.292,=,33.3,,Therapeutic index 99,0,,0
24267308,400685,159502,CHEMBL768389,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,10.0,ug ml-1,Max non-toxic dose,0,,0
24267309,400686,159504,CHEMBL766871,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,100000.0,,Reduction factor,0,,0
24267310,400687,159509,CHEMBL766876,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,0.5,ug ml-1,Minimal dose,0,,0


In [47]:
# 3. Standardizing units and values
# Only the first row is examined: the loop prints the unit lookups for it and
# then stops, so the two result lists stay empty.
NEW_STD_VALUES = []
NEW_STD_UNITS = []
unit_inputs = activities_all_raw[['MW', 'standard_value', 'standard_units']].values
for row in tqdm(unit_inputs):
    mw, std_value, std_unit = row
    data = dict(standard_value=std_value, molecular_weight=mw)
    final_unit = standard_unit_to_final_unit[std_unit]
    conversion_formula = standard_unit_to_conversion_formula[std_unit]
    print(std_value, std_unit, mw, final_unit, conversion_formula)
    break

  0%|          | 0/24267312 [00:00<?, ?it/s]


0.2 ug ml-1 360.318 umol.L-1 standard_value*1000/molecular_weight


In [34]:
# Distinct values found in the standard_type column.
act_types = {*activities_all_raw['standard_type']}

In [42]:
# Count distinct types after trimming whitespace and lower-casing the strings;
# every non-string entry is replaced by NaN.
len({(entry.strip().lower() if type(entry) == str else np.nan) for entry in act_types})

6220

In [29]:
# 5. Clean relations
# Each raw relation is looked up in RELATIONS; symbols that are not in the
# mapping come back from convert_relation as NaN.
print("Cleaning relations...")
cleaned_relations = []
for raw_relation in tqdm(activities_all_raw["standard_relation"]):
    cleaned_relations.append(convert_relation(raw_relation, RELATIONS))
activities_all_raw["standard_relation"] = cleaned_relations
relation_counts = Counter(activities_all_raw['standard_relation'])
print(dict(relation_counts))

Cleaning relations...


100%|██████████| 24267312/24267312 [00:04<00:00, 4901371.29it/s]


{'=': 14877309, '<': 374796, '>': 1609239, nan: 7405968}


In [31]:
# Number of rows per assay_confidence_score value.
Counter(activities_all_raw['assay_confidence_score'])

Counter({1: 10313587,
         9: 6303554,
         8: 3841462,
         0: 3001986,
         5: 196643,
         6: 184473,
         7: 142953,
         3: 131165,
         4: 122342,
         2: 29147})