In [120]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import re

pd.set_option("display.max_columns", 100)

In [2]:
# Load activities
root = '.'
activities_all_raw = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "activities_all_raw.csv"), low_memory=False)

In [5]:
# 1. Flagging activity comments
activity_comments_bin = pd.read_csv(os.path.join(root, "..", "config", "manual_curation", "activity_comments_manual_curation.csv"), low_memory=False)
activity_comments_act = set(activity_comments_bin[activity_comments_bin['manual_curation'] == 1]['activity_comment'])
activity_comments_inact = set(activity_comments_bin[activity_comments_bin['manual_curation'] == -1]['activity_comment'])

# 2. Flagging standard text
standard_text_bin = pd.read_csv(os.path.join(root, "..", "config", "manual_curation", "standard_text_manual_curation.csv"), low_memory=False)
standard_text_act = set(standard_text_bin[standard_text_bin['manual_curation'] == 1]['standard_text_value'])
standard_text_inact = set(standard_text_bin[standard_text_bin['manual_curation'] == -1]['standard_text_value'])

# 3. Unit conversion
unit_conversion = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "unit_conversion.csv"))
standard_unit_to_final_unit = {i: j for i,j in zip(unit_conversion['standard_units'], unit_conversion['final_unit'])}
standard_unit_to_conversion_formula = {i: j for i,j in zip(unit_conversion['standard_units'], unit_conversion['conversion_formula'])}

# X. pChEMBL calculation
def calculate_pchembl(uM):
    value = uM * 1e-6
    pchembl_value = np.clip(-np.log10(value), 1, 9)
    return pchembl_value

# X. Relations py dict
RELATIONS = {"=": "=",
             ">": ">",
             "<": "<",
             ">>": ">",
             ">=": ">",
             "<<": "<",
             "<=": "<",
             "~": "="}

def convert_relation(i, RELATIONS):
    try:
        return RELATIONS[i]
    except:
        return np.nan

In [6]:
# 1. Cleaning activity comments
print("Cleaning activity comments...")
NEW_ACTIVITY_COMMENT = []
for act_comment in tqdm(activities_all_raw['activity_comment']):
    if str(act_comment) == 'nan':
        NEW_ACTIVITY_COMMENT.append(0)
    elif act_comment in activity_comments_act:
        NEW_ACTIVITY_COMMENT.append(1)
    elif act_comment in activity_comments_inact:
        NEW_ACTIVITY_COMMENT.append(-1)
    else:
        NEW_ACTIVITY_COMMENT.append(0)

activities_all_raw['new_activity_comment'] = NEW_ACTIVITY_COMMENT
print(f"New activity comments: {dict(Counter(activities_all_raw['new_activity_comment']))}")

Cleaning activity comments...


100%|██████████| 24267312/24267312 [00:05<00:00, 4310488.53it/s]


New activity comments: {0: 20359186, -1: 3267587, 1: 640539}


In [10]:
# 2. Cleaning standard text
print("Cleaning standard text...")
NEW_STANDARD_TEXT = []
for std_text_value in tqdm(activities_all_raw['standard_text_value']):
    if str(std_text_value) == 'nan':
        NEW_STANDARD_TEXT.append(0)
    elif std_text_value in standard_text_act:
        NEW_STANDARD_TEXT.append(1)
    elif std_text_value in standard_text_inact:
        NEW_STANDARD_TEXT.append(-1)
    else:
        NEW_STANDARD_TEXT.append(0)

activities_all_raw['new_standard_text_value'] = NEW_STANDARD_TEXT
print(f"New standard text: {dict(Counter(activities_all_raw['new_standard_text_value']))}")

Cleaning standard text...


100%|██████████| 24267312/24267312 [00:05<00:00, 4538884.45it/s]


New standard text: {0: 23963616, 1: 31764, -1: 271932}


In [45]:
# 3. Harmonizing units and values
NEW_VALUES, NEW_UNITS = [], []
for mw, std_value, std_unit in tqdm(activities_all_raw[['MW', 'standard_value', 'standard_units']].values):

    # Get conversion formula
    if std_unit in standard_unit_to_conversion_formula:
        conversion_formula = standard_unit_to_conversion_formula[std_unit]
    else:
        # Only when std_unit is nan
        conversion_formula = np.nan

    # Get final_unit
    if std_unit in standard_unit_to_final_unit:
        final_unit = standard_unit_to_final_unit[std_unit]
    else:
        # Only when std_unit is nan
        final_unit = np.nan
    NEW_UNITS.append(final_unit)

    # Get new value
    if str(std_value) != 'nan':
        if str(conversion_formula) != 'nan':
            data = {'standard_value': std_value, 'molecular_weight': mw}
            new_value = eval(conversion_formula, data)
            NEW_VALUES.append(new_value)
        else:
            NEW_VALUES.append(std_value)
    else:
        NEW_VALUES.append(np.nan)

100%|██████████| 24267312/24267312 [01:26<00:00, 279062.62it/s]


In [46]:
activities_all_raw['new_values'] = NEW_VALUES
activities_all_raw['new_units'] = NEW_UNITS

In [91]:
new_unit_to_old_units = {i: set() for i in set(NEW_UNITS)}
for i,j in zip(activities_all_raw['new_units'], activities_all_raw['standard_units']):
    new_unit_to_old_units[i].add(j)

In [165]:
# 4. Harmonizing activity types

def harmonize_act_type(act_type):
    return re.sub(r"[_\s./\\]", "", str(act_type).upper().strip())

# # From harmonized types to types and viceversa
# types = Counter(activities_all_raw['standard_type'])
# harmonized_types = {harmonize_act_type(i): set() for i in types}
# for ty in types:
#     harmonized_types[harmonize_act_type(ty)].add(ty)

HARMONIZED_TYPES = [harmonize_act_type(i) for i in tqdm(activities_all_raw['standard_type'])]
activities_all_raw['harmonized_type'] = HARMONIZED_TYPES

harmonized_types_to_types = {i: set() for i in set(HARMONIZED_TYPES)}
for ty, harm_ty in zip(activities_all_raw['standard_type'], activities_all_raw['harmonized_type']):
    harmonized_types_to_types[harm_ty].add(ty)

100%|██████████| 24267312/24267312 [00:16<00:00, 1516123.40it/s]


In [167]:
len(set(activities_all_raw['standard_type'])), len(set(activities_all_raw['harmonized_type']))

(6449, 6068)

In [175]:
sorted([[i, len(harmonized_types_to_types[i]), sorted(harmonized_types_to_types[i])] for i in harmonized_types_to_types], key = lambda x: x[1])[::-1]

[['PKB', 7, ['PKB', 'PKb', 'PkB', 'pKB', 'pKb', 'pkB', 'pkb']],
 ['PKA', 5, ['PKa', 'Pka', 'pKA', 'pKa', 'pka']],
 ['DELTATM', 5, ['Delta TM', 'Delta Tm', 'DeltaTm', 'delta Tm', 'deltaTm']],
 ['ACTIVITY',
  5,
  ['A ctivity', 'Activ ity', 'Activit y', 'Activity', 'activity']],
 ['KA', 4, ['K A', 'KA', 'Ka', 'ka']],
 ['2XAPTT', 4, ['2 x APTT', '2x APTT', '2x aPTT', '2xAPTT']],
 ['KB', 4, ['K B', 'KB', 'Kb', 'kb']],
 ['KR', 4, ['KR', 'Kr', 'kR', 'kr']],
 ['LOGKA', 4, ['Log KA', 'Log Ka', 'logKA', 'logKa']],
 ['KOBS', 4, ['K obs', 'Kobs', 'k_obs', 'kobs']],
 ['KBAPP', 4, ['K Bapp', 'KB app', 'Kb app', 'Kbapp']],
 ['KACT', 4, ['K ACT', 'K act', 'Kact', 'k act']],
 ['KH', 4, ['K H', 'KH', 'Kh', 'kH']],
 ['DELTAT', 4, ['Delta T', 'Delta t', 'delta T', 'deltaT']],
 ["LOGK'W", 4, ["Log K'W", "Log K'w", "Log k' w", "Log k'w"]],
 ['KINACT', 3, ['K inact', 'Kinact', 'kinact']],
 ["PD'2", 3, ["PD' 2", "PD'2", "pD'2"]],
 ['KIKI', 3, ['Ki/KI', 'Ki/ki', 'ki/Ki']],
 ['KE', 3, ['KE', 'Ke', 'ke']],
 ['P

In [178]:
# 5. Clean relations
print("Cleaning relations...")
activities_all_raw["standard_relation"] = [convert_relation(i, RELATIONS) for i in tqdm(activities_all_raw["standard_relation"])]
print(dict(Counter(activities_all_raw['standard_relation'])))

Cleaning relations...


100%|██████████| 24267312/24267312 [00:04<00:00, 5079492.14it/s]


{'=': 14877309, '<': 374796, '>': 1609239, nan: 7405968}


In [200]:
# 6. Calculating pChEMBL
print("Calculating pChEMBL values...")
calculated_pChEMBLs = []
for unit, value, pch in tqdm(activities_all_raw[['new_units', 'new_values', 'pchembl_value']].values[84200:84300]):
    if str(pch) == 'nan':
        if str(value) != 'nan' and unit == "umol.L-1":
            print(value)
            value = calculate_pchembl(value)
            calculated_pChEMBLs.append(value)
        else:
            calculated_pChEMBLs.append(np.nan)
    else:
        calculated_pChEMBLs.append(np.nan)

activities_all_raw['calculated_pChEMBLs'] = calculated_pChEMBLs

Calculating pChEMBL values...


  pchembl_value = np.clip(-np.log10(value), 1, 9)
100%|██████████| 100/100 [00:00<00:00, 253126.37it/s]


1.911589008363202
3.3452807646356035
5.176291549445231
25.881457747226154
2.628431746198631
55.8541746067209
235.0
100.0
1.0
10000.0
10000.0
10.0
10.0
0.05680494522152333
0.0
0.09693833955526221
5.340482459185363
5.340482459185363
5.340482459185363
1.3351206147963408


ValueError: Length of values (100) does not match length of index (24267312)

In [192]:
activities_all_raw[activities_all_raw['calculated_pChEMBLs'].isna() == False]

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,tid,target_type,target_organism,target_chembl_id,target_tax_id,compound_chembl_id,canonical_smiles,MW,standard_relation,standard_value,standard_units,standard_type,activity_comment,pchembl_value,standard_text_value,new_activity_comment,new_standard_text_value,new_values,new_units,harmonized_type,calculated_pChEMBLs
0,400689,159498,CHEMBL768385,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,0.2,ug ml-1,ED99,,,,0,0,0.555065,umol.L-1,ED99,6.255656
1,400690,196969,CHEMBL882153,F,1,50085,ORGANISM,Human rhinovirus sp.,CHEMBL612470,169066.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,20.0,ug ml-1,CyD50,,,,0,0,55.506525,umol.L-1,CYD50,4.255656
2,400691,196970,CHEMBL799318,F,1,50085,ORGANISM,Human rhinovirus sp.,CHEMBL612470,169066.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,2.0,ug ml-1,ED99,,,,0,0,5.550652,umol.L-1,ED99,5.255656
5,400694,159502,CHEMBL768389,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL162912,COc1ccc(-c2oc3cccc(OC)c3c(=O)c2OC)cc1,312.321,=,100.0,ug ml-1,Max non-toxic dose,,,,0,0,320.183401,umol.L-1,MAXNON-TOXICDOSE,3.494601
26,400715,38405,CHEMBL648907,F,1,50174,ORGANISM,Bacteroides fragilis,CHEMBL614411,817.0,CHEMBL77542,O=C(c1ccc(Cl)cc1Cl)C(Cl)(Cn1ccnc1)c1ccccc1,379.674,<,0.8,ug.mL-1,MIC,,,,0,0,2.107071,umol.L-1,MIC,5.676321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24267305,400682,159497,CHEMBL769455,A,1,80583,CELL-LINE,Chlorocebus sabaeus,CHEMBL391,60711.0,CHEMBL164861,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1,330.292,=,10.0,ug ml-1,CyD50,,,,0,0,30.276240,umol.L-1,CYD50,4.518898
24267306,400683,159498,CHEMBL768385,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL164861,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1,330.292,=,0.3,ug ml-1,ED99,,,,0,0,0.908287,umol.L-1,ED99,6.041777
24267308,400685,159502,CHEMBL768389,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,10.0,ug ml-1,Max non-toxic dose,,,,0,0,27.753262,umol.L-1,MAXNON-TOXICDOSE,4.556686
24267310,400687,159509,CHEMBL766876,F,1,50076,ORGANISM,Enterovirus C,CHEMBL612462,138950.0,CHEMBL349973,COc1cc(O)c2c(=O)c(OC)c(-c3ccc(O)c(O)c3)oc2c1OC,360.318,=,0.5,ug ml-1,Minimal dose,,,,0,0,1.387663,umol.L-1,MINIMALDOSE,5.857716
