In [1]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import re

pd.set_option("display.max_columns", 100)

In [2]:
# Load activities
root = '.'
activities_all_raw = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "activities_all_raw.csv"), low_memory=False)

In [3]:
# 1. Flagging activity comments
activity_comments_bin = pd.read_csv(os.path.join(root, "..", "config", "manual_curation", "activity_comments_manual_curation.csv"), low_memory=False)
activity_comments_act = set(activity_comments_bin[activity_comments_bin['manual_curation'] == 1]['activity_comment'])
activity_comments_inact = set(activity_comments_bin[activity_comments_bin['manual_curation'] == -1]['activity_comment'])

# 2. Flagging standard text
standard_text_bin = pd.read_csv(os.path.join(root, "..", "config", "manual_curation", "standard_text_manual_curation.csv"), low_memory=False)
standard_text_act = set(standard_text_bin[standard_text_bin['manual_curation'] == 1]['standard_text_value'])
standard_text_inact = set(standard_text_bin[standard_text_bin['manual_curation'] == -1]['standard_text_value'])

# 3. Unit conversion
unit_conversion = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "unit_conversion.csv"))
standard_unit_to_final_unit = {i: j for i,j in zip(unit_conversion['standard_units'], unit_conversion['final_unit'])}
standard_unit_to_conversion_formula = {i: j for i,j in zip(unit_conversion['standard_units'], unit_conversion['conversion_formula'])}

# X. pChEMBL calculation
def calculate_pchembl(uM):
    try:
        value = uM * 1e-6
        pchembl_value = np.clip(-np.log10(value), 1, 9)
        return pchembl_value
    except:
        return np.nan

# X. Relations py dict
RELATIONS = {"=": "=",
             ">": ">",
             "<": "<",
             ">>": ">",
             ">=": ">",
             "<<": "<",
             "<=": "<",
             np.nan: "=",
             "~": "="}

def convert_relation(i, RELATIONS):
    return RELATIONS[i]

In [4]:
# 1. Cleaning activity comments
print("Cleaning activity comments...")
NEW_ACTIVITY_COMMENT = []
for act_comment in tqdm(activities_all_raw['activity_comment']):
    if str(act_comment) == 'nan':
        NEW_ACTIVITY_COMMENT.append(0)
    elif act_comment in activity_comments_act:
        NEW_ACTIVITY_COMMENT.append(1)
    elif act_comment in activity_comments_inact:
        NEW_ACTIVITY_COMMENT.append(-1)
    else:
        NEW_ACTIVITY_COMMENT.append(0)

activities_all_raw['new_activity_comment'] = NEW_ACTIVITY_COMMENT
print(f"New activity comments: {dict(Counter(activities_all_raw['new_activity_comment']))}")

Cleaning activity comments...


100%|██████████| 24267312/24267312 [00:05<00:00, 4291451.15it/s]


New activity comments: {0: 20359186, -1: 3267587, 1: 640539}


In [5]:
# 2. Cleaning standard text
print("Cleaning standard text...")
NEW_STANDARD_TEXT = []
for std_text_value in tqdm(activities_all_raw['standard_text_value']):
    if str(std_text_value) == 'nan':
        NEW_STANDARD_TEXT.append(0)
    elif std_text_value in standard_text_act:
        NEW_STANDARD_TEXT.append(1)
    elif std_text_value in standard_text_inact:
        NEW_STANDARD_TEXT.append(-1)
    else:
        NEW_STANDARD_TEXT.append(0)

activities_all_raw['new_standard_text_value'] = NEW_STANDARD_TEXT
print(f"New standard text: {dict(Counter(activities_all_raw['new_standard_text_value']))}")

Cleaning standard text...


100%|██████████| 24267312/24267312 [00:05<00:00, 4383954.61it/s]


New standard text: {0: 23963616, 1: 31764, -1: 271932}


In [6]:
# 3. Harmonizing units and values
NEW_VALUES, NEW_UNITS = [], []
print("Harmonizing units and converting values")
for mw, std_value, std_unit in tqdm(activities_all_raw[['MW', 'standard_value', 'standard_units']].values):

    # Get conversion formula
    if std_unit in standard_unit_to_conversion_formula:
        conversion_formula = standard_unit_to_conversion_formula[std_unit]
    else:
        # Only when std_unit is nan
        conversion_formula = np.nan

    # Get final_unit
    if std_unit in standard_unit_to_final_unit:
        final_unit = standard_unit_to_final_unit[std_unit]
    else:
        # Only when std_unit is nan
        final_unit = np.nan
    NEW_UNITS.append(final_unit)

    # Get new value
    if str(std_value) != 'nan':
        if str(conversion_formula) != 'nan':
            data = {'standard_value': std_value, 'molecular_weight': mw}
            new_value = eval(conversion_formula, data)
            NEW_VALUES.append(new_value)
        else:
            NEW_VALUES.append(std_value)
    else:
        NEW_VALUES.append(np.nan)

# Save in df
activities_all_raw['converted_values'] = NEW_VALUES
activities_all_raw['converted_units'] = NEW_UNITS

Harmonizing units and converting values


100%|██████████| 24267312/24267312 [01:28<00:00, 274810.84it/s]


In [7]:
# Saving CSV file with converted units and values
d = dict(Counter(activities_all_raw['converted_units']))
df = [[unit, d[unit]] for unit in sorted(d, key=lambda x: d[x])[::-1]]
df = pd.DataFrame(df, columns=['unit', 'count'])
total_count = np.sum(df['count'])
df['cumulative_prop'] = (df['count'].cumsum() / total_count).round(3)
df.to_csv(os.path.join(root, "..", "config", "chembl_processed", "converted_units.csv"), index=False)

In [8]:
# Dict mapping old units with new units
new_unit_to_old_units = {i: set() for i in set(NEW_UNITS)}
for i,j in zip(activities_all_raw['converted_units'], activities_all_raw['standard_units']):
    new_unit_to_old_units[i].add(j)

df = [[unit, len(new_unit_to_old_units[unit]), " ; ".join([str(k) for k in new_unit_to_old_units[unit]])] 
      for unit in sorted(new_unit_to_old_units, key=lambda x: len(new_unit_to_old_units[x]))[::-1]]
df = pd.DataFrame(df, columns=['unit', 'count', 'old_units'])
df.to_csv(os.path.join(root, "..", "config", "chembl_processed", "converted_units_map.csv"), index=False)

In [9]:
# 4. Harmonizing activity types

def harmonize_act_type(act_type):
    return re.sub(r"[_\s./\\]", "", str(act_type).upper().strip())

# # From harmonized types to types and viceversa
# types = Counter(activities_all_raw['standard_type'])
# harmonized_types = {harmonize_act_type(i): set() for i in types}
# for ty in types:
#     harmonized_types[harmonize_act_type(ty)].add(ty)

HARMONIZED_TYPES = [harmonize_act_type(i) for i in tqdm(activities_all_raw['standard_type'])]
activities_all_raw['harmonized_type'] = HARMONIZED_TYPES

harmonized_types_to_types = {i: set() for i in set(HARMONIZED_TYPES)}
for ty, harm_ty in zip(activities_all_raw['standard_type'], activities_all_raw['harmonized_type']):
    harmonized_types_to_types[harm_ty].add(ty)

100%|██████████| 24267312/24267312 [00:16<00:00, 1462484.39it/s]


In [10]:
len(set(activities_all_raw['standard_type'])), len(set(activities_all_raw['harmonized_type']))

(6449, 6068)

In [11]:
sorted([[i, len(harmonized_types_to_types[i]), sorted(harmonized_types_to_types[i])] for i in harmonized_types_to_types], key = lambda x: x[1])[::-1][:15]

[['PKB', 7, ['PKB', 'PKb', 'PkB', 'pKB', 'pKb', 'pkB', 'pkb']],
 ['PKA', 5, ['PKa', 'Pka', 'pKA', 'pKa', 'pka']],
 ['DELTATM', 5, ['Delta TM', 'Delta Tm', 'DeltaTm', 'delta Tm', 'deltaTm']],
 ['ACTIVITY',
  5,
  ['A ctivity', 'Activ ity', 'Activit y', 'Activity', 'activity']],
 ["LOGK'W", 4, ["Log K'W", "Log K'w", "Log k' w", "Log k'w"]],
 ['KB', 4, ['K B', 'KB', 'Kb', 'kb']],
 ['KR', 4, ['KR', 'Kr', 'kR', 'kr']],
 ['KACT', 4, ['K ACT', 'K act', 'Kact', 'k act']],
 ['2XAPTT', 4, ['2 x APTT', '2x APTT', '2x aPTT', '2xAPTT']],
 ['DELTAT', 4, ['Delta T', 'Delta t', 'delta T', 'deltaT']],
 ['KA', 4, ['K A', 'KA', 'Ka', 'ka']],
 ['LOGKA', 4, ['Log KA', 'Log Ka', 'logKA', 'logKa']],
 ['KH', 4, ['K H', 'KH', 'Kh', 'kH']],
 ['KBAPP', 4, ['K Bapp', 'KB app', 'Kb app', 'Kbapp']],
 ['KOBS', 4, ['K obs', 'Kobs', 'k_obs', 'kobs']]]

In [12]:
# 5. Clean relations
print("Cleaning relations...")
activities_all_raw["standard_relation"] = [convert_relation(i, RELATIONS) for i in tqdm(activities_all_raw["standard_relation"])]
print(dict(Counter(activities_all_raw['standard_relation'])))

Cleaning relations...


100%|██████████| 24267312/24267312 [00:03<00:00, 6204883.68it/s]


{'=': 22283277, '<': 374796, '>': 1609239}


In [13]:
# 6. Calculating pChEMBL
print("Calculating pChEMBL values...")
calculated_pChEMBLs = []
for unit, value, pch in tqdm(activities_all_raw[['converted_units', 'converted_values', 'pchembl_value']].values):
    if str(value) != 'nan' and unit == "umol.L-1":
        value = calculate_pchembl(value)
        calculated_pChEMBLs.append(value)
    else:
        calculated_pChEMBLs.append(np.nan)

activities_all_raw['calculated_pChEMBLs'] = calculated_pChEMBLs

Calculating pChEMBL values...


  pchembl_value = np.clip(-np.log10(value), 1, 9)
  pchembl_value = np.clip(-np.log10(value), 1, 9)
100%|██████████| 24267312/24267312 [01:03<00:00, 380096.85it/s] 


In [14]:
# del activities_all_raw['standard_relation']
# del activities_all_raw['standard_value']
# del activities_all_raw['standard_units']
# del activities_all_raw['standard_type']
# del activities_all_raw['activity_comment']
# del activities_all_raw['standard_text_value']

activities_all_raw = activities_all_raw.rename(columns={
                          "new_activity_comment": "activity_comment",
                          "new_standard_text_value": "standard_text",
                          "converted_values": "value",
                          "converted_units": "unit",
                          "harmonized_type": "activity_type",
                          "pchembl_value": "pchembl",
                          "calculated_pChEMBLs": "pchembl_calculated"
                          })

In [15]:
Counter(activities_all_raw[activities_all_raw['pchembl'].isna() == False]['unit'])

Counter({'umol.L-1': 4896791})

In [26]:
Counter(activities_all_raw[(activities_all_raw['pchembl_calculated'].isna() == False) & (activities_all_raw['pchembl'].isna() == True)]['standard_units'])

Counter({'nM': 7669406,
         'ug.mL-1': 940031,
         'uM': 257124,
         'ug ml-1': 76880,
         'ng/ml': 7737,
         'mM': 7017,
         "10'-2 umol/ml": 3972,
         'mg/dl': 3646,
         'M': 2933,
         'umol/L': 2467,
         'mg/L': 2119,
         'ng ml-1': 1965,
         'mg/ml': 1053,
         'umol/ml': 583,
         "10'-2microM": 540,
         "10'-2mmol/ml": 507,
         "10'-3micromol/ml": 440,
         "10'-2micromol/ml": 349,
         'M l-1': 249,
         'mg l-1': 249,
         'nmol/ml': 223,
         '10^2umol/ml': 192,
         "10'-5M": 162,
         "10'-8M": 126,
         "10'-4M": 112,
         'microg/ml': 98,
         "10'3nM": 49,
         "10'-5 ug/ml": 46,
         "10'-6 ug/ml": 38,
         'microM': 33,
         "10'-7 ug/ml": 30,
         "10'-8 ug/ml": 18,
         'umol/uL': 14,
         'ug l-1': 12,
         '10^-3mM': 8,
         '10^-6 uM': 5,
         "10'4pg/ml": 1})

In [None]:
len(activities_all_raw[activities_all_raw['pchembl_calculated'].isna() == False]) - len(activities_all_raw[activities_all_raw['pchembl'].isna() == False])

In [None]:
pchembls = activities_all_raw[activities_all_raw['pchembl'].isna() == False].reset_index(drop=True)

In [None]:
Counter(np.isclose(pchembls['pchembl'], pchembls['pchembl_calculated'], atol = 1))

In [None]:
activities_all_raw[activities_all_raw['pchembl'].isna() == False].sort_values('value')

In [None]:
activities_all_raw.to_csv(os.path.join(root, "..", "config", 'chembl_processed', 'activities_preprocessed.csv'), index=False)

In [None]:
Counter(activities_all_raw['activity_type'])

In [None]:
d = dict(Counter(activities_all_raw['target_organism']))

In [None]:
d