In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import os

# All paths are built relative to `root`, i.e. the current working directory
root = "."
activities_path = os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv")
# low_memory=False makes pandas read the file in one pass instead of in chunks
activities_all_raw = pd.read_csv(activities_path, low_memory=False)

In [6]:
# 8. Converting activity types to their corresponding synonyms
# synonyms.csv is read with two columns: `activity` (the name to keep) and
# `synonyms` (a ";"-separated list of alternative spellings of that name)
synonyms_path = os.path.join(root, "..", "config", "manual_curation", "synonyms.csv")
synonyms = pd.read_csv(synonyms_path)
for record in synonyms[["activity", "synonyms"]].itertuples(index=False, name=None):
    target_type, alias_list = record
    for alias in alias_list.split(";"):
        # Rewrite, in place, every row whose activity_type is exactly this alias
        alias_rows = activities_all_raw["activity_type"].eq(alias)
        activities_all_raw.loc[alias_rows, "activity_type"] = target_type

In [7]:
# 9. Keeping only the canonical unit per activity type
# Count how often each (activity_type, unit) pair occurs. Missing values are
# replaced by "" first, so they are counted like any other label.
s = activities_all_raw[["activity_type", "unit"]].astype("string").fillna("")
pair_counts = s.value_counts(subset=["activity_type", "unit"], dropna=False)
out = pair_counts.reset_index(name="count")
out = out.sort_values("count", ascending=False, ignore_index=True)

# Flag the most frequent pair of each activity type (keep = 1); every other
# pair of that activity type gets keep = 0
idx = out.groupby("activity_type")["count"].idxmax()
out["keep"] = out.index.isin(idx).astype("int64")

# The canonical unit of an activity type is the unit of its flagged pair
# (this is "" when the most frequent unit of that type is the missing one)
is_flagged = out["keep"] == 1
canonical = (
    out.loc[is_flagged, ["activity_type", "unit"]]
    .set_index("activity_type")
    .rename(columns={"unit": "canonical_unit"})
)
canonical_map = canonical["canonical_unit"].to_dict()
activities_all_raw["canonical_unit"] = activities_all_raw["activity_type"].map(canonical_map)

# Attach the canonical unit to every row of the pair summary
out = out.merge(canonical, on="activity_type", how="left")

# Save pair summary
pairs_path = os.path.join(root, "..", "config", "chembl_processed", "activity_type_unit_pairs.csv")
out.to_csv(pairs_path, index=False)

In [9]:
# Display the pair summary: one row per (activity_type, unit) pair with its
# count, its keep flag and the canonical unit of its activity type
out

Unnamed: 0,activity_type,unit,count,keep,canonical_unit
0,POTENCY,umol.L-1,4465197,1,umol.L-1
1,IC50,umol.L-1,3346464,1,umol.L-1
2,GI50,umol.L-1,2537503,1,umol.L-1
3,INHIBITION,%,1926847,1,%
4,PERCENTEFFECT,%,1328350,1,%
...,...,...,...,...,...
12668,ED100,umol,1,0,ug.mg-1
12669,ECG,,1,1,
12670,ED145,ug/kg,1,1,ug/kg
12671,ED195,ug/kg,1,1,ug/kg


In [4]:
# Filter out non-canonical activity-unit pairs
# The cell that derives `canonical_unit` counts pairs on a copy in which
# missing units were replaced by "". For activity types whose most frequent
# unit is the missing one, `canonical_unit` is therefore "" while `unit` itself
# is still NaN. A plain `==` (NaN never equals anything) or an
# `isna() & isna()` test misses those rows, so both sides are normalised to ""
# before comparing. Rows whose activity type has no canonical unit at all
# (NaN after the map) are treated the same way.
unit_filled = activities_all_raw["unit"].fillna("")
canonical_filled = activities_all_raw["canonical_unit"].fillna("")
keep_mask = unit_filled == canonical_filled
a = activities_all_raw[keep_mask].reset_index(drop=True)

In [5]:
# Display the activities that passed the canonical-unit filter
a

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,assay_organism,doc_chembl_id,tid,target_type,target_organism,...,MW,pchembl,activity_comment,standard_text,value,unit,activity_type,relation,pchembl_calculated,canonical_unit
0,31863,54505,CHEMBL663853,B,8,,CHEMBL1137930,63,SINGLE PROTEIN,Homo sapiens,...,312.328,,0,0,100.000,umol.L-1,IC50,>,4.000000,umol.L-1
1,31864,83907,CHEMBL872937,B,8,,CHEMBL1146658,11653,SINGLE PROTEIN,Homo sapiens,...,398.374,5.60,0,0,2.500,umol.L-1,IC50,=,5.602060,umol.L-1
2,31865,88152,CHEMBL693237,F,1,Homo sapiens,CHEMBL1146658,22221,NON-MOLECULAR,,...,398.374,,0,0,50.000,umol.L-1,IC50,>,4.301030,umol.L-1
3,31866,83907,CHEMBL872937,B,8,,CHEMBL1146658,11653,SINGLE PROTEIN,Homo sapiens,...,520.497,5.05,0,0,9.000,umol.L-1,IC50,=,5.045757,umol.L-1
4,31867,88153,CHEMBL693238,F,1,Homo sapiens,CHEMBL1146658,22221,NON-MOLECULAR,,...,520.497,,0,0,,umol.L-1,IC50,=,,umol.L-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22486138,29054631,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,436.468,,0,0,,,KON,=,,
22486139,29054632,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,436.468,,0,0,,s-1,KOFF,=,,s-1
22486140,29054633,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,435.484,6.30,0,0,0.503,umol.L-1,IC50,=,6.298432,umol.L-1
22486141,29054634,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,435.484,,0,0,,,KON,=,,


In [6]:
# Number of activities covered by the pairs flagged keep = 1
int(out.loc[out["keep"] == 1, "count"].sum())

22486224

In [7]:
# Number of activities covered by the pairs flagged keep = 0
int(out.loc[out["keep"] == 0, "count"].sum())

1554763

In [8]:
# Total number of activities across kept and discarded pairs. Computed from
# `out` rather than typed in from earlier outputs, so the figure cannot go
# stale when the input data changes.
kept_total = int(out.loc[out["keep"] == 1, "count"].sum())
discarded_total = int(out.loc[out["keep"] == 0, "count"].sum())
kept_total + discarded_total

24040987

In [9]:
# Display only the canonical (keep = 1) pair of each activity type
out.loc[out["keep"] == 1]

Unnamed: 0,activity_type,unit,count,keep,canonical_unit
0,POTENCY,umol.L-1,4465197,1,umol.L-1
1,IC50,umol.L-1,3346464,1,umol.L-1
2,GI50,umol.L-1,2537503,1,umol.L-1
3,INHIBITION,%,1926847,1,%
4,PERCENTEFFECT,%,1328350,1,%
...,...,...,...,...,...
12649,EC91,umol.L-1,1,1,umol.L-1
12655,FULLINHIBITION,umol.L-1,1,1,umol.L-1
12669,ECG,,1,1,
12670,ED145,ug/kg,1,1,ug/kg


In [None]:
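# TODO: export the discarded (keep == 0) pairs to CSV. The draft below is not
# runnable yet: os.path.join() still needs the output path components.
# out[out['keep'] == 0].reset_index(drop=True).drop(columns={'keep'}).to_csv(os.path.join())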