In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import os

root = "."
# Read the preprocessed activities table, then discard every row without an activity type
activities_path = os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv")
activities_all_raw = pd.read_csv(activities_path, low_memory=False)
activities_all_raw = activities_all_raw.loc[activities_all_raw["activity_type"].notna()].reset_index(drop=True)

In [2]:
# 8. Converting synonym activity types to their canonical activity type
synonyms = pd.read_csv(os.path.join(root, "..", "config", "manual_curation", "synonyms.csv"))

# Resolve the renaming on the distinct activity types only, then apply it to the
# full table in one pass instead of scanning every row once per synonym.
# The rules are still applied one after another in file order, so the outcome
# matches rewriting the full column rule by rule.
observed_types = pd.Series(activities_all_raw["activity_type"].unique())
resolved_types = observed_types.copy()
for activity, syns in zip(synonyms["activity"], synonyms["synonyms"]):
    if not isinstance(syns, str):
        # Empty 'synonyms' cell (read as NaN): nothing to rename for this activity
        continue
    for syn in syns.split(";"):
        resolved_types[resolved_types == syn] = activity

# Every value present in the column is a key of the map, so .map leaves no gaps
synonym_map = dict(zip(observed_types, resolved_types))
activities_all_raw["activity_type"] = activities_all_raw["activity_type"].map(synonym_map)

In [3]:
# 9. Keeping only the canonical unit per activity type
# Tally every (activity_type, unit) combination, missing units included, most frequent first
s = activities_all_raw[["activity_type", "unit"]]
out = s.value_counts(subset=["activity_type", "unit"], dropna=False).reset_index(name="count")
out = out.sort_values("count", ascending=False, ignore_index=True)

# Flag the most frequent pair of each activity type with keep = 1, every other pair with keep = 0
idx = out.groupby("activity_type")["count"].idxmax()
out["keep"] = out.index.isin(idx).astype("int64")

# The canonical unit of an activity type is the unit of its flagged pair
canonical = out.loc[out["keep"] == 1, ["activity_type", "unit"]]
canonical = canonical.set_index("activity_type").rename(columns={"unit": "canonical_unit"})
canonical_map = canonical["canonical_unit"].to_dict()
activities_all_raw["canonical_unit"] = activities_all_raw["activity_type"].map(canonical_map)

# Attach the canonical unit to every row of the pair summary
out = out.merge(canonical, how="left", on="activity_type")

# Write the pair summary to disk
pairs_path = os.path.join(root, "..", "config", "chembl_processed", "activity_type_unit_pairs.csv")
out.to_csv(pairs_path, index=False)

In [4]:
# Keep only the rows whose unit is the canonical one for their activity type.
# NaN never compares equal to NaN, so rows where both the unit and the canonical
# unit are missing are matched explicitly.
unit_col = activities_all_raw["unit"]
canonical_col = activities_all_raw["canonical_unit"]
both_missing = unit_col.isna() & canonical_col.isna()
keep_mask = unit_col.eq(canonical_col) | both_missing
a = activities_all_raw.loc[keep_mask].reset_index(drop=True)
len(a)

22486116

In [5]:
# Total number of activity rows covered by the retained (keep == 1) pairs
int(out.loc[out["keep"] == 1, "count"].sum())

22486116

In [6]:
# Total number of activity rows covered by the discarded (keep == 0) pairs
int(out.loc[out["keep"] == 0, "count"].sum())

1554667

In [None]:
# TODO: export the discarded (keep == 0) pairs to CSV; the destination path is not filled in yet.
# out[out['keep'] == 0].reset_index(drop=True).drop(columns={'keep'}).to_csv(os.path.join())