In [1]:
import json
import os
import pandas as pd
import re
from pathlib import Path

In [2]:
# All data paths hang off the parent of the current working directory.
workdir = os.path.dirname(os.getcwd())
input_dir = Path(f"{workdir}/8_drug_category/data/")

# Load the PubChem export, keep three columns by position and drop exact duplicate rows.
# NOTE(review): positions 1, 2 and 39 are presumably cmpdname, cmpdsynonym and annotation
# (the column names later cells read from this frame) - confirm against the CSV header.
drug_classes = (
    pd.read_csv(input_dir / "PubChem_ATC_Code.csv")
    .iloc[:, [1, 2, 39]]
    .drop_duplicates()
)

In [3]:
def extract_drugbank_id(s):
    """Return the first DrugBank accession found in ``s``, or ``None``.

    An accession is the literal ``DB`` followed by exactly five digits,
    delimited by word boundaries on both sides. Missing values (as judged by
    ``pd.isna``) and strings without such a token both give ``None``.
    """
    if not pd.isna(s):
        hit = re.search(r'\bDB\d{5}\b', s)
        if hit is not None:
            return hit[0]
    return None

def extract_atc_level2(annotation):
    """Extract ``(code, label)`` pairs for ATC level-2 entries from an annotation string.

    A level-2 entry is a stand-alone token of one uppercase letter followed by
    exactly two digits (e.g. ``L01``), then a hyphen and its label. The label
    runs up to the next ``>`` or ``|`` separator or the end of the string.

    Parameters
    ----------
    annotation : str or missing value
        Annotation text to scan.

    Returns
    -------
    list of (str, str) tuples, or None
        One tuple per match, in order of appearance, with surrounding
        whitespace removed from the label. ``None`` when ``annotation`` is
        missing (per ``pd.isna``) or contains no match.
    """
    if pd.isna(annotation):
        return None

    # EXACT Level 2 only: the word boundaries reject longer codes such as "L01X" or "C274".
    matches = re.findall(r'\b([A-Z][0-9]{2})\b\s*-\s*([^>|]+)', annotation)

    # The label group runs right up to the next separator, so it still carries
    # the whitespace that precedes " > " / " | ". Strip it so the same label
    # compares equal wherever it appears.
    pairs = [(code, label.strip()) for code, label in matches]

    return pairs or None

In [14]:
# Derive the DrugBank ID and the list of level-2 ATC matches for every compound.
drug_classes["drugbank_id"] = drug_classes["cmpdsynonym"].apply(extract_drugbank_id)
drug_classes["atc_level2"] = drug_classes["annotation"].apply(extract_atc_level2)

# Long format: discard compounds without a match, then one row per (compound, match).
drug_atc_df = drug_classes[["cmpdname", "drugbank_id", "atc_level2"]]
drug_atc_df = drug_atc_df.dropna(subset=["atc_level2"])
drug_atc_df = drug_atc_df.explode("atc_level2")

# Split each (code, label) tuple into its own pair of columns. The helper frame
# reuses the exploded index so the assignment lines up row for row.
drug_atc_df[["atc_code", "atc_label"]] = pd.DataFrame(
    list(drug_atc_df["atc_level2"]),
    index=drug_atc_df.index,
)

# Drop the tuple column, keep only codes shaped like one letter + two digits,
# and discard rows that have no DrugBank ID.
drug_atc_df = (
    drug_atc_df
    .drop(columns="atc_level2")
    .loc[lambda frame: frame["atc_code"].str.match(r'^[A-Z][0-9]{2}$')]
    .dropna(subset=["drugbank_id"])
)

drug_atc_df.to_csv(os.path.join(input_dir, "drugbank_atc_level2.csv"), index=False)


In [13]:
# Preview the level-2 ATC table (cmpdname, drugbank_id, atc_code, atc_label).
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# that builds drug_atc_df (14), so the saved output may predate the latest
# build - re-run after a kernel restart to confirm.
drug_atc_df

Unnamed: 0,cmpdname,drugbank_id,atc_code,atc_label
1,Aminolevulinic Acid,DB00855,L01,Antineoplastic agents
3,Acetic Acid,DB03166,G01,Gynecological antiinfectives and antiseptics
3,Acetic Acid,DB03166,S02,Otologicals
4,Acetylcholine,DB03128,S01,Ophthalmologicals
5,Quinacrine,DB01103,P01,Antiprotozoals
...,...,...,...,...
4017,Copanlisib,DB12483,L01,Antineoplastic agents
4018,Pafolacianine,DB15413,V04,Diagnostic agents
4020,Dabigatran Etexilate,DB06695,B01,Antithrombotic agents
4022,Opicapone,DB11632,N04,Anti-parkinson drugs


In [7]:
## This variant captures all the levels (currently commented out)

# def extract_atc_info(annotation):
#     if pd.isna(annotation):
#         return None
#     matches = re.findall(r'([A-Z][0-9A-Z]{1,6})\s*-\s*([^>|]+)', annotation)
#     return matches if matches else None

# drug_classes["atc_info"] = drug_classes["annotation"].apply(extract_atc_info)

# drug_atc_df = (
#     drug_classes[["cmpdname", "drugbank_id", "atc_info"]]
#     .dropna(subset=["atc_info"])
#     .explode("atc_info")
# )

# drug_atc_df[["atc_code", "atc_label"]] = pd.DataFrame(
#     drug_atc_df["atc_info"].tolist(),
#     index=drug_atc_df.index
# )

# drug_atc_df = drug_atc_df.drop(columns="atc_info")

# drug_atc_df = drug_atc_df.dropna(subset=["drugbank_id"])

# drug_atc_df.to_csv(os.path.join(input_dir, "drugbank_atc.csv"), index = False)
# drug_atc_df


Unnamed: 0,cmpdname,drugbank_id,atc_code,atc_label
1,Aminolevulinic Acid,DB00855,C1420,Photosensitizing Agent
1,Aminolevulinic Acid,DB00855,D003879,Dermatologic Agents
1,Aminolevulinic Acid,DB00855,D011838,Radiation-Sensitizing Agents
1,Aminolevulinic Acid,DB00855,D017319,Photosensitizing Agents
1,Aminolevulinic Acid,DB00855,L01,Antineoplastic agents
...,...,...,...,...
4065,Sotorasib,DB15569,C1902,Ras Inhibitor
4065,Sotorasib,DB15569,C274,Antineoplastic Agent
4065,Sotorasib,DB15569,C2189,Signal Transduction Inhibitor
4065,Sotorasib,DB15569,D000970,Antineoplastic Agents


In [20]:
# drug_atc_df["is_immunosuppressant"] = (drug_atc_df["atc_code"].str.startswith("L04"))
# drug_atc_df.loc[drug_atc_df["is_immunosuppressant"]]

In [21]:
# Export the ATC table as JSON:
#   {drugbank_id: [{"atc_code": ..., "atc_label": ...}, ...]}

# Remove repeated (drug, code, label) rows up front. drop_duplicates keeps the
# first occurrence, so each drug's entries stay in table order. De-duplicating
# through a set would make that order vary from run to run, because string
# hashing is randomized per Python process.
unique_atc_rows = drug_atc_df[["drugbank_id", "atc_code", "atc_label"]].drop_duplicates()

# Group by drugbank_id and collect the unique ATC entries. groupby sorts the
# group keys by default, so drugs come out in ascending drugbank_id order.
grouped = {
    drugbank_id: [
        {"atc_code": code, "atc_label": label}
        for code, label in zip(rows["atc_code"], rows["atc_label"])
    ]
    for drugbank_id, rows in unique_atc_rows.groupby("drugbank_id")
}

# Save to JSON
with open(os.path.join(input_dir, "drugbank_atc.json"), "w", encoding="utf-8") as f:
    json.dump(grouped, f, indent=4)

print("JSON file created.")


JSON file created.
