In [1]:
import csv, os

# Input/output locations. These are machine-specific absolute paths; edit them
# before running the notebook on another workstation.
meta = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/umls/2025AA-full/output/2025AA/meta"
cui_file = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui.txt"
out_csv   = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui_rxnorm_atc.csv"

# pipe-delimited file under `meta` that the loop further down scans
mrconso   = os.path.join(meta, "MRCONSO.RRF")

# Load the CUIs of interest: one identifier per line, blank lines ignored.
# "utf-8-sig" reads plain UTF-8 and additionally drops a leading byte-order
# mark; without that, a BOM would stay glued to the first CUI and that CUI
# would silently never match a row of MRCONSO.
with open(cui_file, encoding="utf-8-sig") as f:
    target = {cui for cui in map(str.strip, f) if cui}

# preference for rxnorm name: a term type earlier in this list wins over a later one
rx_pref = ["PSN","IN","SCD","SBD","MIN","PIN","BN","SY"]

rx_name  = {}                # cui -> (tty, name)
atc_map  = {}                # cui -> set[(code, name)]

def maybe_set_name(cui, tty, name):
    """Record `(tty, name)` for `cui` if it beats the name stored so far.

    The first name seen for a CUI is always stored. A later name replaces it
    only when its term type ranks strictly earlier in `rx_pref`. Term types
    that are not listed in `rx_pref` rank after every listed one, so a name
    stored under an unlisted type can still be upgraded to a listed type
    (previously such an entry could never be replaced), while an unlisted type
    never displaces an existing entry.

    Args:
        cui:  concept identifier used as the key of `rx_name`.
        tty:  term type of the candidate name.
        name: candidate display name.

    Returns:
        None; the module-level dict `rx_name` is updated in place.
    """
    lowest = len(rx_pref)

    def rank(term_type):
        # position in the preference list, or "after everything" when unlisted
        return rx_pref.index(term_type) if term_type in rx_pref else lowest

    cur = rx_name.get(cui)
    if cur is None or rank(tty) < rank(cur[0]):
        rx_name[cui] = (tty, name)

# Single pass over MRCONSO: for every CUI of interest, pick up the preferred
# RxNorm name and collect all (code, name) pairs whose source is ATC.
with open(mrconso, encoding="utf-8") as f:
    for raw in f:
        fields = raw.rstrip("\n").split("|")
        # rows with fewer than 18 pipe-separated fields are skipped
        if len(fields) < 18:
            continue

        cui = fields[0]
        if cui not in target:
            continue
        lat, sab, tty, code, name = fields[1], fields[11], fields[12], fields[13], fields[14]

        if sab == "RXNORM":
            # only english rows whose term type is in the preference list
            if lat == "ENG" and tty in rx_pref:
                maybe_set_name(cui, tty, name)
        elif sab == "ATC":
            # atc codes attached to this cui (no language filter applied)
            atc_map.setdefault(cui, set()).add((code, name))

# Write one row per target CUI, in sorted CUI order. CUIs with no RxNorm name
# or no ATC code still get a row, with the corresponding cells left empty.
# NOTE: codes and labels are de-duplicated and sorted independently of each
# other, so the i-th label is not necessarily the label of the i-th code.
header = ["cui","rxnorm_tty","rxnorm_name","atc_codes","atc_labels"]
with open(out_csv, "w", newline="", encoding="utf-8") as w:
    cw = csv.writer(w)
    cw.writerow(header)
    for cui in sorted(target):
        tty, name = rx_name.get(cui, ("", ""))
        pairs = atc_map.get(cui, ())
        codes = sorted(set(code for code, _ in pairs))
        labels = sorted(set(label for _, label in pairs))
        cw.writerow([cui, tty, name, ";".join(codes), ";".join(labels)])

print(f"✅ wrote: {out_csv}")


✅ wrote: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui_rxnorm_atc.csv


In [4]:
import os, pandas as pd

# Locations for this cell (machine-specific absolute paths).
meta = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/umls/2025AA-full/output/2025AA/meta"
mrconso = os.path.join(meta, "MRCONSO.RRF")
in_csv  = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui_rxnorm_atc.csv"
out_long  = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_long.csv"
out_pretty= "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_pretty.csv"

# Build atc code -> name from every MRCONSO row whose source column is "ATC".
# When a code occurs on several rows, the name from the first row is kept.
code2name = {}
with open(mrconso, encoding="utf-8") as f:
    for raw in f:
        fields = raw.rstrip("\n").split("|")
        if len(fields) < 15 or fields[11] != "ATC":
            continue
        atc_code, atc_label = fields[13], fields[14]
        if atc_code not in code2name:
            code2name[atc_code] = atc_label

def levels_from(code: str, names=None):
    """Split an ATC code into its five hierarchy levels and look up each name.

    Levels 1-4 are the first 1, 3, 4 and 5 characters of the stripped code;
    level 5 is the whole code, and is filled only when the code has at least
    7 characters. A level the code is too short for is returned as "", and a
    code with no entry in the lookup table gets the name "".

    Args:
        code:  ATC code; None and surrounding whitespace are tolerated.
        names: optional mapping of code -> name. Defaults to the module-level
               `code2name` table, so existing one-argument calls behave as
               before; passing a mapping lets the function run without it.

    Returns:
        dict with keys L1_code, L1_name, ..., L5_code, L5_name, in that order.
    """
    if names is None:
        names = code2name
    code = (code or "").strip()

    levels = {}
    # (key prefix, number of leading characters that make up that level)
    for label, width in (("L1", 1), ("L2", 3), ("L3", 4), ("L4", 5)):
        prefix = code[:width] if len(code) >= width else ""
        levels[label + "_code"] = prefix
        levels[label + "_name"] = names.get(prefix, "")

    # level 5 keeps the complete code rather than a fixed-width prefix
    full = code if len(code) >= 7 else ""
    levels["L5_code"] = full
    levels["L5_name"] = names.get(full, "")
    return levels

# read your csv as strings to avoid NaN->float
df = pd.read_csv(in_csv, dtype=str).fillna("")
rows = []

# Column order of the long table. Passing it to the DataFrame constructor keeps
# the header in place even when no CUI carries an ATC code (rows == []).
long_cols = [
    "cui", "rxnorm_tty", "rxnorm_name", "atc_code", "atc_name",
    "L1_code", "L1_name", "L2_code", "L2_name", "L3_code", "L3_name",
    "L4_code", "L4_name", "L5_code", "L5_name",
]

# One output row per (cui, atc code). The per-row RxNorm name is held in
# `rx_label`: the name `rx_name` is left alone because it is the
# cui -> (tty, name) dict that maybe_set_name() reads, and rebinding it to a
# string here would break that function until the first cell is re-run.
for rec in df.to_dict("records"):
    atc_field = str(rec.get("atc_codes", "") or "")
    codes = [c.strip() for c in atc_field.split(";") if c.strip()]
    if not codes:
        # CUIs without any ATC code do not appear in the long table
        continue

    cui = rec.get("cui", "")
    rx_tty = rec.get("rxnorm_tty", "")
    rx_label = rec.get("rxnorm_name", "")
    for code in codes:
        rows.append({
            "cui": cui,
            "rxnorm_tty": rx_tty,
            "rxnorm_name": rx_label,
            "atc_code": code,
            "atc_name": code2name.get(code, ""),
            **levels_from(code),
        })

long_df = (
    pd.DataFrame(rows, columns=long_cols)
      .drop_duplicates()
      .reset_index(drop=True)
)
long_df.to_csv(out_long, index=False)

def ladder_text(lv):
    """Render the populated levels of `lv` as "<code> <name>" lines.

    `lv` is indexed with the keys L1_code/L1_name ... L5_code/L5_name. Levels
    are emitted in order 1 to 5; a level whose code is empty is left out. The
    lines are joined with a newline, and the result is "" when no level has a
    code.
    """
    lines = []
    for n in range(1, 6):
        level_code = lv["L%d_code" % n]
        if not level_code:
            continue
        level_name = lv["L%d_name" % n]
        lines.append(f"{level_code} {level_name}")
    return "\n".join(lines)

# Compact view: one "ladder" cell per (cui, atc code) holding the level lines
# produced by ladder_text(); only four columns are kept and duplicates dropped.
level_cols = [f"L{n}_{part}" for n in range(1, 6) for part in ("code", "name")]
ladders = long_df.apply(
    lambda row: ladder_text({col: row[col] for col in level_cols}),
    axis=1,
)
pretty = (
    long_df
    .assign(ladder=ladders)
    [["cui","rxnorm_name","atc_code","ladder"]]
    .drop_duplicates()
)

pretty.to_csv(out_pretty, index=False)

print("wrote:")
for written in (out_long, out_pretty):
    print(" -", written)

display(long_df.head(20))


wrote:
 - /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_long.csv
 - /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_pretty.csv


Unnamed: 0,cui,rxnorm_tty,rxnorm_name,atc_code,atc_name,L1_code,L1_name,L2_code,L2_name,L3_code,L3_name,L4_code,L4_name,L5_code,L5_name
0,C0000378,IN,droxidopa,C01CA27,droxidopa,C,CARDIOVASCULAR SYSTEM DRUGS,C01,CARDIAC THERAPY DRUGS,C01C,CARDIAC STIMULANTS EXCL. CARDIAC GLYCOSIDES,C01CA,Adrenergic and dopaminergic cardiac stimulants,C01CA27,droxidopa
1,C0000402,,,C10AX05,meglutol,C,CARDIOVASCULAR SYSTEM DRUGS,C10,LIPID MODIFYING AGENTS,C10A,"LIPID MODIFYING AGENTS, PLAIN",C10AX,Other lipid modifying agents in ATC,C10AX05,meglutol
2,C0000473,IN,4-aminobenzoic acid,D02BA01,aminobenzoic acid,D,DERMATOLOGICALS,D02,EMOLLIENTS AND PROTECTIVES,D02B,PROTECTIVES AGAINST UV-RADIATION,D02BA,Protectives against UV-radiation for topical use,D02BA01,aminobenzoic acid
3,C0000477,IN,dalfampridine,N07XX07,fampridine,N,NERVOUS SYSTEM DRUGS,N07,OTHER NERVOUS SYSTEM DRUGS in ATC,N07X,OTHER NERVOUS SYSTEM DRUGS in ATC,N07XX,Other nervous system drugs in ATC,N07XX07,fampridine
4,C0000578,IN,5-hydroxytryptophan,N06AX01,oxitriptan,N,NERVOUS SYSTEM DRUGS,N06,PSYCHOANALEPTICS,N06A,ANTIDEPRESSANTS,N06AX,Other antidepressants in ATC,N06AX01,oxitriptan
5,C0000608,IN,6-aminocaproic acid,B02AA01,aminocaproic acid,B,BLOOD AND BLOOD FORMING ORGAN DRUGS,B02,ANTIHEMORRHAGICS,B02A,ANTIFIBRINOLYTICS,B02AA,Antifibrinolytic amino acids,B02AA01,aminocaproic acid
6,C0000618,IN,mercaptopurine,L01BB02,mercaptopurine,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L01,ANTINEOPLASTIC AGENTS,L01B,ANTIMETABOLITES,L01BB,"Purine analogs, antimetabolites antineoplastic",L01BB02,mercaptopurine
7,C0000665,IN,oxyquinoline,A01AB07,oxyquinoline,A,ALIMENTARY TRACT AND METABOLISM DRUGS,A01,STOMATOLOGICAL PREPARATIONS,A01A,STOMATOLOGICAL PREPARATIONS,A01AB,Antiinfectives and antiseptics for local oral ...,A01AB07,oxyquinoline
8,C0000665,IN,oxyquinoline,D08AH03,oxyquinoline,D,DERMATOLOGICALS,D08,ANTISEPTICS AND DISINFECTANTS,D08A,ANTISEPTICS AND DISINFECTANTS,D08AH,"Quinoline derivatives, antiseptics and disinfe...",D08AH03,oxyquinoline
9,C0000665,IN,oxyquinoline,G01AC30,oxyquinoline,G,GENITO URINARY SYSTEM AND SEX HORMONES,G01,GYNECOLOGICAL ANTIINFECTIVES AND ANTISEPTICS,G01A,"ANTIINFECTIVES AND ANTISEPTICS, EXCL. COMBINAT...",G01AC,Quinoline gynecological antiinfectives,G01AC30,oxyquinoline


In [5]:
import pandas as pd

# path to your file
file_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs.csv"

# load
df = pd.read_csv(file_path)

# "type" is not needed for the count. errors="ignore" makes the drop a no-op
# when the column is absent instead of raising KeyError, matching how the same
# column is dropped from the PRR table later in the notebook.
df = df.drop(columns=["type"], errors="ignore")

# count unique pmids per cui, with the count column renamed for clarity
counts = (
    df.groupby("cui")["pmid"]
      .nunique()
      .reset_index()
      .rename(columns={"pmid": "unique_pub_count"})
)

# save to same folder
out_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs_counts.csv"
counts.to_csv(out_path, index=False)

counts.head()


Unnamed: 0,cui,unique_pub_count
0,C0000334,80
1,C0000378,8
2,C0000379,16
3,C0000392,70
4,C0000402,42


In [6]:
import pandas as pd

# inputs written by the two previous cells
counts_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs_counts.csv"
atc_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_long.csv"

counts = pd.read_csv(counts_path)
atc = pd.read_csv(atc_path)

# Left join on CUI keeps every ATC row; a CUI with no entry in the counts
# table gets a publication count of 0, and the column is stored as int.
merged = (
    pd.merge(atc, counts, on="cui", how="left")
      .assign(unique_pub_count=lambda d: d["unique_pub_count"].fillna(0).astype(int))
)

# save out
out_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs_atc_merged.csv"
merged.to_csv(out_path, index=False)

merged.head()


Unnamed: 0,cui,rxnorm_tty,rxnorm_name,atc_code,atc_name,L1_code,L1_name,L2_code,L2_name,L3_code,L3_name,L4_code,L4_name,L5_code,L5_name,unique_pub_count
0,C0000378,IN,droxidopa,C01CA27,droxidopa,C,CARDIOVASCULAR SYSTEM DRUGS,C01,CARDIAC THERAPY DRUGS,C01C,CARDIAC STIMULANTS EXCL. CARDIAC GLYCOSIDES,C01CA,Adrenergic and dopaminergic cardiac stimulants,C01CA27,droxidopa,8
1,C0000402,,,C10AX05,meglutol,C,CARDIOVASCULAR SYSTEM DRUGS,C10,LIPID MODIFYING AGENTS,C10A,"LIPID MODIFYING AGENTS, PLAIN",C10AX,Other lipid modifying agents in ATC,C10AX05,meglutol,42
2,C0000473,IN,4-aminobenzoic acid,D02BA01,aminobenzoic acid,D,DERMATOLOGICALS,D02,EMOLLIENTS AND PROTECTIVES,D02B,PROTECTIVES AGAINST UV-RADIATION,D02BA,Protectives against UV-radiation for topical use,D02BA01,aminobenzoic acid,56
3,C0000477,IN,dalfampridine,N07XX07,fampridine,N,NERVOUS SYSTEM DRUGS,N07,OTHER NERVOUS SYSTEM DRUGS in ATC,N07X,OTHER NERVOUS SYSTEM DRUGS in ATC,N07XX,Other nervous system drugs in ATC,N07XX07,fampridine,18
4,C0000578,IN,5-hydroxytryptophan,N06AX01,oxitriptan,N,NERVOUS SYSTEM DRUGS,N06,PSYCHOANALEPTICS,N06A,ANTIDEPRESSANTS,N06AX,Other antidepressants in ATC,N06AX01,oxitriptan,21


In [None]:
import os, re, unicodedata, pandas as pd

# --- paths (machine-specific absolute locations)
base = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform"
meta = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/umls/2025AA-full/output/2025AA/meta"

ped_atc = f"{base}/pedpubs_atc_merged.csv"         # from previous step
prr_file= f"{base}/final_filtered_data_with_prr_keywords_excluded_with_abcd.txt"
mrconso  = f"{meta}/MRCONSO.RRF"

# --- load data: everything as strings, missing cells as ""
atc = pd.read_csv(ped_atc, dtype=str).fillna("")

# The PRR file is "$"-delimited. Its DRUG column is renamed to "drug" and a
# "type" column, when present, is dropped; all other columns are kept.
prr = (
    pd.read_csv(prr_file, sep="$", dtype=str)
      .fillna("")
      .rename(columns={"DRUG":"drug"})
      .drop(columns=["type"], errors="ignore")
)

# every CUI that appears in the ATC table
cuis_of_interest = set(atc["cui"])

# --- normalization helpers
_PUNCT = re.compile(r"[^a-z0-9 ]+")   # anything other than a-z, 0-9 and space
_WS    = re.compile(r"\s+")           # runs of whitespace

def normalize(s: str) -> str:
    """Reduce a drug name to a lowercase ASCII key for exact comparison.

    Steps, in order: NFKD-decompose and drop every non-ASCII character,
    lowercase, turn "&" into " and ", replace each run of characters outside
    a-z / 0-9 / space with one space, collapse whitespace and trim the ends.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    text = ascii_only.lower().replace("&", " and ")
    text = _PUNCT.sub(" ", text)
    return _WS.sub(" ", text).strip()

# --- build name variants from MRCONSO for just our CUIs (RxNorm English)
rx_ttys = {"PSN","IN","PIN","MIN","BN","SCD","SBD","SY"}  # generous
name_to_cuis = {}      # normalized name -> set(CUI)
cui_to_names = {}      # CUI -> list of raw names

with open(mrconso, encoding="utf-8") as f:
    for raw in f:
        fields = raw.rstrip("\n").split("|")
        if len(fields) < 15:
            continue
        cui, lat, sab, tty, name = fields[0], fields[1], fields[11], fields[12], fields[14]

        # keep only english RXNORM rows of an accepted term type for our CUIs
        wanted = (
            cui in cuis_of_interest
            and sab == "RXNORM"
            and lat == "ENG"
            and tty in rx_ttys
        )
        if not wanted:
            continue

        key = normalize(name)
        if key:
            # names that normalize to "" are not indexed at all
            name_to_cuis.setdefault(key, set()).add(cui)
            cui_to_names.setdefault(cui, []).append(name)

# --- pass 1: exact normalized match
prr["drug_norm"] = prr["drug"].map(normalize)

# For each row: the CUI(s) whose indexed name equals the normalized drug name,
# joined with ";" in sorted order. One hit is labelled "exact", several
# "exact_ambiguous", none leaves both cells empty.
match_cui = []
match_type = []
for key in prr["drug_norm"]:
    hits = sorted(name_to_cuis.get(key, ()))
    match_cui.append(";".join(hits))
    if len(hits) == 1:
        match_type.append("exact")
    elif hits:
        match_type.append("exact_ambiguous")
    else:
        match_type.append("")

prr["match_cui"] = match_cui
prr["match_type"] = match_type

# rows still without a CUI go on to the next pass
unmatched = prr.loc[prr["match_cui"].eq("")].copy()

# --- pass 2: split combo drugs and try exact on each piece
DELIMS = r"[\/\\\+\-,;:]"
def split_combo(s):
    """Split a raw drug string on the DELIMS characters and normalize the parts.

    Each piece is passed through normalize() exactly once (the earlier version
    normalized every piece twice, once to filter and once for the value).
    Pieces that normalize to "" are dropped.

    Returns:
        list of normalized, non-empty pieces in their original order.
    """
    pieces = (normalize(token) for token in re.split(DELIMS, s))
    return [piece for piece in pieces if piece]

# For every row left over from pass 1, look up each piece of the split drug
# string and pool the CUIs found; a row qualifies as soon as any piece hits.
split_matches = []
for idx, drug in unmatched["drug"].items():
    part_hits = set().union(
        *(name_to_cuis.get(piece, ()) for piece in split_combo(drug))
    )
    if part_hits:
        split_matches.append((idx, ";".join(sorted(part_hits))))

# apply: write the pooled CUIs back to the main table by row label
for idx, cuis in split_matches:
    prr.loc[idx, "match_cui"] = cuis
    prr.loc[idx, "match_type"] = "split_exact"

# --- pass 3: fuzzy (token set) using rapidfuzz
# Only a missing package should switch the pass off; any other failure while
# importing is left to surface instead of being reported as "not installed".
try:
    from rapidfuzz import process, fuzz
    do_fuzzy = True
except ImportError:
    do_fuzzy = False
    print("rapidfuzz not installed; skipping fuzzy pass. pip install rapidfuzz to enable.")

FUZZY_MIN_SCORE = 92  # threshold; adjust if needed

if do_fuzzy:
    cand_names = list(name_to_cuis.keys())  # normalized names
    # search only remaining unmatched rows
    still = prr[prr["match_cui"]==""].copy()

    # Score each distinct drug name once and reuse the result for every row
    # that carries it, rather than repeating the search row by row. Passing
    # the threshold as score_cutoff lets rapidfuzz discard weaker candidates
    # itself; extractOne then returns None when nothing reaches the threshold.
    # NOTE(review): token_set_ratio appears to score 100 when one name's tokens
    # are a subset of the other's (e.g. a single shared word), so short
    # queries may match longer names - confirm the scorer is the intended one.
    fuzzy_hits = {}   # drug_norm -> (";"-joined CUIs, match_type label)
    for q in still["drug_norm"].unique():
        if not q:
            continue
        best = process.extractOne(
            q, cand_names, scorer=fuzz.token_set_ratio, score_cutoff=FUZZY_MIN_SCORE
        )
        if best is None:
            continue
        hit_norm, score = best[0], best[1]
        cuis = name_to_cuis.get(hit_norm, set())
        if cuis:
            # the score is embedded in match_type as fuzzy_SCORE
            fuzzy_hits[q] = (";".join(sorted(cuis)), f"fuzzy_{score}")

    # write the hits back; assignment aligns on the row labels of `still`
    hit_norms = still.loc[still["drug_norm"].isin(list(fuzzy_hits)), "drug_norm"]
    prr.loc[hit_norms.index, "match_cui"] = hit_norms.map(lambda q: fuzzy_hits[q][0])
    prr.loc[hit_norms.index, "match_type"] = hit_norms.map(lambda q: fuzzy_hits[q][1])

# --- expand multi-CUI matches into rows (each CUI gets the PRR record)
def explode_cuis(df):
    """Return the matched rows of `df` with one row per CUI.

    Rows whose "match_cui" is "" are dropped. The remaining ";"-joined values
    are split into a new "cui" column and exploded, so a row matching several
    CUIs is repeated once per CUI (keeping its original index label). The
    "match_cui" column is removed from the result; `df` itself is not modified.
    """
    hits = df.loc[df["match_cui"] != ""]
    return (
        hits.assign(cui=hits["match_cui"].str.split(";"))
            .explode("cui")
            .drop(columns=["match_cui"])
    )

# rows that no pass could match are kept aside for manual review
unmatched_final = prr.loc[prr["match_cui"].eq("")].copy()

# --- attach counts/ATC: join on CUI
counts = pd.read_csv(f"{base}/pedpubs_counts.csv", dtype={"cui":str})
atc_long = pd.read_csv(f"{base}/atc_hierarchy_long.csv", dtype={"cui":str})

# One row per (PRR record, CUI), then left joins so a matched record is kept
# even when its CUI has no publication count or ATC row.
matched = (
    explode_cuis(prr)
    .merge(counts, on="cui", how="left")
    .merge(atc_long, on="cui", how="left")
)

# --- save outputs: matched records and the rows left for manual review
out_all   = f"{base}/prr_matched_with_cui_atc.csv"
out_unmat = f"{base}/prr_unmatched_for_review.csv"
for frame, path in ((matched, out_all), (unmatched_final, out_unmat)):
    frame.to_csv(path, index=False)

print("saved:")
for path in (out_all, out_unmat):
    print(" -", path)

matched.head(10)


In [8]:
pip install rapidfuzz

Defaulting to user installation because normal site-packages is not writeable
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.
