In [1]:
import csv, os

# set your paths
meta = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/umls/2025AA-full/output/2025AA/meta"
cui_file = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui.txt"
out_csv   = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui_rxnorm_atc.csv"

mrconso   = os.path.join(meta, "MRCONSO.RRF")

# load cuis of interest
with open(cui_file) as f:
    target = {line.strip() for line in f if line.strip()}

# preference for rxnorm name
rx_pref = ["PSN","IN","SCD","SBD","MIN","PIN","BN","SY"]

rx_name  = {}                # cui -> (tty, name)
atc_map  = {}                # cui -> set[(code, name)]

def maybe_set_name(cui, tty, name):
    cur = rx_name.get(cui)
    if cur is None:
        rx_name[cui] = (tty, name)
    else:
        try:
            if rx_pref.index(tty) < rx_pref.index(cur[0]):
                rx_name[cui] = (tty, name)
        except ValueError:
            pass

with open(mrconso, encoding="utf-8") as f:
    for line in f:
        cols = line.rstrip("\n").split("|")
        if len(cols) < 18: 
            continue
        cui, lat, sab, tty, code, name = cols[0], cols[1], cols[11], cols[12], cols[13], cols[14]
        if cui not in target: 
            continue

        # rxnorm preferred english name
        if sab == "RXNORM" and lat == "ENG":
            if tty in rx_pref:
                maybe_set_name(cui, tty, name)

        # atc codes attached to this cui
        if sab == "ATC":
            atc_map.setdefault(cui, set()).add((code, name))

# write output
with open(out_csv, "w", newline="", encoding="utf-8") as w:
    cw = csv.writer(w)
    cw.writerow(["cui","rxnorm_tty","rxnorm_name","atc_codes","atc_labels"])
    for cui in sorted(target):
        tty,name = rx_name.get(cui, ("",""))
        codes  = ";".join(sorted({c for c,_ in atc_map.get(cui, set())}))
        labels = ";".join(sorted({n for _,n in atc_map.get(cui, set())}))
        cw.writerow([cui, tty, name, codes, labels])

print(f"✅ wrote: {out_csv}")


✅ wrote: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui_rxnorm_atc.csv


In [4]:
import os, pandas as pd

meta = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/umls/2025AA-full/output/2025AA/meta"
mrconso = os.path.join(meta, "MRCONSO.RRF")
in_csv  = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cui_rxnorm_atc.csv"
out_long  = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_long.csv"
out_pretty= "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_pretty.csv"

# build atc code -> name
code2name = {}
with open(mrconso, encoding="utf-8") as f:
    for line in f:
        cols = line.rstrip("\n").split("|")
        if len(cols) >= 15 and cols[11] == "ATC":
            code2name.setdefault(cols[13], cols[14])

def levels_from(code: str):
    code = (code or "").strip()
    L1 = code[:1] if len(code) >= 1 else ""
    L2 = code[:3] if len(code) >= 3 else ""
    L3 = code[:4] if len(code) >= 4 else ""
    L4 = code[:5] if len(code) >= 5 else ""
    L5 = code if len(code) >= 7 else ""
    return {
        "L1_code": L1, "L1_name": code2name.get(L1, ""),
        "L2_code": L2, "L2_name": code2name.get(L2, ""),
        "L3_code": L3, "L3_name": code2name.get(L3, ""),
        "L4_code": L4, "L4_name": code2name.get(L4, ""),
        "L5_code": L5, "L5_name": code2name.get(L5, ""),
    }

# read your csv as strings to avoid NaN->float
df = pd.read_csv(in_csv, dtype=str).fillna("")
rows = []

for _, r in df.iterrows():
    cui = r.get("cui", "")
    rx_tty = r.get("rxnorm_tty", "")
    rx_name = r.get("rxnorm_name", "")
    atc_field = str(r.get("atc_codes", "") or "")
    if not atc_field.strip():
        continue

    for code in [c.strip() for c in atc_field.split(";") if c and c.strip()]:
        lv = levels_from(code)
        rows.append({
            "cui": cui,
            "rxnorm_tty": rx_tty,
            "rxnorm_name": rx_name,
            "atc_code": code,
            "atc_name": code2name.get(code, ""),
            **lv
        })

long_df = pd.DataFrame(rows).drop_duplicates().reset_index(drop=True)
long_df.to_csv(out_long, index=False)

def ladder_text(lv):
    parts = []
    if lv["L1_code"]: parts.append(f'{lv["L1_code"]} {lv["L1_name"]}')
    if lv["L2_code"]: parts.append(f'{lv["L2_code"]} {lv["L2_name"]}')
    if lv["L3_code"]: parts.append(f'{lv["L3_code"]} {lv["L3_name"]}')
    if lv["L4_code"]: parts.append(f'{lv["L4_code"]} {lv["L4_name"]}')
    if lv["L5_code"]: parts.append(f'{lv["L5_code"]} {lv["L5_name"]}')
    return "\n".join(parts)

pretty = (long_df
          .assign(ladder=lambda d: d.apply(lambda r: ladder_text({
              "L1_code": r["L1_code"], "L1_name": r["L1_name"],
              "L2_code": r["L2_code"], "L2_name": r["L2_name"],
              "L3_code": r["L3_code"], "L3_name": r["L3_name"],
              "L4_code": r["L4_code"], "L4_name": r["L4_name"],
              "L5_code": r["L5_code"], "L5_name": r["L5_name"],
          }), axis=1))
          [["cui","rxnorm_name","atc_code","ladder"]]
          .drop_duplicates())

pretty.to_csv(out_pretty, index=False)

print("wrote:")
print(" -", out_long)
print(" -", out_pretty)

display(long_df.head(20))


wrote:
 - /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_long.csv
 - /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_pretty.csv


Unnamed: 0,cui,rxnorm_tty,rxnorm_name,atc_code,atc_name,L1_code,L1_name,L2_code,L2_name,L3_code,L3_name,L4_code,L4_name,L5_code,L5_name
0,C0000378,IN,droxidopa,C01CA27,droxidopa,C,CARDIOVASCULAR SYSTEM DRUGS,C01,CARDIAC THERAPY DRUGS,C01C,CARDIAC STIMULANTS EXCL. CARDIAC GLYCOSIDES,C01CA,Adrenergic and dopaminergic cardiac stimulants,C01CA27,droxidopa
1,C0000402,,,C10AX05,meglutol,C,CARDIOVASCULAR SYSTEM DRUGS,C10,LIPID MODIFYING AGENTS,C10A,"LIPID MODIFYING AGENTS, PLAIN",C10AX,Other lipid modifying agents in ATC,C10AX05,meglutol
2,C0000473,IN,4-aminobenzoic acid,D02BA01,aminobenzoic acid,D,DERMATOLOGICALS,D02,EMOLLIENTS AND PROTECTIVES,D02B,PROTECTIVES AGAINST UV-RADIATION,D02BA,Protectives against UV-radiation for topical use,D02BA01,aminobenzoic acid
3,C0000477,IN,dalfampridine,N07XX07,fampridine,N,NERVOUS SYSTEM DRUGS,N07,OTHER NERVOUS SYSTEM DRUGS in ATC,N07X,OTHER NERVOUS SYSTEM DRUGS in ATC,N07XX,Other nervous system drugs in ATC,N07XX07,fampridine
4,C0000578,IN,5-hydroxytryptophan,N06AX01,oxitriptan,N,NERVOUS SYSTEM DRUGS,N06,PSYCHOANALEPTICS,N06A,ANTIDEPRESSANTS,N06AX,Other antidepressants in ATC,N06AX01,oxitriptan
5,C0000608,IN,6-aminocaproic acid,B02AA01,aminocaproic acid,B,BLOOD AND BLOOD FORMING ORGAN DRUGS,B02,ANTIHEMORRHAGICS,B02A,ANTIFIBRINOLYTICS,B02AA,Antifibrinolytic amino acids,B02AA01,aminocaproic acid
6,C0000618,IN,mercaptopurine,L01BB02,mercaptopurine,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L01,ANTINEOPLASTIC AGENTS,L01B,ANTIMETABOLITES,L01BB,"Purine analogs, antimetabolites antineoplastic",L01BB02,mercaptopurine
7,C0000665,IN,oxyquinoline,A01AB07,oxyquinoline,A,ALIMENTARY TRACT AND METABOLISM DRUGS,A01,STOMATOLOGICAL PREPARATIONS,A01A,STOMATOLOGICAL PREPARATIONS,A01AB,Antiinfectives and antiseptics for local oral ...,A01AB07,oxyquinoline
8,C0000665,IN,oxyquinoline,D08AH03,oxyquinoline,D,DERMATOLOGICALS,D08,ANTISEPTICS AND DISINFECTANTS,D08A,ANTISEPTICS AND DISINFECTANTS,D08AH,"Quinoline derivatives, antiseptics and disinfe...",D08AH03,oxyquinoline
9,C0000665,IN,oxyquinoline,G01AC30,oxyquinoline,G,GENITO URINARY SYSTEM AND SEX HORMONES,G01,GYNECOLOGICAL ANTIINFECTIVES AND ANTISEPTICS,G01A,"ANTIINFECTIVES AND ANTISEPTICS, EXCL. COMBINAT...",G01AC,Quinoline gynecological antiinfectives,G01AC30,oxyquinoline


In [5]:
import pandas as pd

# path to your file
file_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs.csv"

# load
df = pd.read_csv(file_path)

# drop "type" since you don't need it
df = df.drop(columns=["type"])

# count unique pmids per cui
counts = df.groupby("cui")["pmid"].nunique().reset_index()

# rename for clarity
counts = counts.rename(columns={"pmid": "unique_pub_count"})

# save to same folder
out_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs_counts.csv"
counts.to_csv(out_path, index=False)

counts.head()


Unnamed: 0,cui,unique_pub_count
0,C0000334,80
1,C0000378,8
2,C0000379,16
3,C0000392,70
4,C0000402,42


In [6]:
import pandas as pd

# paths
counts_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs_counts.csv"
atc_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/atc_hierarchy_long.csv"

# load both
counts = pd.read_csv(counts_path)
atc = pd.read_csv(atc_path)

# merge on CUI
merged = atc.merge(counts, on="cui", how="left")

# fill NaN counts with 0 (if some CUIs had no publications)
merged["unique_pub_count"] = merged["unique_pub_count"].fillna(0).astype(int)

# save out
out_path = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/pedpubs_atc_merged.csv"
merged.to_csv(out_path, index=False)

merged.head()


Unnamed: 0,cui,rxnorm_tty,rxnorm_name,atc_code,atc_name,L1_code,L1_name,L2_code,L2_name,L3_code,L3_name,L4_code,L4_name,L5_code,L5_name,unique_pub_count
0,C0000378,IN,droxidopa,C01CA27,droxidopa,C,CARDIOVASCULAR SYSTEM DRUGS,C01,CARDIAC THERAPY DRUGS,C01C,CARDIAC STIMULANTS EXCL. CARDIAC GLYCOSIDES,C01CA,Adrenergic and dopaminergic cardiac stimulants,C01CA27,droxidopa,8
1,C0000402,,,C10AX05,meglutol,C,CARDIOVASCULAR SYSTEM DRUGS,C10,LIPID MODIFYING AGENTS,C10A,"LIPID MODIFYING AGENTS, PLAIN",C10AX,Other lipid modifying agents in ATC,C10AX05,meglutol,42
2,C0000473,IN,4-aminobenzoic acid,D02BA01,aminobenzoic acid,D,DERMATOLOGICALS,D02,EMOLLIENTS AND PROTECTIVES,D02B,PROTECTIVES AGAINST UV-RADIATION,D02BA,Protectives against UV-radiation for topical use,D02BA01,aminobenzoic acid,56
3,C0000477,IN,dalfampridine,N07XX07,fampridine,N,NERVOUS SYSTEM DRUGS,N07,OTHER NERVOUS SYSTEM DRUGS in ATC,N07X,OTHER NERVOUS SYSTEM DRUGS in ATC,N07XX,Other nervous system drugs in ATC,N07XX07,fampridine,18
4,C0000578,IN,5-hydroxytryptophan,N06AX01,oxitriptan,N,NERVOUS SYSTEM DRUGS,N06,PSYCHOANALEPTICS,N06A,ANTIDEPRESSANTS,N06AX,Other antidepressants in ATC,N06AX01,oxitriptan,21


In [1]:
import os, re, unicodedata, pandas as pd

# --- paths
base = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform"
meta = "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/umls/2025AA-full/output/2025AA/meta"

ped_atc = f"{base}/pedpubs_atc_merged.csv"         # from previous step
prr_file= f"{base}/final_filtered_data_with_prr_keywords_excluded_with_abcd.txt"
mrconso  = f"{meta}/MRCONSO.RRF"

# --- load data
atc = pd.read_csv(ped_atc, dtype=str).fillna("")
prr = pd.read_csv(prr_file, sep="$", dtype=str).fillna("")

# keep only the columns we need
prr = prr.rename(columns={"DRUG":"drug"})
if "type" in prr.columns:
    prr = prr.drop(columns=["type"], errors="ignore")

cuis_of_interest = set(atc["cui"].unique())

# --- normalization helpers
_PUNCT = re.compile(r"[^a-z0-9 ]+")
_WS    = re.compile(r"\s+")

def normalize(s: str) -> str:
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = s.lower()
    s = s.replace("&", " and ")
    s = _PUNCT.sub(" ", s)
    s = _WS.sub(" ", s).strip()
    return s

# --- build name variants from MRCONSO for just our CUIs (RxNorm English)
rx_ttys = {"PSN","IN","PIN","MIN","BN","SCD","SBD","SY"}  # generous
name_to_cuis = {}      # normalized name -> set(CUI)
cui_to_names = {}      # CUI -> list of raw names

with open(mrconso, encoding="utf-8") as f:
    for line in f:
        cols = line.rstrip("\n").split("|")
        if len(cols) < 15: 
            continue
        cui, lat, sab, tty, name = cols[0], cols[1], cols[11], cols[12], cols[14]
        if cui not in cuis_of_interest: 
            continue
        if sab != "RXNORM" or lat != "ENG" or tty not in rx_ttys:
            continue
        n = normalize(name)
        if not n: 
            continue
        name_to_cuis.setdefault(n, set()).add(cui)
        cui_to_names.setdefault(cui, []).append(name)

# --- pass 1: exact normalized match
prr["drug_norm"] = prr["drug"].map(normalize)
match_cui = []
match_type= []

for s in prr["drug_norm"]:
    cuis = name_to_cuis.get(s, set())
    if len(cuis) == 1:
        match_cui.append(next(iter(cuis)))
        match_type.append("exact")
    elif len(cuis) > 1:
        match_cui.append(";".join(sorted(cuis)))
        match_type.append("exact_ambiguous")
    else:
        match_cui.append("")
        match_type.append("")

prr["match_cui"] = match_cui
prr["match_type"] = match_type

unmatched = prr[prr["match_cui"]==""].copy()

# --- pass 2: split combo drugs and try exact on each piece
DELIMS = r"[\/\\\+\-,;:]"
def split_combo(s):
    return [normalize(t) for t in re.split(DELIMS, s) if normalize(t)]

split_matches = []
for idx, row in unmatched.iterrows():
    parts = split_combo(row["drug"])
    part_hits = set()
    for p in parts:
        part_hits |= name_to_cuis.get(p, set())
    if part_hits:
        split_matches.append((idx, ";".join(sorted(part_hits))))
        
# apply
for idx, cuis in split_matches:
    prr.loc[idx, "match_cui"]  = cuis
    prr.loc[idx, "match_type"] = "split_exact"

# --- pass 3: fuzzy (token set) using rapidfuzz
try:
    from rapidfuzz import process, fuzz
    do_fuzzy = True
except Exception:
    do_fuzzy = False
    print("rapidfuzz not installed; skipping fuzzy pass. pip install rapidfuzz to enable.")

if do_fuzzy:
    # candidate list = all variant names (raw + normalized map)
    cand_names = list(name_to_cuis.keys())  # normalized names
    # search only remaining unmatched rows
    still = prr[prr["match_cui"]==""].copy()
    for idx, row in still.iterrows():
        q = row["drug_norm"]
        if not q: 
            continue
        best = process.extractOne(
            q, cand_names, scorer=fuzz.token_set_ratio
        )
        if best and best[1] >= 92:  # threshold; adjust if needed
            hit_norm = best[0]
            cuis = name_to_cuis.get(hit_norm, set())
            if cuis:
                prr.loc[idx, "match_cui"]  = ";".join(sorted(cuis))
                prr.loc[idx, "match_type"] = f"fuzzy_{best[1]}"
    # optional: keep a score column
    # (already embedded in match_type as fuzzy_SCORE)

# --- expand multi-CUI matches into rows (each CUI gets the PRR record)
def explode_cuis(df):
    df = df[df["match_cui"]!=""].copy()
    df["cui"] = df["match_cui"].str.split(";")
    return df.explode("cui").drop(columns=["match_cui"])

matched = explode_cuis(prr)
unmatched_final = prr[prr["match_cui"]==""].copy()

# --- attach counts/ATC: join on CUI
counts = pd.read_csv(f"{base}/pedpubs_counts.csv", dtype={"cui":str})
atc_long = pd.read_csv(f"{base}/atc_hierarchy_long.csv", dtype={"cui":str})

matched = matched.merge(counts, on="cui", how="left")
matched = matched.merge(atc_long, on="cui", how="left")

# --- save outputs
out_all   = f"{base}/prr_matched_with_cui_atc.csv"
out_unmat = f"{base}/prr_unmatched_for_review.csv"
matched.to_csv(out_all, index=False)
unmatched_final.to_csv(out_unmat, index=False)

print("saved:")
print(" -", out_all)
print(" -", out_unmat)

matched.head(10)


saved:
 - /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/prr_matched_with_cui_atc.csv
 - /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/prr_unmatched_for_review.csv


Unnamed: 0,drug,ADE,Age,Gender,A,B,C,D,PRR,drug_norm,...,L1_code,L1_name,L2_code,L2_name,L3_code,L3_name,L4_code,L4_name,L5_code,L5_name
0,ABACAVIR\LAMIVUDINE,myoclonic epilepsy,0-1 month,F,1,4,0,6007,1502.75,abacavir lamivudine,...,J,ANTIINFECTIVES FOR SYSTEMIC USE,J05,ANTIVIRALS FOR SYSTEMIC USE,J05A,DIRECT ACTING ANTIVIRALS,J05AF,Nucleoside and nucleotide reverse transcriptas...,J05AF05,lamivudine
1,ABACAVIR\LAMIVUDINE,myoclonic epilepsy,0-1 month,F,1,4,0,6007,1502.75,abacavir lamivudine,...,J,ANTIINFECTIVES FOR SYSTEMIC USE,J05,ANTIVIRALS FOR SYSTEMIC USE,J05A,DIRECT ACTING ANTIVIRALS,J05AF,Nucleoside and nucleotide reverse transcriptas...,J05AF06,abacavir
2,ACETAMINOPHEN,atrial septal defect,0-1 month,F,1,19,13,5979,22.548872180451127,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
3,ACETAMINOPHEN,caesarean section,0-1 month,F,1,9,13,5989,47.6031746031746,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
4,ACETAMINOPHEN,extravasation,0-1 month,F,1,5,13,5993,85.68571428571428,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
5,ACETAMINOPHEN,extremity necrosis,0-1 month,F,1,5,13,5993,85.68571428571428,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
6,ACETAMINOPHEN,foetal growth restriction,0-1 month,F,1,67,13,5931,6.394456289978677,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
7,ACETAMINOPHEN,foetal heart rate abnormal,0-1 month,F,1,2,13,5996,214.2142857142857,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
8,ACETAMINOPHEN,haemangioma,0-1 month,F,1,8,13,5990,53.55357142857143,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol
9,ACETAMINOPHEN,lip swelling,0-1 month,F,1,4,13,5994,107.10714285714286,acetaminophen,...,N,NERVOUS SYSTEM DRUGS,N02,ANALGESICS,N02B,OTHER ANALGESICS AND ANTIPYRETICS in ATC,N02BE,Anilide analgesics and antipyretics,N02BE01,paracetamol


In [8]:
pip install rapidfuzz

Defaulting to user installation because normal site-packages is not writeable
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
from pathlib import Path
import re

# ---- paths
base = Path("/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform")
pubs_path = base / "pedpubs_atc_merged.csv"
rx_path   = base / "rx_vol.csv"
out_long  = base / "rx_vol_joined_to_umls.csv"          # long form (cui, atc, agegroup, freq)
out_diag  = base / "rx_vol_match_diagnostics.csv"       # mapping audit

# ---- try fuzzy as last resort (optional)
try:
    from rapidfuzz import fuzz, process
    HAVE_FUZZ = True
except Exception:
    HAVE_FUZZ = False

# -----------------------------
# helpers
# -----------------------------
SALT_WORDS = {
    "hydrochloride","sulfate","sulphate","acetate","phosphate","nitrate","tartrate",
    "maleate","fumarate","mesylate","besylate","carbonate","bicarbonate","citrate",
    "oxalate","lactate","succinate","gluconate","hydrobromide","hydrobromid",
    "hydrochlorid","chloride","bitartrate","bismuth","sodium","potassium","magnesium",
    "calcium","zinc","aluminum","ammonium","di","tri","mono","anhydrous","dihydrate",
    "monohydrate","trihydrate","hydrate","micronized","topical","ophthalmic","oral",
    "extended","immediate","release","er","xr","sr","cr",
}

UK_US = {
    "aciclovir":"acyclovir",
    "adrenaline":"epinephrine",
    "noradrenaline":"norepinephrine",
    "oestradiol":"estradiol",
    "oestrogen":"estrogen",
    "methaemoglobin":"methemoglobin",
}

SPLIT_PAT = re.compile(r"[\/\+\&\,]| and ")   # split combos like "a/b", "a + b", "a & b", "a and b"

def norm(s: str) -> str:
    if pd.isna(s): return ""
    s = s.lower()
    # swap common UK→US spellings early
    for uk, us in UK_US.items():
        s = re.sub(rf"\b{re.escape(uk)}\b", us, s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def strip_salts(tokens):
    return [t for t in tokens if t and (t not in SALT_WORDS)]

def base_name(s: str) -> str:
    """remove salts/forms -> core ingredient words"""
    toks = norm(s).split()
    toks = strip_salts(toks)
    return " ".join(toks)

def ingredients(s: str):
    """split combos into ingredient bases"""
    parts = [p.strip() for p in SPLIT_PAT.split(str(s))]
    return [base_name(p) for p in parts if base_name(p)]

# -----------------------------
# load
# -----------------------------
pubs = pd.read_csv(pubs_path, dtype=str)
rx   = pd.read_csv(rx_path, dtype={"drug":str, "agegroup":str, "freq":float})

# clean pubs: drop rows without rxnorm_name
if "rxnorm_name" not in pubs.columns:
    raise ValueError("pedpubs_atc_merged.csv must have 'rxnorm_name'.")
pubs["rxnorm_name"] = pubs["rxnorm_name"].fillna("").astype(str)
pubs = pubs[pubs["rxnorm_name"].str.strip() != ""].copy()

# normalize reference (RxNorm) names
pubs["rxnorm_norm"] = pubs["rxnorm_name"].map(norm)
pubs["rxnorm_base"] = pubs["rxnorm_name"].map(base_name)

# build lookup sets
rxnorm_set = set(pubs["rxnorm_norm"].unique())
rxnorm_base_set = set(pubs["rxnorm_base"].unique())

# drug candidates by norm -> rows
rxnorm_to_rows = pubs.groupby("rxnorm_norm").apply(lambda d: d.index.tolist()).to_dict()
rxnorm_base_to_rows = pubs.groupby("rxnorm_base").apply(lambda d: d.index.tolist()).to_dict()

# prep rx_vol
rx["drug"] = rx["drug"].fillna("").astype(str)
rx["agegroup"] = rx["agegroup"].fillna("").astype(str)
rx["freq"] = rx["freq"].fillna(0).astype(float)

rx["drug_norm"] = rx["drug"].map(norm)
rx["drug_base"] = rx["drug"].map(base_name)
rx["drug_ings"] = rx["drug"].map(ingredients)   # list of ingredient base names

# -----------------------------
# matching routine
# -----------------------------
def match_one(drug_row) -> list[tuple[int, str]]:
    """
    Return list of (pubs_index, rule) matches.
    Strategy order:
      1) exact full norm
      2) exact base (salt stripped)
      3) any ingredient matches rxnorm_base
      4) fuzzy on base (>=92) if rapidfuzz present
    We allow multiple matches (e.g., combos). Caller can fan-out the freq.
    """
    dn  = drug_row["drug_norm"]
    db  = drug_row["drug_base"]
    ings = drug_row["drug_ings"]

    # 1) exact full norm
    if dn in rxnorm_to_rows:
        return [(idx, "exact_full") for idx in rxnorm_to_rows[dn]]

    # 2) exact base
    if db in rxnorm_base_to_rows:
        return [(idx, "exact_base") for idx in rxnorm_base_to_rows[db]]

    # 3) any ingredient base
    hits = []
    for ing in ings:
        if ing in rxnorm_base_to_rows:
            hits.extend([(idx, f"ingredient:{ing}") for idx in rxnorm_base_to_rows[ing]])
    if hits:
        return hits

    # 4) fuzzy fallback on base
    if HAVE_FUZZ and rxnorm_base_set:
        # top 1 fuzzy candidate
        best, score, _ = process.extractOne(db, rxnorm_base_set, scorer=fuzz.token_set_ratio)
        if score >= 92 and best in rxnorm_base_to_rows:
            return [(idx, f"fuzzy_base:{score}") for idx in rxnorm_base_to_rows[best]]

    return []  # no match

# -----------------------------
# perform matching (fan-out combos)
# -----------------------------
matches = []
for i, row in rx.iterrows():
    found = match_one(row)
    if not found:
        matches.append({
            "drug": row["drug"],
            "agegroup": row["agegroup"],
            "freq": row["freq"],
            "match": 0,
            "rule": "no_match",
            "pubs_index": None,
        })
        continue

    # if we matched multiple things (e.g., combo), split frequency evenly across hits
    share = row["freq"] / max(1, len(found))
    for idx, rule in found:
        matches.append({
            "drug": row["drug"],
            "agegroup": row["agegroup"],
            "freq": share,
            "match": 1,
            "rule": rule,
            "pubs_index": idx,
        })

diag = pd.DataFrame(matches)

# keep only matched rows for the joined output, but save diagnostics for QA
diag.to_csv(out_diag, index=False)

joined = diag[diag["match"] == 1].merge(
    pubs.reset_index().rename(columns={"index":"pubs_index"}),
    on="pubs_index",
    how="left"
)

# select & tidy output columns
keep_cols = [
    # from pubs
    "cui","rxnorm_tty","rxnorm_name",
    "atc_code","atc_name",
    "L1_code","L1_name","L2_code","L2_name","L3_code","L3_name","L4_code","L4_name","L5_code","L5_name",
    "unique_pub_count",
    # from rx
    "drug","agegroup","freq","rule"
]
joined = joined[keep_cols].copy()

# sort for readability
joined = joined.sort_values(["L1_code","L2_code","L3_code","L4_code","rxnorm_name","agegroup"], na_position="last")

# save long-form output
joined.to_csv(out_long, index=False)

print(f"Saved long-form joined file: {out_long}")
print(f"Saved diagnostics: {out_diag}")

# quick sanity: totals by cui (all ages)
agg_by_cui = (
    joined.groupby(["cui","rxnorm_name","L1_code","L2_code","L3_code","L4_code"], dropna=False)["freq"]
    .sum()
    .reset_index()
    .sort_values("freq", ascending=False)
)
agg_by_cui.head(10)


  rxnorm_to_rows = pubs.groupby("rxnorm_norm").apply(lambda d: d.index.tolist()).to_dict()
  rxnorm_base_to_rows = pubs.groupby("rxnorm_base").apply(lambda d: d.index.tolist()).to_dict()


Saved long-form joined file: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/rx_vol_joined_to_umls.csv
Saved diagnostics: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/rx_vol_match_diagnostics.csv


Unnamed: 0,cui,rxnorm_name,L1_code,L2_code,L3_code,L4_code,freq
47,C0002645,amoxicillin,J,J01,J01C,J01CA,12552255.0
534,C0025810,methylphenidate,N,N06,N06B,N06BA,5103233.0
1210,C0298130,montelukast,R,R03,R03D,R03DC,3804640.0
938,C0060405,cefdinir,J,J01,J01D,J01DD,3124715.0
48,C0002658,amphetamine,N,N06,N06B,N06BA,2979830.0
24,C0001927,albuterol,R,R03,R03C,R03CC,2755833.75
23,C0001927,albuterol,R,R03,R03A,R03AC,2755833.75
1030,C0074393,sertraline,N,N06,N06A,N06AB,2593436.0
1441,C1873633,lisdexamfetamine,N,N06,N06B,N06BA,2491534.0
355,C0016365,fluoxetine,N,N06,N06A,N06AB,2184336.0


In [10]:
import pandas as pd
import re
from pathlib import Path

base = Path("/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform")
joined_path = base / "rx_vol_joined_to_umls.csv"          # from previous step
final_path  = base / "rx_vol_final_for_app.csv"

df = pd.read_csv(joined_path, dtype=str)
# freq was written as numeric; re-read as numeric to be safe
df["freq"] = pd.to_numeric(df["freq"], errors="coerce").fillna(0)

# ensure presence (fill missing cols if absent)
for col in [
    "cui","rxnorm_name","rxnorm_tty",
    "atc_code","atc_name",
    "L1_code","L1_name","L2_code","L2_name","L3_code","L3_name","L4_code","L4_name",
    "L5_code","L5_name",
    "unique_pub_count","agegroup","freq"
]:
    if col not in df.columns:
        df[col] = pd.NA

# --- smart proper-casing for display ---
ACRONYMS = {"HIV","TNF","NSAID","ACE","ARBs","XR","ER","IR","SR","CR"}  # extend as needed

def smart_title(s: str) -> str:
    if pd.isna(s) or not str(s).strip():
        return ""
    s0 = str(s).strip()

    # normalize separators (keep slashes & pluses as is)
    parts = re.split(r'([\/\+])', s0)  # keep delimiters

    def word_case(w: str) -> str:
        # keep all-caps acronyms
        if w.upper() in ACRONYMS:
            return w.upper()
        # keep numbers/symbols as-is
        if re.fullmatch(r"[0-9\-\.\(\)\[\]]+", w):
            return w
        # common RxNorm minuses: N-acetyl..., alpha-, beta-
        if "-" in w:
            subs = [word_case(x) for x in w.split("-")]
            return "-".join(subs)
        # default: title case with small fixes
        t = w.lower().capitalize()
        # special-cases you may want
        t = t.replace("Hcl", "HCl")
        t = t.replace("Na", "Na")  # sodium often uppercase as element symbol
        return t

    out = []
    for token in parts:
        if token in {"/","+"}:
            out.append(token)
        else:
            words = token.split()
            out.append(" ".join(word_case(w) for w in words if w))
    return "".join(out)

df["drug_display"] = df["rxnorm_name"].fillna("").map(smart_title)

# Make sure unique_pub_count is numeric-ish string or int for display
# (if it came in as str, keep as-is; otherwise you can coerce)
# df["unique_pub_count"] = pd.to_numeric(df["unique_pub_count"], errors="coerce").fillna(0).astype(int)

# final column order
final_cols = [
    "cui",
    "rxnorm_tty",
    "rxnorm_name",
    "drug_display",
    "agegroup",
    "freq",
    "unique_pub_count",
    "atc_code","atc_name",
    "L1_code","L1_name",
    "L2_code","L2_name",
    "L3_code","L3_name",
    "L4_code","L4_name",
    "L5_code","L5_name",
]

final_df = df[final_cols].copy()

# optional: sort for a nicer default order
final_df = final_df.sort_values(
    ["L1_code","L2_code","L3_code","L4_code","rxnorm_name","agegroup"],
    na_position="last"
)

final_df.to_csv(final_path, index=False)
final_df.head()
print(f"Saved final app-ready file: {final_path}")


Saved final app-ready file: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/rx_vol_final_for_app.csv


In [11]:
import pandas as pd
from pathlib import Path
import re

# ---- paths
base = Path("/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform")
pubs_path = base / "pedpubs_atc_merged.csv"
rx_path   = base / "rx_vol.csv"
out_long  = base / "rx_vol_joined_to_umls.csv"          # long form (cui, atc, agegroup, freq)
out_diag  = base / "rx_vol_match_diagnostics.csv"       # mapping audit

# ---- try fuzzy as last resort (optional)
try:
    from rapidfuzz import fuzz, process
    HAVE_FUZZ = True
except Exception:
    HAVE_FUZZ = False

# -----------------------------
# helpers
# -----------------------------
SALT_WORDS = {
    "hydrochloride","sulfate","sulphate","acetate","phosphate","nitrate","tartrate",
    "maleate","fumarate","mesylate","besylate","carbonate","bicarbonate","citrate",
    "oxalate","lactate","succinate","gluconate","hydrobromide","hydrobromid",
    "hydrochlorid","chloride","bitartrate","bismuth","sodium","potassium","magnesium",
    "calcium","zinc","aluminum","ammonium","di","tri","mono","anhydrous","dihydrate",
    "monohydrate","trihydrate","hydrate","micronized","topical","ophthalmic","oral",
    "extended","immediate","release","er","xr","sr","cr",
}

UK_US = {
    "aciclovir":"acyclovir",
    "adrenaline":"epinephrine",
    "noradrenaline":"norepinephrine",
    "oestradiol":"estradiol",
    "oestrogen":"estrogen",
    "methaemoglobin":"methemoglobin",
}

SPLIT_PAT = re.compile(r"[\/\+\&\,]| and ")   # split combos like "a/b", "a + b", "a & b", "a and b"

def norm(s: str) -> str:
    if pd.isna(s): return ""
    s = s.lower()
    # swap common UK→US spellings early
    for uk, us in UK_US.items():
        s = re.sub(rf"\b{re.escape(uk)}\b", us, s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def strip_salts(tokens):
    return [t for t in tokens if t and (t not in SALT_WORDS)]

def base_name(s: str) -> str:
    """remove salts/forms -> core ingredient words"""
    toks = norm(s).split()
    toks = strip_salts(toks)
    return " ".join(toks)

def ingredients(s: str):
    """split combos into ingredient bases"""
    parts = [p.strip() for p in SPLIT_PAT.split(str(s))]
    return [base_name(p) for p in parts if base_name(p)]

# -----------------------------
# load
# -----------------------------
pubs = pd.read_csv(pubs_path, dtype=str)
rx   = pd.read_csv(rx_path, dtype={"drug":str, "agegroup":str, "freq":float})

# clean pubs: drop rows without rxnorm_name
if "rxnorm_name" not in pubs.columns:
    raise ValueError("pedpubs_atc_merged.csv must have 'rxnorm_name'.")
pubs["rxnorm_name"] = pubs["rxnorm_name"].fillna("").astype(str)
pubs = pubs[pubs["rxnorm_name"].str.strip() != ""].copy()

# normalize reference (RxNorm) names
pubs["rxnorm_norm"] = pubs["rxnorm_name"].map(norm)
pubs["rxnorm_base"] = pubs["rxnorm_name"].map(base_name)

# build lookup sets
rxnorm_set = set(pubs["rxnorm_norm"].unique())
rxnorm_base_set = set(pubs["rxnorm_base"].unique())

# drug candidates by norm -> rows
rxnorm_to_rows = pubs.groupby("rxnorm_norm").apply(lambda d: d.index.tolist()).to_dict()
rxnorm_base_to_rows = pubs.groupby("rxnorm_base").apply(lambda d: d.index.tolist()).to_dict()

# prep rx_vol
rx["drug"] = rx["drug"].fillna("").astype(str)
rx["agegroup"] = rx["agegroup"].fillna("").astype(str)
rx["freq"] = rx["freq"].fillna(0).astype(float)

rx["drug_norm"] = rx["drug"].map(norm)
rx["drug_base"] = rx["drug"].map(base_name)
rx["drug_ings"] = rx["drug"].map(ingredients)   # list of ingredient base names

# -----------------------------
# matching routine
# -----------------------------
def match_one(drug_row) -> list[tuple[int, str]]:
    """
    Return list of (pubs_index, rule) matches.
    Strategy order:
      1) exact full norm
      2) exact base (salt stripped)
      3) any ingredient matches rxnorm_base
      4) fuzzy on base (>=92) if rapidfuzz present
    We allow multiple matches (e.g., combos). Caller can fan-out the freq.
    """
    dn  = drug_row["drug_norm"]
    db  = drug_row["drug_base"]
    ings = drug_row["drug_ings"]

    # 1) exact full norm
    if dn in rxnorm_to_rows:
        return [(idx, "exact_full") for idx in rxnorm_to_rows[dn]]

    # 2) exact base
    if db in rxnorm_base_to_rows:
        return [(idx, "exact_base") for idx in rxnorm_base_to_rows[db]]

    # 3) any ingredient base
    hits = []
    for ing in ings:
        if ing in rxnorm_base_to_rows:
            hits.extend([(idx, f"ingredient:{ing}") for idx in rxnorm_base_to_rows[ing]])
    if hits:
        return hits

    # 4) fuzzy fallback on base
    if HAVE_FUZZ and rxnorm_base_set:
        # top 1 fuzzy candidate
        best, score, _ = process.extractOne(db, rxnorm_base_set, scorer=fuzz.token_set_ratio)
        if score >= 92 and best in rxnorm_base_to_rows:
            return [(idx, f"fuzzy_base:{score}") for idx in rxnorm_base_to_rows[best]]

    return []  # no match

# -----------------------------
# perform matching (fan-out combos)
# -----------------------------
matches = []
for i, row in rx.iterrows():
    found = match_one(row)
    if not found:
        matches.append({
            "drug": row["drug"],
            "agegroup": row["agegroup"],
            "freq": row["freq"],
            "match": 0,
            "rule": "no_match",
            "pubs_index": None,
        })
        continue

    # if we matched multiple things (e.g., combo), split frequency evenly across hits
    share = row["freq"] / max(1, len(found))
    for idx, rule in found:
        matches.append({
            "drug": row["drug"],
            "agegroup": row["agegroup"],
            "freq": share,
            "match": 1,
            "rule": rule,
            "pubs_index": idx,
        })

diag = pd.DataFrame(matches)

# keep only matched rows for the joined output, but save diagnostics for QA
diag.to_csv(out_diag, index=False)

joined = diag[diag["match"] == 1].merge(
    pubs.reset_index().rename(columns={"index":"pubs_index"}),
    on="pubs_index",
    how="left"
)

# select & tidy output columns
keep_cols = [
    # from pubs
    "cui","rxnorm_tty","rxnorm_name",
    "atc_code","atc_name",
    "L1_code","L1_name","L2_code","L2_name","L3_code","L3_name","L4_code","L4_name","L5_code","L5_name",
    "unique_pub_count",
    # from rx
    "drug","agegroup","freq","rule"
]
joined = joined[keep_cols].copy()

# sort for readability
joined = joined.sort_values(["L1_code","L2_code","L3_code","L4_code","rxnorm_name","agegroup"], na_position="last")

# save long-form output
joined.to_csv(out_long, index=False)

print(f"Saved long-form joined file: {out_long}")
print(f"Saved diagnostics: {out_diag}")

# quick sanity: totals by cui (all ages)
agg_by_cui = (
    joined.groupby(["cui","rxnorm_name","L1_code","L2_code","L3_code","L4_code"], dropna=False)["freq"]
    .sum()
    .reset_index()
    .sort_values("freq", ascending=False)
)
agg_by_cui.head(10)


  rxnorm_to_rows = pubs.groupby("rxnorm_norm").apply(lambda d: d.index.tolist()).to_dict()
  rxnorm_base_to_rows = pubs.groupby("rxnorm_base").apply(lambda d: d.index.tolist()).to_dict()


Saved long-form joined file: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/rx_vol_joined_to_umls.csv
Saved diagnostics: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/rx_vol_match_diagnostics.csv


Unnamed: 0,cui,rxnorm_name,L1_code,L2_code,L3_code,L4_code,freq
47,C0002645,amoxicillin,J,J01,J01C,J01CA,12552255.0
534,C0025810,methylphenidate,N,N06,N06B,N06BA,5103233.0
1210,C0298130,montelukast,R,R03,R03D,R03DC,3804640.0
938,C0060405,cefdinir,J,J01,J01D,J01DD,3124715.0
48,C0002658,amphetamine,N,N06,N06B,N06BA,2979830.0
24,C0001927,albuterol,R,R03,R03C,R03CC,2755833.75
23,C0001927,albuterol,R,R03,R03A,R03AC,2755833.75
1030,C0074393,sertraline,N,N06,N06A,N06AB,2593436.0
1441,C1873633,lisdexamfetamine,N,N06,N06B,N06BA,2491534.0
355,C0016365,fluoxetine,N,N06,N06A,N06AB,2184336.0


In [14]:
# df = pubs  # or agg_by_cui, whichever you want to validate
df = df.copy()

# 1) drop rows with empty rxnorm_name
mask_good = df["rxnorm_name"].notna() & (df["rxnorm_name"].str.strip() != "")
pubs_clean = df.loc[mask_good].copy()
print(f"Remaining rows after dropping empty drug names: {len(pubs_clean):,}")

# 2) coverage by CUI
total_cuis   = df["cui"].nunique()
matched_cuis = pubs_clean["cui"].nunique()
print(f"Matched CUIs with names: {matched_cuis}/{total_cuis} ({matched_cuis/total_cuis:.1%})")

# 3) unmatched list (if you want it)
unmatched = (df.loc[~mask_good, ["cui", "rxnorm_tty"]]
               .drop_duplicates()
               .sort_values("cui"))
print("\nUnmatched CUIs (no rxnorm_name):", len(unmatched))
# unmatched.to_csv("unmatched_cuis.csv", index=False)

# 4) quick sanity totals – only if column exists
def safe_sum(frame, col):
    if col in frame.columns:
        return pd.to_numeric(frame[col], errors="coerce").fillna(0).sum()
    return None

raw_rx   = safe_sum(df, "freq")
clean_rx = safe_sum(pubs_clean, "freq")

if raw_rx is not None:
    print("\nTotal Rx volume (raw):    ", f"{raw_rx:,.0f}")
    print("Total Rx volume (cleaned):", f"{clean_rx:,.0f}")
else:
    print("\nNo 'freq' column in this dataframe. Columns:", df.columns.tolist())

# Optional: if you’re validating the pubs table instead, you can sanity-check pubs
raw_pubs   = safe_sum(df, "unique_pub_count")
clean_pubs = safe_sum(pubs_clean, "unique_pub_count")
if raw_pubs is not None:
    print("Total publications (raw):    ", f"{raw_pubs:,.0f}")
    print("Total publications (cleaned):", f"{clean_pubs:,.0f}")


Remaining rows after dropping empty drug names: 2,945
Matched CUIs with names: 2353/2353 (100.0%)

Unmatched CUIs (no rxnorm_name): 0

No 'freq' column in this dataframe. Columns: ['cui', 'rxnorm_tty', 'rxnorm_name', 'atc_code', 'atc_name', 'L1_code', 'L1_name', 'L2_code', 'L2_name', 'L3_code', 'L3_name', 'L4_code', 'L4_name', 'L5_code', 'L5_name', 'unique_pub_count', 'rxnorm_norm', 'rxnorm_base']
Total publications (raw):     1,101,214
Total publications (cleaned): 1,101,214
