# Drug Standardization Pipeline (MarketScan • FAERS • MPRINT → UMLS/RxNorm)


This notebook normalizes drug names across MarketScan, FAERS, and MPRINT using **UMLS/RxNorm CUIs** as the canonical identifier.
It supports exact and fuzzy matching, handles salts/esters, and preserves audit trails for review.

**Outputs (written to `./drug_standardization_outputs/`)**
- Per-source standardized tables: `{source}_standardized.csv`
- Review queue: `fuzzy_review_candidates.csv`
- Presence-join across sources: `final_joined.csv`
- QC table: `qc_summary.csv`


## 1) Configuration

In [2]:

# --- Required paths ---
import os
from pathlib import Path

# Root folder holding the UMLS release and the per-source drug lists.
# Set the DRUG_STD_DATA_ROOT environment variable to run on another machine
# without editing this cell; the default is the original location.
DATA_ROOT = Path(os.environ.get(
    "DRUG_STD_DATA_ROOT",
    "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS",
))

# UMLS META directory (contains MRCONSO.RRF, MRSTY.RRF, etc.)
UMLS_META_DIR = DATA_ROOT / "umls" / "2025AA-full" / "output" / "2025AA" / "META"

# Source inputs
MS_PATH     = DATA_ROOT / "drug_id_platform" / "ms_druglist.csv"
FAERS_PATH  = DATA_ROOT / "drug_id_platform" / "faers_druglist.csv"
MPRINT_PATH = DATA_ROOT / "drug_id_platform" / "mprint_druglist.txt"  # may already include CUIs

# Columns (auto-detect if None)
MS_DRUG_COL = None
FAERS_DRUG_COL = None
MPRINT_DRUG_COL = None
MPRINT_CUI_COL = "CUI"   # if MPRINT already has CUIs in a column named CUI

# Matching thresholds
FUZZY_STRONG = 90   # auto-accept
FUZZY_REVIEW = 80   # review queue for [80, 90)
# Fail early on a misconfiguration that would make the review band empty or inverted.
assert 0 <= FUZZY_REVIEW < FUZZY_STRONG <= 100, "Expected 0 <= FUZZY_REVIEW < FUZZY_STRONG <= 100"

# Output directory (relative to this notebook)
OUTPUT_DIR = Path("./drug_standardization_outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("Config loaded.")


Config loaded.


## 2) Imports & helper functions

In [3]:

import re
import pandas as pd
from typing import Optional, Tuple, List, Dict

# If rapidfuzz isn't installed: pip install rapidfuzz
try:
    from rapidfuzz import process, fuzz
except Exception as e:
    raise RuntimeError("Please install rapidfuzz in your environment: pip install rapidfuzz") from e

def _clean_str(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s).lower().strip()
    noise = [
        r"\btablet(s)?\b", r"\btab(s)?\b", r"\bcapsule(s)?\b", r"\bcap(s)?\b",
        r"\binjection\b", r"\binj\b", r"\bsolution\b", r"\bsuspension\b",
        r"\bsyrup\b", r"\bcream\b", r"\bointment\b", r"\bspray\b", r"\bdrops?\b",
        r"\ber\b", r"\bir\b", r"\bxr\b", r"\bdr\b"
    ]
    for pat in noise:
        s = re.sub(pat, " ", s)
    s = re.sub(r"[^\w\s/\+\-]", " ", s)  # keep / + -
    s = re.sub(r"\s+", " ", s).strip()
    return s

SALT_TERMS = [
    "hcl","hydrochloride","sodium","potassium","sulfate","sulphate","phosphate",
    "acetate","tartrate","mesylate","oxalate","nitrate","succinate","fumarate",
    "bitartrate","bromide","magnesium","calcium"
]

def _strip_salts(s: str) -> str:
    return " ".join(t for t in s.split() if t not in SALT_TERMS)

def _split_combos(s: str) -> List[str]:
    parts = re.split(r"\s*[+/,&]\s*|\s+with\s+", s)
    parts = [p.strip() for p in parts if p.strip()]
    return parts

def auto_guess_drug_col(df: pd.DataFrame) -> Optional[str]:
    """Guess which column of `df` holds drug names.

    Resolution order:
      1. the first column whose lower-cased, stripped label exactly equals a
         known drug-column name ("drug", "drug_name", "medication", ...);
      2. otherwise the first column whose label contains one of the hints
         "drug", "name", "med", "ing";
      3. otherwise the first column, if it holds strings.

    Exact names are checked across all columns before the loose substring
    hints, so a column like "patient_name" cannot shadow a later "drug" column.

    Args:
        df: Table read from a source file.

    Returns:
        The original column label, or None when nothing plausible is found
        (including when `df` has no columns).
    """
    exact_names = {"drug","drug_name","medication","ingredient","name","generic_name","brand_name"}
    hints = ("drug","name","med","ing")

    # str() guards against non-string labels (e.g. integer labels of header-less files).
    labelled = [(col, str(col).strip().lower()) for col in df.columns]

    for col, lowered in labelled:
        if lowered in exact_names:
            return col
    for col, lowered in labelled:
        if any(hint in lowered for hint in hints):
            return col

    if len(df.columns) == 0:
        return None
    first = df.columns[0]
    if pd.api.types.is_string_dtype(df[first]):
        return first
    return None

def read_any_table(path: Path) -> pd.DataFrame:
    """Read a delimited text file without knowing its delimiter in advance.

    Attempts, in order: delimiter sniffing (`sep=None`), tab-separated, then
    pipe-separated, all with pandas' python engine. The first attempt that
    parses without raising is returned.

    Args:
        path: File to read.

    Returns:
        The parsed table.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error
            (e.g. FileNotFoundError) is chained as `__cause__` so the real
            reason is visible in the traceback.
    """
    last_error: Optional[Exception] = None
    # sep=None asks pandas to sniff the delimiter; the explicit separators are fallbacks.
    for sep in (None, "\t", "|"):
        try:
            return pd.read_csv(path, sep=sep, engine="python")
        except Exception as err:
            last_error = err
    raise RuntimeError(f"Could not read file: {path}") from last_error

def load_source(path: Path, drug_col: Optional[str]) -> pd.DataFrame:
    """Load one source file and return its unique drug names in three forms.

    Args:
        path: Source file, read with `read_any_table`.
        drug_col: Name of the drug column; when None it is guessed with
            `auto_guess_drug_col`.

    Returns:
        A frame with columns `raw_drug` (distinct non-null names as found),
        `clean_drug` (`_clean_str` applied) and `base_drug` (`_strip_salts`
        applied to the cleaned name).

    Raises:
        ValueError: If no drug column was given and none could be guessed.
    """
    table = read_any_table(path)

    column = drug_col if drug_col is not None else auto_guess_drug_col(table)
    if column is None:
        raise ValueError(f"Could not auto-detect drug column in {path.name}. Please set the *_DRUG_COL config.")

    # Keep only the drug column, one row per distinct non-null raw name.
    drugs = (
        table.rename(columns={column: "raw_drug"})[["raw_drug"]]
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    drugs["clean_drug"] = drugs["raw_drug"].apply(_clean_str)
    drugs["base_drug"] = drugs["clean_drug"].apply(_strip_salts)
    return drugs


In [4]:

def load_rxnorm_from_rrf(meta_dir: Path, chunksize: int = 2_000_000) -> pd.DataFrame:
    '''Stream MRCONSO.RRF from `meta_dir` and keep only the RXNORM rows.

    MRCONSO columns used (UMLS 2025AA), by position:
      0:CUI, 11:SAB, 12:TTY, 14:STR

    The file is read in chunks of `chunksize` rows so that only the rows whose
    SAB is RXNORM (case-insensitive) are accumulated. Two derived columns are
    added: `clean_str` (`_clean_str` of STR) and `base_str` (`_strip_salts` of
    `clean_str`). If no RXNORM rows are found, an empty frame with the four
    source columns plus the two derived columns is returned.
    '''
    field_by_position = {0: "CUI", 11: "SAB", 12: "TTY", 14: "STR"}
    field_names = list(field_by_position.values())

    reader = pd.read_csv(
        meta_dir / "MRCONSO.RRF",   # uppercase file name, as in the META folder
        sep="|",
        header=None,
        usecols=list(field_by_position),
        names=field_names,
        dtype={field: "string" for field in field_names},
        chunksize=chunksize,
        quoting=3,
        engine="python",
    )

    kept = []
    for block in reader:
        rxnorm_rows = block[block["SAB"].str.upper() == "RXNORM"].copy()
        if not rxnorm_rows.empty:
            kept.append(rxnorm_rows)

    if kept:
        rx = pd.concat(kept, ignore_index=True)
    else:
        rx = pd.DataFrame(columns=field_names)

    rx["clean_str"] = rx["STR"].apply(_clean_str)
    rx["base_str"] = rx["clean_str"].apply(_strip_salts)
    return rx

def pick_preferred_term(group: pd.DataFrame) -> pd.Series:
    """Choose the preferred RxNorm row from a set of candidate rows.

    Rows are ranked by term type (IN, PIN, SCD, SBD, GPCK, BPCK, then any
    other type) and ties are broken alphabetically by STR.

    Args:
        group: Non-empty frame with columns CUI, STR and TTY. It is not
            modified; the ranking column lives only on an internal copy.

    Returns:
        A Series with keys CUI, Preferred_STR and Preferred_TTY.

    Raises:
        IndexError: If `group` has no rows.
    """
    order = {"IN":1, "PIN":2, "SCD":3, "SBD":4, "GPCK":5, "BPCK":6}
    # assign() returns a copy, so the caller's frame does not gain a "_prio" column.
    ranked = group.assign(_prio=group["TTY"].map(order).fillna(9))
    best = ranked.sort_values(["_prio","STR"]).iloc[0]
    return pd.Series({"CUI":best["CUI"], "Preferred_STR":best["STR"], "Preferred_TTY":best["TTY"]})

def exact_match(source_df: pd.DataFrame, rx: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Match source drugs to RxNorm by exact string equality, in two passes.

    Pass 1 joins `clean_drug` to `clean_str`. Rows left unmatched go to pass 2,
    which joins `base_drug` (salt-stripped) to `base_str`.

    The RxNorm columns left over from pass 1 are dropped before pass 2.
    Otherwise the all-NaN `CUI` from pass 1 keeps the unsuffixed name, the real
    match lands in a suffixed column, and the fallback never reports a hit.

    Empty reference keys are excluded from both joins, so a source string that
    cleans to "" (e.g. a bare dose form) cannot match RxNorm strings that also
    clean to "".

    Args:
        source_df: Frame with columns raw_drug, clean_drug, base_drug.
        rx: RxNorm reference with columns CUI, TTY, STR, clean_str, base_str.

    Returns:
        (exact, unmatched): `exact` has one row per (raw_drug, CUI) pair with
        the source columns plus CUI, TTY, STR, clean_str, base_str;
        `unmatched` has the distinct raw_drug / clean_drug / base_drug rows
        that neither pass matched.
    """
    ref = rx[["CUI","TTY","STR","clean_str","base_str"]]
    source_cols = list(source_df.columns)

    # Pass 1: cleaned string against cleaned RxNorm string.
    by_clean = source_df.merge(
        ref[ref["clean_str"].fillna("") != ""],
        left_on="clean_drug", right_on="clean_str", how="left",
    )
    hit_clean = by_clean["CUI"].notna()

    # Pass 2: salt-stripped string, using only the source columns of the leftovers.
    pending = by_clean.loc[~hit_clean, source_cols]
    by_base = pending.merge(
        ref[ref["base_str"].fillna("") != ""],
        left_on="base_drug", right_on="base_str", how="left",
    )
    hit_base = by_base["CUI"].notna()

    exact = pd.concat([by_clean[hit_clean], by_base[hit_base]], ignore_index=True)
    exact = exact.drop_duplicates(subset=["raw_drug","CUI"])
    unmatched = by_base.loc[~hit_base, ["raw_drug","clean_drug","base_drug"]].drop_duplicates()
    return exact, unmatched

def build_string_index(rx: pd.DataFrame) -> Tuple[List[str], Dict[str, List[Tuple[str, str, str]]]]:
    """Build the lookup structures used for fuzzy matching.

    Every RxNorm row is indexed under its `clean_str` and its `base_str`.
    When the two are identical the row is stored once, not twice.
    Non-string keys (missing values) are skipped.

    Args:
        rx: RxNorm reference with columns CUI, STR, TTY, clean_str, base_str.

    Returns:
        (candidates, str_to_rows): `candidates` is the sorted list of distinct
        key strings; `str_to_rows` maps each key to the list of
        (CUI, STR, TTY) tuples indexed under it.
    """
    str_to_rows: Dict[str, List[Tuple[str, str, str]]] = {}
    # zip over the columns avoids building a Series per row, as iterrows does.
    for cui, term, tty, clean, base in zip(rx["CUI"], rx["STR"], rx["TTY"], rx["clean_str"], rx["base_str"]):
        keys = []
        for key in (clean, base):
            if isinstance(key, str) and key not in keys:
                keys.append(key)
        for key in keys:
            str_to_rows.setdefault(key, []).append((cui, term, tty))
    candidates = sorted(str_to_rows)
    return candidates, str_to_rows

def fuzzy_match(unmatched_df: pd.DataFrame, rx: pd.DataFrame, strong: int=90, review: int=80) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Fuzzy-match unmatched drugs against RxNorm strings.

    For each non-empty `clean_drug`, the single best-scoring candidate string
    (rapidfuzz `WRatio`) is looked up and mapped to a preferred RxNorm row with
    `pick_preferred_term`.

    Routing by score:
      * score >= strong            -> auto-accepted results
      * review <= score < strong   -> review queue
      * score < review             -> discarded (too weak to be worth reviewing)

    Args:
        unmatched_df: Frame with columns raw_drug and clean_drug.
        rx: RxNorm reference, passed to `build_string_index`.
        strong: Minimum score for automatic acceptance.
        review: Minimum score for the review queue.

    Returns:
        (results, reviews): two frames with columns raw_drug, query,
        matched_str, score, CUI, Preferred_STR, Preferred_TTY. Both always
        carry these columns, even when they have no rows.
    """
    columns = ["raw_drug","query","matched_str","score","CUI","Preferred_STR","Preferred_TTY"]
    if unmatched_df.empty:
        # Nothing to match: skip building the index over the whole reference.
        return pd.DataFrame(columns=columns), pd.DataFrame(columns=columns)

    candidates, str_to_rows = build_string_index(rx)
    # Let rapidfuzz discard hopeless candidates early; nothing below this is kept anyway.
    cutoff = min(strong, review)

    results, reviews = [], []
    for raw_drug, q in zip(unmatched_df["raw_drug"], unmatched_df["clean_drug"]):
        if not isinstance(q, str) or not q:
            continue
        match = process.extractOne(q, candidates, scorer=fuzz.WRatio, score_cutoff=cutoff)
        if match is None:
            continue
        match_str, score, _ = match
        rows = str_to_rows.get(match_str, [])
        if not rows:
            continue
        chosen = pick_preferred_term(pd.DataFrame(rows, columns=["CUI","STR","TTY"]))
        out = {
            "raw_drug": raw_drug,
            "query": q,
            "matched_str": match_str,
            "score": score,
            "CUI": chosen["CUI"],
            "Preferred_STR": chosen["Preferred_STR"],
            "Preferred_TTY": chosen["Preferred_TTY"]
        }
        if score >= strong:
            results.append(out)
        elif score >= review:
            reviews.append(out)
    return pd.DataFrame(results, columns=columns), pd.DataFrame(reviews, columns=columns)

def consolidate_matches(exact_df: pd.DataFrame, fuzzy_df: pd.DataFrame) -> pd.DataFrame:
    """Reduce exact and fuzzy matches to one mapping row per raw drug.

    * Exact matches: when a drug matched several RxNorm rows, the row with the
      best term type (IN, PIN, SCD, SBD, GPCK, BPCK, then others; ties broken
      by string) is kept, not an arbitrary first row. This is the same
      ranking used for fuzzy matches.
    * Fuzzy matches: the highest-scoring row per drug is kept.
    * A drug present in both keeps its exact match.

    Args:
        exact_df: Frame with columns raw_drug, CUI, STR, TTY (may be empty).
        fuzzy_df: Frame with columns raw_drug, score, CUI, Preferred_STR,
            Preferred_TTY (may be empty, with or without columns).

    Returns:
        Frame with columns raw_drug, CUI, Preferred_STR, Preferred_TTY and at
        most one row per raw_drug.
    """
    out_cols = ["raw_drug","CUI","Preferred_STR","Preferred_TTY"]
    tty_rank = {"IN":1, "PIN":2, "SCD":3, "SBD":4, "GPCK":5, "BPCK":6}

    picks = []
    if not exact_df.empty:
        exact_norm = exact_df.rename(columns={"STR":"Preferred_STR","TTY":"Preferred_TTY"})[out_cols]
        exact_pick = (
            exact_norm
            .assign(_prio=exact_norm["Preferred_TTY"].map(tty_rank).fillna(9))
            .sort_values(["_prio","Preferred_STR"])
            .drop_duplicates("raw_drug", keep="first")
            .sort_index()               # restore the input's row order
            .drop(columns="_prio")
        )
        picks.append(exact_pick)

    if not fuzzy_df.empty:
        fuzzy_pick = (
            fuzzy_df
            .sort_values(["raw_drug","score"], ascending=[True, False])
            .drop_duplicates("raw_drug")[out_cols]
        )
        picks.append(fuzzy_pick)

    if not picks:
        return pd.DataFrame(columns=out_cols)

    # Exact rows come first, so keep="first" lets them win over fuzzy rows.
    combined = pd.concat(picks, ignore_index=True)
    return combined.drop_duplicates("raw_drug", keep="first").reset_index(drop=True)


## 3) Load RxNorm / UMLS reference (MRCONSO.RRF)

In [5]:

# Read the RXNORM rows of MRCONSO.RRF from the configured UMLS META directory.
rx = load_rxnorm_from_rrf(meta_dir=UMLS_META_DIR)
print(f"RxNorm terms loaded: {len(rx):,}")

# Preview the first rows, including the derived clean_str / base_str columns.
rx.head(3)


RxNorm terms loaded: 354,029


Unnamed: 0,CUI,SAB,TTY,STR,clean_str,base_str
0,C0000039,RXNORM,IN,"1,2-dipalmitoylphosphatidylcholine",1 2-dipalmitoylphosphatidylcholine,1 2-dipalmitoylphosphatidylcholine
1,C0000266,RXNORM,BN,Parlodel,parlodel,parlodel
2,C0000294,RXNORM,IN,mesna,mesna,mesna


## 4) Load sources & clean

In [6]:

# Load each source's unique drug names (raw / clean / base forms).
ms = load_source(path=MS_PATH, drug_col=MS_DRUG_COL)
faers = load_source(path=FAERS_PATH, drug_col=FAERS_DRUG_COL)
mprint = load_source(path=MPRINT_PATH, drug_col=MPRINT_DRUG_COL)  # if MPRINT has CUIs, we can align later

# Report how many distinct raw names each source contributed.
for _label, _frame in (("MarketScan", ms), ("FAERS", faers), ("MPRINT", mprint)):
    print(f"{_label} unique drugs:", len(_frame))

ms.head(10)


MarketScan unique drugs: 2584
FAERS unique drugs: 17650
MPRINT unique drugs: 6588


Unnamed: 0,raw_drug,clean_drug,base_drug
0,"1,1,1,3,3-Pentafluoropropane/Norflurane",1 1 1 3 3-pentafluoropropane/norflurane,1 1 1 3 3-pentafluoropropane/norflurane
1,5-Methyltetrahydrofolate Calcium,5-methyltetrahydrofolate calcium,5-methyltetrahydrofolate
2,5-Methyltetrahydrofolic Acid,5-methyltetrahydrofolic acid,5-methyltetrahydrofolic acid
3,5-Methyltetrahydrofolic Acid/Glucosamine HCl,5-methyltetrahydrofolic acid/glucosamine hcl,5-methyltetrahydrofolic acid/glucosamine
4,Abacavir Sulfate,abacavir sulfate,abacavir
5,Abacavir Sulfate/Lamivudine,abacavir sulfate/lamivudine,abacavir sulfate/lamivudine
6,Abacavir Sulfate/Lamivudine/Zidovudine,abacavir sulfate/lamivudine/zidovudine,abacavir sulfate/lamivudine/zidovudine
7,Abacavir/Dolutegravir/Lamivudine,abacavir/dolutegravir/lamivudine,abacavir/dolutegravir/lamivudine
8,Abatacept,abatacept,abatacept
9,Abemaciclib,abemaciclib,abemaciclib


## 5) Match each source to UMLS/RxNorm

In [None]:

def match_source_to_rxnorm(src_df: pd.DataFrame, source_name: str):
    """Map one source's drugs to RxNorm using exact, then fuzzy, matching.

    Uses the notebook-level `rx` reference and the `FUZZY_STRONG` /
    `FUZZY_REVIEW` thresholds.

    Args:
        src_df: Output of `load_source` for this source.
        source_name: Label written to the `source` column of the mapping.

    Returns:
        (mapping, review): the consolidated mapping from `consolidate_matches`
        with an added `source` column, and the fuzzy review queue returned by
        `fuzzy_match`.
    """
    exact_hits, leftovers = exact_match(src_df, rx)
    auto_accepted, needs_review = fuzzy_match(
        leftovers, rx, strong=FUZZY_STRONG, review=FUZZY_REVIEW
    )
    mapping = consolidate_matches(exact_hits, auto_accepted).assign(source=source_name)
    return mapping, needs_review

# Run the exact + fuzzy matching for each source.
ms_map, ms_review = match_source_to_rxnorm(src_df=ms, source_name="marketscan")
faers_map, faers_review = match_source_to_rxnorm(src_df=faers, source_name="faers")
mprint_map, mprint_review = match_source_to_rxnorm(src_df=mprint, source_name="mprint")

# Mapped = drugs with an accepted CUI; review queue = fuzzy candidates not auto-accepted.
print("MarketScan mapped:", len(ms_map),  "| review queue:", len(ms_review))
print("FAERS mapped:",      len(faers_map), "| review queue:", len(faers_review))
print("MPRINT mapped:",     len(mprint_map), "| review queue:", len(mprint_review))


## 6) Save per-source standardized outputs

In [9]:

def save_standardized(src_df: pd.DataFrame, src_map: pd.DataFrame, name: str) -> pd.DataFrame:
    """Attach the RxNorm mapping to a source's drug list and write it to CSV.

    Every row of `src_df` is kept (left join on `raw_drug`); drugs without a
    mapping get missing values in the mapping columns. The result is written
    to `OUTPUT_DIR / "{name}_standardized.csv"`.

    Args:
        src_df: Output of `load_source`.
        src_map: Mapping with columns raw_drug, CUI, Preferred_STR,
            Preferred_TTY and source.
        name: Source label used in the output file name.

    Returns:
        The joined frame with columns renamed to Drug, UMLS_CUI and
        Preferred_Term (Preferred_TTY keeps its name).
    """
    mapping = src_map.drop(columns=["source"])
    column_labels = {
        "raw_drug": "Drug",
        "CUI": "UMLS_CUI",
        "Preferred_STR": "Preferred_Term",
        "Preferred_TTY": "Preferred_TTY",
    }
    standardized = src_df.merge(mapping, on="raw_drug", how="left").rename(columns=column_labels)
    standardized.to_csv(OUTPUT_DIR / f"{name}_standardized.csv", index=False)
    return standardized

# Write one standardized table per source.
ms_out = save_standardized(src_df=ms, src_map=ms_map, name="marketscan")
faers_out = save_standardized(src_df=faers, src_map=faers_map, name="faers")
mprint_out = save_standardized(src_df=mprint, src_map=mprint_map, name="mprint")

# Consolidate review queues
review_all = pd.concat([ms_review, faers_review, mprint_review], ignore_index=True)
if not review_all.empty:
    # The review file is only written when there is something to review.
    review_all.to_csv(OUTPUT_DIR / "fuzzy_review_candidates.csv", index=False)

saved_names = [p.name for p in OUTPUT_DIR.glob("*.csv")]
print("Saved:", saved_names)


Saved: ['mprint_standardized.csv', 'fuzzy_review_candidates.csv', 'marketscan_standardized.csv', 'faers_standardized.csv']


## 7) Join across sources on CUI

In [1]:

# Build a presence table with exactly one row per mapped CUI.
#
# Each source is reduced to its distinct, non-null CUIs BEFORE joining. pandas
# matches null join keys to each other, and many raw drugs share a CUI, so
# merging the raw per-source tables multiplies unmatched and duplicate rows
# across the three sources.

# Preferred term per CUI, taken from the first source that has it
# (MarketScan, then FAERS, then MPRINT), so CUIs absent from MarketScan still get a label.
preferred_terms = (
    pd.concat(
        [src[["UMLS_CUI", "Preferred_Term"]] for src in (ms_out, faers_out, mprint_out)],
        ignore_index=True,
    )
    .dropna(subset=["UMLS_CUI"])
    .drop_duplicates("UMLS_CUI", keep="first")
)

# MS_flag is appended last so the original column positions are unchanged.
flag_sources = (("FAERS_flag", faers_out), ("MPRINT_flag", mprint_out), ("MS_flag", ms_out))
flag_cols = [flag_col for flag_col, _ in flag_sources]

final = preferred_terms
for _flag_col, _src in flag_sources:
    _present = _src[["UMLS_CUI"]].dropna().drop_duplicates().assign(**{_flag_col: 1})
    final = final.merge(_present, on="UMLS_CUI", how="left")

final = (
    final
    .fillna({col: 0 for col in flag_cols})
    .astype({col: int for col in flag_cols})   # 0/1 integers, not 0.0/1.0 floats
    .reset_index(drop=True)
)
assert final["UMLS_CUI"].is_unique, "final_joined must have one row per CUI"

final.to_csv(OUTPUT_DIR / "final_joined.csv", index=False)
final.head(10)


NameError: name 'ms_out' is not defined

## 8) QC summary

In [None]:

def qc_report(mapped: pd.DataFrame, source_label: str) -> pd.DataFrame:
    """Summarize mapping coverage for one source as a single-row frame.

    Args:
        mapped: Standardized table with a `UMLS_CUI` column.
        source_label: Value for the `source` column.

    Returns:
        One row with `source`, `total_unique_drugs` (row count), `matched`
        (rows with a non-null `UMLS_CUI`) and `match_rate` (percentage rounded
        to two decimals; 0.0 for an empty table).
    """
    n_rows = len(mapped)
    n_matched = mapped["UMLS_CUI"].notna().sum()
    if n_rows:
        rate = round(100 * n_matched / n_rows, 2)
    else:
        rate = 0.0
    summary = {
        "source": [source_label],
        "total_unique_drugs": [n_rows],
        "matched": [n_matched],
        "match_rate": [rate],
    }
    return pd.DataFrame(summary)

# One QC row per source, stacked in MarketScan / FAERS / MPRINT order.
qc = pd.concat(
    [
        qc_report(frame, label)
        for label, frame in (("marketscan", ms_out), ("faers", faers_out), ("mprint", mprint_out))
    ],
    ignore_index=True,
)
qc.to_csv(OUTPUT_DIR / "qc_summary.csv", index=False)
qc


## 9) (Optional) Apply manual overrides

In [None]:

# If you curate 'manual_overrides.csv' with columns: raw_drug, CUI
OVERRIDES = OUTPUT_DIR / "manual_overrides.csv"
if OVERRIDES.exists():
    ov = pd.read_csv(OVERRIDES, dtype=str)

    def apply_overrides(out_df: pd.DataFrame, name: str):
        if ov.empty:
            return out_df
        out2 = out_df.merge(ov, left_on="Drug", right_on="raw_drug", how="left", suffixes=("","_OVR"))
        out2["UMLS_CUI"] = out2["CUI"].fillna(out2["UMLS_CUI"])
        out2 = out2.drop(columns=["CUI","raw_drug"])
        out2.to_csv(OUTPUT_DIR / f"{name}_standardized_overridden.csv", index=False)
        print(f"Applied overrides → {name}_standardized_overridden.csv")
        return out2

    ms_out     = apply_overrides(ms_out, "marketscan")
    faers_out  = apply_overrides(faers_out, "faers")
    mprint_out = apply_overrides(mprint_out, "mprint")
else:
    print("No overrides found. To use, create:", OVERRIDES)
