
# Attach UMLS CUIs to Raw Drug Lists (MarketScan & FAERS)

This notebook:
- Loads **raw MarketScan** and **FAERS** drug lists
- Loads standardized per-source maps (raw string → `UMLS_CUI`, `Preferred_Term`)
- Normalizes raw strings for robust joins (lowercase, strip leading/trailing whitespace, collapse runs of whitespace to a single space)
- Left-joins CUIs onto the raw files
- Writes outputs:
  - `ms_with_cui.csv`
  - `faers_with_cui.csv`
  - `unmatched_ms.csv` / `unmatched_faers.csv`
  - `qc_attach_summary.csv` (match counts & rates)

> **Note:** You do **not** need the UMLS `META` folder for this step. It's only needed later for ingredient roll-ups or synonym expansion.


In [None]:

import os
from pathlib import Path

# ------------------------------
# CONFIG — edit paths/columns as needed
# ------------------------------
# Root folder that holds the raw drug lists and the crosswalk directory.
# Set the DRUG_ID_PLATFORM_DIR environment variable to run on another machine
# without editing the notebook; when it is unset, the original OneDrive
# location is used, so existing setups keep working unchanged.
BASE_DIR = Path(
    os.environ.get(
        "DRUG_ID_PLATFORM_DIR",
        "/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform",
    )
).expanduser()

# Raw drug lists (one file per source)
MS_RAW    = BASE_DIR / "ms_druglist.csv"
FAERS_RAW = BASE_DIR / "faers_druglist.csv"

# Crosswalk directory and the standardized per-source maps inside it
MAP_DIR      = BASE_DIR / "cross_quartz"
MS_MAP_FN    = MAP_DIR / "marketscan_standardized.csv"
FAERS_MAP_FN = MAP_DIR / "faers_standardized.csv"
FINAL_JOINED = MAP_DIR / "final_joined.csv"  # not required here
FUZZY_REVIEW = MAP_DIR / "fuzzy_review_candidates.csv"  # optional

# Raw drug string column names (change if your files use different headers)
MS_RAW_DRUG_COL    = "drug_name"     # e.g., MarketScan raw column with original string
FAERS_RAW_DRUG_COL = "drugname"      # e.g., FAERS DRUG file 'drugname'

# Standardized map drug column name (as produced earlier)
MAP_DRUG_COL = "Drug"                # in *_standardized.csv

# Output directory
OUT_DIR = MAP_DIR  # write outputs alongside your crosswalk files
OUT_DIR


In [None]:

import pandas as pd

def normalize(s: pd.Series) -> pd.Series:
    """Build a case- and whitespace-insensitive join key from a Series.

    Missing values become the empty string, everything is cast to ``str``,
    lower-cased, trimmed at both ends, and every run of whitespace is
    replaced by a single space.
    """
    as_text = s.fillna("").astype(str)
    lowered = as_text.str.lower()
    trimmed = lowered.str.strip()
    # Collapse tabs/newlines/multiple blanks so spacing differences don't break joins.
    return trimmed.str.replace(r"\s+", " ", regex=True)

def attach_cui(raw_df, raw_drug_col, map_df, map_drug_col):
    """Left-join map columns (e.g. UMLS_CUI) onto a raw drug list by normalized name.

    Both drug-name columns are passed through ``normalize`` to build a
    temporary join key. The map is reduced to one row per key (rows that
    carry a CUI win over rows without one) and then left-joined onto the raw
    frame, so the result has exactly one row per row of ``raw_df``.

    Parameters
    ----------
    raw_df : DataFrame holding the raw drug strings.
    raw_drug_col : name of the drug-string column in ``raw_df``.
    map_df : DataFrame mapping drug strings to ``UMLS_CUI`` (plus any other
        columns, all of which are carried over except ``map_drug_col``).
    map_drug_col : name of the drug-string column in ``map_df``.

    Returns
    -------
    A new DataFrame: the columns of ``raw_df`` followed by the map columns.
    Neither input is modified. Column names present in both frames receive
    pandas' default ``_x``/``_y`` suffixes.

    Raises
    ------
    ValueError
        If ``raw_drug_col`` is missing from ``raw_df``, or ``map_drug_col`` or
        ``UMLS_CUI`` is missing from ``map_df``.
    """
    # sanity checks
    if raw_drug_col not in raw_df.columns:
        raise ValueError(f"Column '{raw_drug_col}' not in raw_df: {list(raw_df.columns)}")
    if map_drug_col not in map_df.columns:
        raise ValueError(f"Column '{map_drug_col}' not in map_df: {list(map_df.columns)}")
    # The de-duplication below sorts on UMLS_CUI; fail with the same clear
    # message style instead of an opaque KeyError from sort_values.
    if "UMLS_CUI" not in map_df.columns:
        raise ValueError(f"Column 'UMLS_CUI' not in map_df: {list(map_df.columns)}")

    # assign() returns new frames, so the caller's inputs are never mutated.
    left  = raw_df.assign(_join_key=normalize(raw_df[raw_drug_col]))
    right = map_df.assign(_join_key=normalize(map_df[map_drug_col]))

    # Missing/blank names normalize to "". Drop such map rows so a missing raw
    # drug name can never be "matched" to a blank map entry.
    right = right[right["_join_key"] != ""]

    # One row per join_key in map, prefer rows with a CUI present.
    # A stable sort keeps the map's original order among ties, so the row that
    # survives de-duplication is deterministic across runs.
    right = (right.sort_values(by=["UMLS_CUI"], na_position="last", kind="mergesort")
                  .drop_duplicates(subset=["_join_key"], keep="first"))

    merged = left.merge(
        right.drop(columns=[map_drug_col]),  # avoid duplicate original-string column
        on="_join_key",
        how="left",
        validate="m:1"  # guards against the map fanning out raw rows
    ).drop(columns=["_join_key"])

    return merged

def summarize_qc(df, cui_col="UMLS_CUI"):
    """Return a one-row DataFrame of match statistics for ``df``.

    Columns: ``rows`` (total rows), ``matched`` (rows with a non-null
    ``cui_col``), ``unmatched`` (the remainder) and ``match_rate``
    (matched / rows, rounded to 4 decimals). If ``cui_col`` is absent every
    row counts as unmatched; an empty frame yields a rate of 0.0.
    """
    n_rows = len(df)

    if cui_col in df.columns:
        n_matched = df[cui_col].notna().sum()
    else:
        n_matched = 0

    # Guard the division so an empty frame does not raise.
    if n_rows:
        rate = round(n_matched / n_rows, 4)
    else:
        rate = 0.0

    record = {
        "rows": n_rows,
        "matched": n_matched,
        "unmatched": n_rows - n_matched,
        "match_rate": rate,
    }
    return pd.DataFrame([record])


In [None]:

# Read the two raw drug lists and the two standardized maps.
# Every column is loaded as text (dtype=str) so values are not re-typed on read.
ms_raw, faers_raw, ms_map, faers_map = (
    pd.read_csv(csv_path, dtype=str, low_memory=False)
    for csv_path in (MS_RAW, FAERS_RAW, MS_MAP_FN, FAERS_MAP_FN)
)

# Shapes of all four inputs, in load order
tuple(frame.shape for frame in (ms_raw, faers_raw, ms_map, faers_map))


In [None]:

# Attach CUIs
ms_with    = attach_cui(ms_raw, MS_RAW_DRUG_COL,    ms_map,    MAP_DRUG_COL)
faers_with = attach_cui(faers_raw, FAERS_RAW_DRUG_COL, faers_map, MAP_DRUG_COL)

# The m:1 left join must yield exactly one output row per raw row.
assert len(ms_with) == len(ms_raw), "MarketScan row count changed during the join"
assert len(faers_with) == len(faers_raw), "FAERS row count changed during the join"

# display() renders each preview as a rich table; a bare tuple of DataFrames
# would fall back to a plain-text repr.
display(ms_with.head(), faers_with.head())


In [None]:

# Persist the CUI-annotated drug lists next to the crosswalk files.
OUT_DIR.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

ms_out_fn = OUT_DIR.joinpath("ms_with_cui.csv")
ms_with.to_csv(ms_out_fn, index=False)

faers_out_fn = OUT_DIR.joinpath("faers_with_cui.csv")
faers_with.to_csv(faers_out_fn, index=False)

# Show where the files were written
(ms_out_fn, faers_out_fn)


In [None]:

# Per-source match statistics, stacked into one table with `source` first
qc_ms = summarize_qc(ms_with).assign(source="MarketScan")
qc_fa = summarize_qc(faers_with).assign(source="FAERS")
qc = pd.concat([qc_ms, qc_fa], ignore_index=True)
qc = qc.loc[:, ["source", "rows", "matched", "unmatched", "match_rate"]]

qc_fn = OUT_DIR.joinpath("qc_attach_summary.csv")
qc.to_csv(qc_fn, index=False)

# Dump the rows that did not receive a CUI, one file per source
um_ms_fn = OUT_DIR.joinpath("unmatched_ms.csv")
um_fa_fn = OUT_DIR.joinpath("unmatched_faers.csv")
ms_with.loc[ms_with.get("UMLS_CUI").isna()].to_csv(um_ms_fn, index=False)
faers_with.loc[faers_with.get("UMLS_CUI").isna()].to_csv(um_fa_fn, index=False)

(qc, qc_fn, um_ms_fn, um_fa_fn)


In [None]:

# Preview up to ten rows per source that have no CUI attached (empty tables if all matched)
display(ms_with.loc[ms_with.get("UMLS_CUI").isna()].head(n=10))
display(faers_with.loc[faers_with.get("UMLS_CUI").isna()].head(n=10))
