# Build Unified Drug × Age × ADE Table (PRR, Prescriptions, Publications, ATC)

This notebook creates a single analyzable table with columns like:

- `UMLS_CUI`, `Preferred_Term` (drug), `ATC_Code`, `ATC_Name` *(optional)*  
- `age_group`, `ADE` (e.g., MedDRA PT), `PRR`  
- `prescriptions` (MarketScan), `publication_count` (MPRINT)

**Inputs**
- FAERS drug–ADE pairs with PRR and age group (aggregated; case-level data is not required): `faers_ade_by_drug_age.csv` (edit `FAERS_ADE` below to point to your own file)
- MarketScan drug rows with CUIs attached: `ms_with_cui.csv` (from previous step)
- MPRINT publications: either a table with a CUI column (`mprint_standardized.csv`) or a tab-separated list with a `cui` column (`mprint_druglist.txt`)
- Crosswalk: `final_joined.csv`
- Optional UMLS META (`MRCONSO.RRF`, `MRREL.RRF`) for ATC mapping

> If META files are not provided, ATC columns are skipped (the rest still works).


## 1) Configure paths
Edit these if your files live elsewhere.


In [4]:
from pathlib import Path

# Directory that holds the crosswalk and every prior-step output
CROSS = Path("/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz")


def _or_sandbox(path):
    """Return *path* when it exists, otherwise the same file name under /mnt/data."""
    return path if path.exists() else Path("/mnt/data") / path.name


# Outputs of earlier steps. The first three fall back to the sandbox copy
# when the file is not present under CROSS (a no-op on your own machine).
MS_WITH_CUI = _or_sandbox(CROSS / "ms_with_cui.csv")            # from Attach_CUIs_Auto.ipynb
FINAL_JOINED = _or_sandbox(CROSS / "final_joined.csv")
# FAERS standardized map (raw drug string -> UMLS_CUI, Preferred_Term)
FAERS_MAP = _or_sandbox(CROSS / "faers_standardized.csv")

# MPRINT inputs have no sandbox fallback.
MPRINT_STD = CROSS / "mprint_standardized.csv"   # or a pubs table with a CUI column
MPRINT_LIST = CROSS / "mprint_druglist.txt"      # optional: plain list with 'cui' column

# FAERS ADE-by-drug file (source of PRR + age group); no fallback either.
FAERS_ADE = CROSS / "faers_ade_by_drug_age.csv"  # <-- EDIT to your file

# Optional UMLS META release, needed only for the ATC columns
UMLS_META_DIR = Path("/path/to/UMLS/META")  # e.g., "/Users/you/umls/2024AB/META"
MRCONSO_RRF = UMLS_META_DIR / "MRCONSO.RRF"
MRREL_RRF = UMLS_META_DIR / "MRREL.RRF"

# Where the unified table is written
OUT_UNIFIED = CROSS / "drug_age_ade_metrics.csv"

# Echo the resolved paths as the cell result
MS_WITH_CUI, FINAL_JOINED, FAERS_MAP, MPRINT_STD, MPRINT_LIST, FAERS_ADE, OUT_UNIFIED


(PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/ms_with_cui.csv'),
 PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/final_joined.csv'),
 PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/faers_standardized.csv'),
 PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/mprint_standardized.csv'),
 PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/mprint_druglist.txt'),
 PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/faers_ade_by_drug_age.csv'),
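 PosixPath('/Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/drug_age_ade_metrics.csv'))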

## 2) Helpers
- Normalization for string joins
- Auto-detect column names for FAERS ADE files (drug, ADE, PRR, age)
- Optional ATC mapping from UMLS META


In [2]:
import pandas as pd
import numpy as np

def normalize(s: pd.Series) -> pd.Series:
    return (
        s.fillna("")
         .astype(str)
         .str.lower()
         .str.strip()
         .str.replace(r"\s+", " ", regex=True)
    )

# Candidate column names, listed in priority order.

# FAERS ADE file: raw drug string
DRUG_COL_CANDS = [
    "drugname",
    "drug",
    "prod_ai",
    "gname",
    "name",
]
# FAERS ADE file: adverse-event term
ADE_COL_CANDS = [
    "pt",
    "event",
    "meddra_pt",
    "reactionmeddrapt",
    "pt_name",
]
# FAERS ADE file: PRR value
PRR_COL_CANDS = [
    "prr",
    "prr_value",
]
# Age-group column (FAERS ADE and MarketScan files)
AGE_COL_CANDS = [
    "age_group",
    "agegroup",
    "age_cat",
    "age_category",
]

# Spellings accepted for the CUI column; "UMLS_CUI" is the canonical name
CUI_COL_CANDS = [
    "UMLS_CUI",
    "CUI",
    "cui",
]
# Spellings accepted for the drug preferred-term column; "Preferred_Term" is canonical
PT_COL_CANDS = [
    "Preferred_Term",
    "preferred_term",
    "drug_pt",
    "rxnorm_name",
]

def find_col(df: pd.DataFrame, candidates):
    """Return the first column of *df* matching a candidate name, ignoring case.

    Candidates are tried in the order given. The returned value is the column
    name exactly as it appears in *df*; ``None`` means no candidate matched.
    """
    by_lower = {}
    for col in df.columns:
        # Later columns overwrite earlier ones that differ only by case.
        by_lower[col.lower()] = col

    wanted = (name.lower() for name in candidates)
    return next((by_lower[name] for name in wanted if name in by_lower), None)

def coerce_cui_col(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with its CUI column named ``UMLS_CUI``.

    The first name in ``CUI_COL_CANDS`` that is present (exact match) is
    used. *df* itself is returned when no candidate is present or the column
    is already called ``UMLS_CUI``; otherwise a renamed frame is returned.
    """
    present = next((name for name in CUI_COL_CANDS if name in df.columns), None)
    if present is None or present == "UMLS_CUI":
        return df
    return df.rename(columns={present: "UMLS_CUI"})

def coerce_pt_col(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with its preferred-term column named ``Preferred_Term``.

    The first name in ``PT_COL_CANDS`` that is present (exact match) is used.
    *df* itself is returned when no candidate is present or the column is
    already called ``Preferred_Term``; otherwise a renamed frame is returned.
    """
    present = next((name for name in PT_COL_CANDS if name in df.columns), None)
    if present is None or present == "Preferred_Term":
        return df
    return df.rename(columns={present: "Preferred_Term"})

def attach_cui_to_faers_ade(faers_ade_df, faers_map_df, faers_drug_col="drugname", map_drug_col="Drug"):
    """Left-join the FAERS standardized map onto FAERS ADE rows by drug string.

    Both drug columns are passed through ``normalize`` and joined on the
    result, so matching ignores case and surrounding/repeated whitespace.
    Neither input frame is modified.

    Parameters
    ----------
    faers_ade_df : DataFrame of drug x ADE rows.
    faers_map_df : DataFrame mapping raw drug strings to ``UMLS_CUI`` (and any
        other columns to carry over).
    faers_drug_col : drug column in ``faers_ade_df``; when absent, the first
        match from ``DRUG_COL_CANDS`` is used instead.
    map_drug_col : drug column in ``faers_map_df``.

    Returns
    -------
    DataFrame with every row of ``faers_ade_df`` (same order, same columns)
    plus the map's columns. ADE rows without a match get missing values.
    When a map column has the same name as an ADE column, the ADE column is
    kept and the map column is not carried over.

    Raises
    ------
    ValueError
        If no drug column can be found in ``faers_ade_df`` or ``map_drug_col``
        is missing from ``faers_map_df``.
    """
    left = faers_ade_df.copy()
    right = faers_map_df.copy()

    if faers_drug_col not in left.columns:
        cand = find_col(left, DRUG_COL_CANDS)
        if not cand:
            raise ValueError(f"FAERS ADE file lacks a recognizable drug column. Available: {list(left.columns)}")
        faers_drug_col = cand

    if map_drug_col not in right.columns:
        raise ValueError(f"FAERS map missing '{map_drug_col}'. Columns: {list(right.columns)}")

    left["_join_key"] = normalize(left[faers_drug_col])
    right["_join_key"] = normalize(right[map_drug_col])

    # normalize() turns missing/blank names into "". Without this filter a
    # blank-named map row would hand its CUI to every ADE row whose drug name
    # is blank or missing.
    right = right[right["_join_key"] != ""]

    # One map row per key; rows with a CUI sort ahead of rows without one, so
    # a key that has any CUI keeps it.
    right = (right.sort_values(by=["UMLS_CUI"], na_position="last")
                  .drop_duplicates(subset=["_join_key"], keep="first"))

    # Map columns that share a name with an ADE column would come out of the
    # merge as "<name>_x"/"<name>_y", hiding the names callers look up
    # afterwards. Keep the ADE side and drop the map side.
    clashing = [c for c in right.columns if c != "_join_key" and c in left.columns]
    drop_cols = list(dict.fromkeys([map_drug_col, *clashing]))

    merged = (left.merge(right.drop(columns=drop_cols), on="_join_key", how="left", validate="m:1")
                  .drop(columns=["_join_key"]))
    return merged

def _mprint_counts_from_file(path, sep=",", cui_alias=None):
    """Return per-CUI row counts read from *path*, or None if it is unusable."""
    import warnings

    if not path.exists():
        return None
    try:
        df = pd.read_csv(path, sep=sep, dtype=str, low_memory=False)
    except (OSError, ValueError) as err:
        # pandas' EmptyDataError/ParserError and UnicodeDecodeError are all
        # ValueError subclasses. Report the problem and let the caller try
        # the next source instead of failing silently.
        warnings.warn(f"Could not read MPRINT file {path}: {err}", stacklevel=3)
        return None

    if cui_alias and cui_alias in df.columns and "UMLS_CUI" not in df.columns:
        df = df.rename(columns={cui_alias: "UMLS_CUI"})
    df = coerce_cui_col(df)
    if "UMLS_CUI" not in df.columns:
        warnings.warn(f"MPRINT file {path} has no CUI column; columns: {list(df.columns)}", stacklevel=3)
        return None

    return (df["UMLS_CUI"].dropna()
                   .value_counts()
                   .rename_axis("UMLS_CUI")
                   .rename("publication_count")
                   .reset_index())


def load_mprint_counts(mprint_std_path: Path, mprint_list_path: Path):
    """Count MPRINT rows per CUI from the first usable source.

    Sources are tried in order: ``mprint_std_path`` (comma-separated table
    with a CUI column), then ``mprint_list_path`` (tab-separated, with a
    ``cui`` column). A source that is missing, unreadable, or lacks a CUI
    column is skipped; unreadable files and files without a CUI column emit
    a warning.

    Returns
    -------
    DataFrame with columns ``UMLS_CUI`` and ``publication_count`` (number of
    rows carrying that CUI). When no source is usable the frame is empty.
    """
    sources = (
        (mprint_std_path, ",", None),
        (mprint_list_path, "\t", "cui"),
    )
    for path, sep, alias in sources:
        counts = _mprint_counts_from_file(path, sep=sep, cui_alias=alias)
        if counts is not None:
            return counts
    return pd.DataFrame({"UMLS_CUI": pd.Series(dtype="string"), "publication_count": pd.Series(dtype="Int64")})

def build_atc_map_from_umls(mrconso_path: Path, mrrel_path: Path):
    """Build a CUI -> ATC code/name table from UMLS ``MRCONSO.RRF`` and ``MRREL.RRF``.

    ATC terms are the MRCONSO rows whose SAB is ``"ATC"``. Every MRREL row
    with an ATC concept on either side contributes one mapping: the concept
    on the *other* side receives that ATC concept's code and name. The
    relation type is not considered.

    NOTE(review): because any relation qualifies, a concept is mapped to the
    codes of ATC concepts it is merely related to. Confirm this is the
    intended mapping.

    Returns
    -------
    DataFrame with columns ``UMLS_CUI``, ``ATC_Code``, ``ATC_Name`` (duplicate
    rows removed). It is empty when either file does not exist.
    """
    import csv

    out_cols = ["UMLS_CUI", "ATC_Code", "ATC_Name"]
    if not (mrconso_path.exists() and mrrel_path.exists()):
        return pd.DataFrame(columns=out_cols)

    # RRF fields are pipe-delimited and never quoted. With default quoting, a
    # field that begins with '"' is read as a quoted field and swallows the
    # following fields and lines, so quoting must be turned off.
    rrf_opts = {"sep": "|", "header": None, "dtype": str, "quoting": csv.QUOTE_NONE}

    mrconso = pd.read_csv(mrconso_path, usecols=[0, 11, 12, 13, 14], low_memory=False, **rrf_opts)
    mrconso.columns = ["CUI", "SAB", "TTY", "CODE", "STR"]

    atc_terms = (mrconso.loc[mrconso["SAB"] == "ATC", ["CUI", "CODE", "STR"]]
                        .drop_duplicates()
                        .rename(columns={"CUI": "ATC_CUI", "CODE": "ATC_Code", "STR": "ATC_Name"}))
    atc_cuis = set(atc_terms["ATC_CUI"])

    # Stream MRREL in chunks and keep only rows that touch an ATC concept, so
    # the whole file never has to sit in memory. Only the two CUI columns
    # (fields 0 and 4) are needed for the mapping.
    kept = []
    for chunk in pd.read_csv(mrrel_path, usecols=[0, 4], chunksize=1_000_000, **rrf_opts):
        chunk.columns = ["CUI1", "CUI2"]
        touches_atc = chunk["CUI1"].isin(atc_cuis) | chunk["CUI2"].isin(atc_cuis)
        kept.append(chunk[touches_atc])
    rel = pd.concat(kept, ignore_index=True) if kept else pd.DataFrame(columns=["CUI1", "CUI2"])

    # CUI2 is the ATC concept -> CUI1 receives its code
    rel1 = (rel.merge(atc_terms.rename(columns={"ATC_CUI": "CUI2"}), on="CUI2", how="inner")
               .rename(columns={"CUI1": "UMLS_CUI"}))
    # CUI1 is the ATC concept -> CUI2 receives its code
    rel2 = (rel.merge(atc_terms.rename(columns={"ATC_CUI": "CUI1"}), on="CUI1", how="inner")
               .rename(columns={"CUI2": "UMLS_CUI"}))

    atc_map = pd.concat([rel1[out_cols], rel2[out_cols]], ignore_index=True).drop_duplicates()

    return atc_map


## 3) Load data and attach CUIs to FAERS ADE table
- Auto-detect FAERS columns: drug, ADE term, PRR, age group.
- Attach CUIs to the drug column via the FAERS standardized map.


In [3]:
# Read the prior outputs as all-string frames and canonicalize their key columns
ms_with   = coerce_cui_col(pd.read_csv(MS_WITH_CUI, dtype=str, low_memory=False))
key       = coerce_pt_col(coerce_cui_col(pd.read_csv(FINAL_JOINED, dtype=str, low_memory=False)))
faers_map = pd.read_csv(FAERS_MAP, dtype=str, low_memory=False)

print("ms_with columns:", list(ms_with.columns))
print("final_joined columns:", list(key.columns))
print("faers_map columns:", list(faers_map.columns))

# The FAERS ADE file has no fallback location, so stop early when it is absent
if not FAERS_ADE.exists():
    raise FileNotFoundError(f"FAERS ADE file not found: {FAERS_ADE}. Please set FAERS_ADE to your file path.")

faers_ade = pd.read_csv(FAERS_ADE, dtype=str, low_memory=False)
print("faers_ade columns:", list(faers_ade.columns))

# Detect each role's column; when nothing matches, use the conventional name
drug_col, ade_col, prr_col, age_col = (
    find_col(faers_ade, cands) or default
    for cands, default in (
        (DRUG_COL_CANDS, "drugname"),
        (ADE_COL_CANDS, "pt"),
        (PRR_COL_CANDS, "prr"),
        (AGE_COL_CANDS, "age_group"),
    )
)

print("Detected columns -> drug:", drug_col, "| ADE:", ade_col, "| PRR:", prr_col, "| age:", age_col)

# Attach CUIs to the ADE rows through the normalized drug string
faers_ade_cui = coerce_cui_col(
    attach_cui_to_faers_ade(faers_ade, faers_map, faers_drug_col=drug_col, map_drug_col="Drug")
)
faers_ade_cui.head()


ms_with columns: ['drug_ms', 'clean_drug', 'base_drug', 'UMLS_CUI', 'Preferred_Term', 'Preferred_TTY']
final_joined columns: ['UMLS_CUI', 'Preferred_Term', 'MS_flag', 'FAERS_flag', 'MPRINT_flag']
faers_map columns: ['Drug', 'clean_drug', 'base_drug', 'UMLS_CUI', 'Preferred_Term', 'Preferred_TTY']


FileNotFoundError: FAERS ADE file not found: /Users/rahurkar.1/Library/CloudStorage/OneDrive-TheOhioStateUniversityWexnerMedicalCenter/FAERS/drug_id_platform/cross_quartz/faers_ade_by_drug_age.csv. Please set FAERS_ADE to your file path.

## 4) MarketScan prescriptions by CUI × age_group
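If your MS file has an age-group column (`age_group`, `agegroup`, `age_cat`, or `age_category`), we use it; otherwise every row is labeled `Unknown`. Either way, rows are counted per CUI × age group as a proxy for prescriptions (replace with real counts if you have them).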


In [None]:
# Work on a copy so ms_with itself is left untouched
ms_tmp = ms_with.copy()

# Use the MarketScan age-group column when one exists; otherwise label every row "Unknown"
ms_age_col = find_col(ms_tmp, AGE_COL_CANDS)
if ms_age_col is None:
    ms_age_col = "age_group"
    ms_tmp[ms_age_col] = "Unknown"

# Rows per (CUI, age group) stand in for prescription counts
ms_rx = (
    ms_tmp.dropna(subset=["UMLS_CUI"])
          .groupby(["UMLS_CUI", ms_age_col], dropna=True)
          .size()
          .reset_index(name="prescriptions")
          .rename(columns={ms_age_col: "age_group"})
)

ms_rx.head()


## 5) MPRINT publication counts by CUI
Tries `mprint_standardized.csv` first, then `mprint_druglist.txt` (expects a `cui` column).


In [None]:
# Publication counts per CUI: standardized table first, drug list second
pub_counts = load_mprint_counts(mprint_std_path=MPRINT_STD, mprint_list_path=MPRINT_LIST)
pub_counts.head()


## 6) Optional: ATC mapping from UMLS META
If `MRCONSO.RRF` & `MRREL.RRF` are available, we add `ATC_Code` and `ATC_Name` per CUI.


In [None]:
# Build the CUI -> ATC lookup; it is empty when the META files are absent
atc_map = build_atc_map_from_umls(MRCONSO_RRF, MRREL_RRF)
if not atc_map.empty:
    print("ATC map rows:", len(atc_map))
else:
    print("[info] ATC mapping skipped (UMLS META not provided or mapping yielded no rows).")
atc_map.head()


## 7) Build unified table
- Start from FAERS drug×ADE×age rows (since PRR lives there)
- Join MarketScan `prescriptions` by CUI×age_group
- Join MPRINT `publication_count` by CUI
- Attach `Preferred_Term` from the crosswalk
- Optionally add ATC columns


In [None]:
# Start with FAERS ADE rows. UMLS_CUI is checked together with the detected
# columns so a failed CUI attachment is reported here rather than as a
# KeyError further down.
needed_cols = ["UMLS_CUI", drug_col, ade_col, prr_col, age_col]
missing = [c for c in needed_cols if c not in faers_ade_cui.columns]
if missing:
    raise ValueError(f"FAERS ADE file is missing required columns after detection: {missing}")

faers_sel = (faers_ade_cui
             .rename(columns={drug_col:"drug_raw", ade_col:"ADE", prr_col:"PRR", age_col:"age_group"})
             [["UMLS_CUI","drug_raw","ADE","PRR","age_group"]])

# Join MarketScan prescriptions (one ms_rx row per CUI x age_group).
# validate="m:1" makes the join fail loudly instead of silently duplicating
# FAERS rows if that ever stops being true.
unified = faers_sel.merge(ms_rx, on=["UMLS_CUI","age_group"], how="left", validate="m:1")

# Join MPRINT publication counts (one pub_counts row per CUI)
unified = unified.merge(pub_counts, on="UMLS_CUI", how="left", validate="m:1")

# Attach Preferred_Term. pandas matches missing keys to each other, so
# crosswalk rows without a CUI are dropped first; one term is kept per CUI so
# that a CUI listed with several terms cannot duplicate FAERS rows.
key_simple = (key[["UMLS_CUI","Preferred_Term"]]
              .dropna(subset=["UMLS_CUI"])
              .drop_duplicates(subset=["UMLS_CUI"]))
unified = unified.merge(key_simple, on="UMLS_CUI", how="left", validate="m:1")

# Attach ATC if available. A CUI with several ATC codes yields one output row
# per code; CUI-less map rows are dropped for the same missing-key reason.
if not atc_map.empty:
    unified = unified.merge(atc_map.dropna(subset=["UMLS_CUI"]), on="UMLS_CUI", how="left")
else:
    unified["ATC_Code"] = pd.NA
    unified["ATC_Name"] = pd.NA

# Tidy types: unparseable PRR values become NaN; absent counts become 0
unified["PRR"] = pd.to_numeric(unified["PRR"], errors="coerce")
for count_col in ("prescriptions", "publication_count"):
    unified[count_col] = (pd.to_numeric(unified[count_col], errors="coerce")
                            .fillna(0)
                            .astype("Int64"))

unified.head(10)


## 8) Write output + quick QC


In [None]:
# Persist the unified table, creating the output directory if needed
OUT_UNIFIED.parent.mkdir(parents=True, exist_ok=True)
unified.to_csv(OUT_UNIFIED, index=False)

print("Wrote:", OUT_UNIFIED)
print("\nRow count:", len(unified))
print("\nSample:")
display(unified.head(20))

# Fraction of missing values per column
print("\nNull rates:")
display(unified.isna().mean().to_frame("null_rate").T)

print("\nTop PRR per drug (sample):")
# The preview is best-effort: when a column it needs is absent, say so
# explicitly instead of silently showing nothing.
qc_cols = ["Preferred_Term","ADE","age_group","PRR","prescriptions","publication_count"]
qc_missing = [c for c in qc_cols if c not in unified.columns]
if qc_missing:
    print(f"[info] Top-PRR preview skipped; missing columns: {qc_missing}")
else:
    display(unified.sort_values(["Preferred_Term","PRR"], ascending=[True, False]).head(20)[qc_cols])

OUT_UNIFIED
