# MIMIC-IV Pharmacy Data Preprocessing
**Goal:** Clean and standardize the `pharmacy.csv` table to prepare it for Knowledge Graph construction.

### Tasks:
1. Load data and audit missing values.
2. Resolve missing medication names using `prescriptions.csv`.
3. Standardize medication strings (lowercase, stripping noise).

In [None]:
import os
print(os.getcwd())

In [99]:
import pandas as pd
import numpy as np

# 1. Load the pharmacy table (Original 17.8M rows)
df_pharm = pd.read_csv('pharmacy.csv', low_memory=False)
print(f"Initial Pharmacy Load: {len(df_pharm)} rows.")

# 2. Load the prescriptions table (The "Backup" data)
df_presc = pd.read_csv('prescriptions.csv', usecols=['pharmacy_id', 'drug', 'ndc'], low_memory=False)

# 3. Create 'Lookups' from prescriptions
# We drop duplicates to ensure 1 pharmacy_id only gives us 1 answer
name_lookup = df_presc.dropna(subset=['drug']).drop_duplicates('pharmacy_id').set_index('pharmacy_id')['drug']
ndc_lookup = df_presc.dropna(subset=['ndc']).drop_duplicates('pharmacy_id').set_index('pharmacy_id')['ndc']

# 4. Ensure the 'ndc' column exists in pharmacy before filling
if 'ndc' not in df_pharm.columns:
    df_pharm['ndc'] = np.nan

# 5. Fill the missing values WITHOUT adding new rows
# .map() looks up the value in the dictionary and plugs it into the empty spot
df_pharm['medication'] = df_pharm['medication'].fillna(df_pharm['pharmacy_id'].map(name_lookup))
df_pharm['ndc'] = df_pharm['ndc'].fillna(df_pharm['pharmacy_id'].map(ndc_lookup))

# 6. Final Cleanup
# Drop rows where we couldn't find a name in either table
df_pharm.dropna(subset=['medication'], inplace=True)

print(f"Processing Complete. Final Row Count: {len(df_pharm)}")

Initial Pharmacy Load: 17847567 rows.
Processing Complete. Final Row Count: 17779115


In [100]:
import re
def get_base_name(name):
    if pd.isna(name): return name
    name = str(name).lower().strip()
    
    # 1. Remove parentheses content
    name = re.sub(r'\(.*?\)', '', name)
    
    # 2. Improved Noise Patterns (handles spaces and decimals)
    noise_patterns = [
        r'\d+\.?\d*\s?mg', r'\d+\.?\d*\s?mcg', r'\d+\.?\d*\s?%', # 500mg, 0.9 %, 10mcg
        r'\d+\s?unit', r'tab', r'cap', r'liquid', r'vial', 
        r'syringe', r'iv', r'po', r'prn', r'ext', r'ec'
    ]
    
    for pattern in noise_patterns:
        name = re.sub(pattern, '', name)
    
    # 3. Clean up extra spaces left behind
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

# Now apply it to your stable 17.8M row dataframe
df_pharm['medication_base'] = df_pharm['medication'].apply(get_base_name)

In [101]:
# Filter for rows with no NDC, then count the medication names
missing_ndc_list = df_pharm[df_pharm['ndc'].isna()]['medication_base'].value_counts()

print("Top 20 Medications missing NDC codes:")
print(missing_ndc_list.head(20))

Top 20 Medications missing NDC codes:
medication_base
insulin pump           2882
sodium chloride        1743
profol                  528
symbicort               516
phenylephrine           335
norepinephrine          276
venetoclax              249
ibrutinib               240
tassium chloride        237
rytary                  220
fentanyl                218
ruxolitinib             206
profol /100ml 100ml     175
combigan                164
melatonin               161
midazolam               161
levemir                 157
lumigan                 154
sodium bicarbonate      144
acetaminophen           142
Name: count, dtype: int64


In [102]:
# 1. Total count of missing NDCs
missing_count = df_pharm['ndc'].isna().sum()
total_rows = len(df_pharm)
percent_missing = (missing_count / total_rows) * 100

print(f"Total Rows: {total_rows:,}")
print(f"Rows missing NDC: {missing_count:,}")
print(f"Percentage missing: {percent_missing:.2f}%")

# 2. Top medications missing codes
missing_ndc_by_med = df_pharm[df_pharm['ndc'].isna()]['medication_base'].value_counts()
print("\nTop 10 medications without barcodes:")
print(missing_ndc_by_med.head(10))

Total Rows: 17,779,115
Rows missing NDC: 33,523
Percentage missing: 0.19%

Top 10 medications without barcodes:
medication_base
insulin pump        2882
sodium chloride     1743
profol               528
symbicort            516
phenylephrine        335
norepinephrine       276
venetoclax           249
ibrutinib            240
tassium chloride     237
rytary               220
Name: count, dtype: int64


# This step recovers missing barcodes (NDCs) by looking at other rows where the same medication did have a code.

In [104]:
# 1. Create a map of Base Name -> Most Common NDC
ndc_ref = df_pharm.dropna(subset=['ndc']).drop_duplicates('medication_base').set_index('medication_base')['ndc']

# 2. Fill the remaining blanks using the clean names
df_pharm['ndc'] = df_pharm['ndc'].fillna(df_pharm['medication_base'].map(ndc_ref))

print(f"Remaining missing NDCs: {df_pharm['ndc'].isna().sum()}")

Remaining missing NDCs: 10747


In [105]:
import requests
import time

ndcs = ["51079007320", "00002002701"]

for i, ndc in enumerate(ndcs):
    url = f"https://rxnav.nlm.nih.gov/REST/rxcui.json?idtype=NDC&id={ndc}"
    print(f"[{i+1}] Querying {ndc}")

    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        print(r.json())
    except Exception as e:
        print(f"Error for {ndc}: {e}")

    time.sleep(0.1)


[1] Querying 51079007320
{'idGroup': {'rxnormId': ['313988']}}
[2] Querying 00002002701
{'idGroup': {}}


In [106]:
df_pharm[['ndc','medication_base']]

Unnamed: 0,ndc,medication_base
0,51079007320,furosemide
1,487980125,ipratropium bromide neb
2,51079007220,furosemide
3,245004101,tassium chloride
4,0,sodium chloride flush
...,...,...
17847562,0,calcium gluconate sliding scale
17847563,63323029766,profol
17847564,338004938,levetiracetam
17847565,51079000220,acetaminophen


In [107]:
acet_ndc = (
    df_pharm[
        df_pharm['medication_base']
            .astype(str)
            .str.lower()
            .str.contains('acetaminophen', na=False)
    ]['ndc']
)

acet_ndc


7            904198861
20           904198861
96         43825010201
99           121197100
109          904198861
               ...    
17847361   43825010201
17847385     904198261
17847446     182844789
17847491     182844789
17847565   51079000220
Name: ndc, Length: 778167, dtype: float64

In [108]:
unique_acet_ndc = acet_ndc.dropna().astype(str).unique()

unique_acet_ndc



array(['904198861.0', '43825010201.0', '121197100.0', '182844789.0',
       '406035762.0', '51079000220.0', '713016550.0', '121065721.0',
       '182845389.0', '406051262.0', '63481062375.0', '904677361.0',
       '904198261.0', '54864816.0', '904682076.0', '0.0', '904673061.0',
       '904643761.0', '24201010024.0', '68084039665.0', '50268064415.0',
       '603254421.0', '591336901.0', '143178701.0', '68084035501.0',
       '904641961.0', '68084039601.0', '51079016120.0', '63739032610.0',
       '406036562.0', '781315695.0', '904656761.0', '50580041202.0',
       '45802073030.0', '406048462.0', '904693806.0', '45802073032.0',
       '406051201.0', '50268040115.0', '121050410.0', '904653806.0',
       '904682461.0', '904525530.0', '36000030660.0', '54569100100.0',
       '338001711.0', '338004938.0', '50090021500.0', '54368663.0',
       '54569281400.0', '338001702.0', '19810000790.0', '527155201.0',
       '338001738.0', '338004911.0'], dtype=object)

In [109]:
import requests
import time

RXNAV_BASE = "https://rxnav.nlm.nih.gov/REST"

def ndc_to_rxcui(ndc, timeout=10):
    """
    Map a single NDC to RxCUI using RxNav.
    Returns RxCUI string or None if not found.
    """
    url = f"{RXNAV_BASE}/rxcui.json"
    params = {
        "idtype": "NDC",
        "id": ndc
    }

    try:
        r = requests.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        data = r.json()

        rxnorm_ids = data.get("idGroup", {}).get("rxnormId", [])
        return rxnorm_ids[0] if rxnorm_ids else None

    except requests.RequestException as e:
        print(f"Request failed for NDC {ndc}: {e}")
        return None


def map_ndcs_to_rxnorm(ndc_list, sleep=0.1):
    """
    Map a list of NDCs to RxCUIs.
    Returns a dict: {ndc: rxcui_or_None}
    """
    mapping = {}

    for i, ndc in enumerate(ndc_list, start=1):
        print(f"[{i}/{len(ndc_list)}] Mapping NDC {ndc}")
        mapping[ndc] = ndc_to_rxcui(ndc)
        time.sleep(sleep)  # be polite to RxNav

    return mapping


In [110]:

# Suppose your DataFrame column is 'ndc'
ndc_list = (
    df_pharm["ndc"]
    .astype(str)                       # convert float to string
    .str.replace(".0$", "", regex=True)  # remove trailing '.0'
    .str.replace("-", "", regex=False)   # remove dashes if any
    .str.zfill(11)                        # pad to 11 digits
    .unique()
    .tolist()
)


In [111]:
unmapped = sum(
    1 for rxcui in ndc_rxnorm_map.values() if rxcui is None
)

print(f"Unmapped NDCs: {unmapped}")



Unmapped NDCs: 2051


In [112]:
print(ndc_list[:10])


['51079007320', '00487980125', '51079007220', '00245004101', '00000000000', '00006022761', '63739054410', '00904198861', '19515089452', '00173068224']


In [113]:
import pandas as pd
import numpy as np
import re
import requests
import time

# -----------------------------
# 1. Load the pharmacy and prescription tables
# -----------------------------
df_pharm = pd.read_csv('pharmacy.csv', low_memory=False)
print(f"Initial Pharmacy Load: {len(df_pharm)} rows.")

df_presc = pd.read_csv('prescriptions.csv', usecols=['pharmacy_id', 'drug', 'ndc'], low_memory=False)

# -----------------------------
# 2. Create lookup dictionaries from prescriptions
# -----------------------------
name_lookup = df_presc.dropna(subset=['drug']).drop_duplicates('pharmacy_id').set_index('pharmacy_id')['drug']
ndc_lookup = df_presc.dropna(subset=['ndc']).drop_duplicates('pharmacy_id').set_index('pharmacy_id')['ndc']

# -----------------------------
# 3. Ensure the 'ndc' column exists in pharmacy table
# -----------------------------
if 'ndc' not in df_pharm.columns:
    df_pharm['ndc'] = np.nan

# -----------------------------
# 4. Fill missing medication and NDC info
# -----------------------------
df_pharm['medication'] = df_pharm['medication'].fillna(df_pharm['pharmacy_id'].map(name_lookup))
df_pharm['ndc'] = df_pharm['ndc'].fillna(df_pharm['pharmacy_id'].map(ndc_lookup))

# Drop rows where medication name is still missing
df_pharm.dropna(subset=['medication'], inplace=True)
print(f"After filling, row count: {len(df_pharm)}")

# -----------------------------
# 5. Clean medication names
# -----------------------------
def get_base_name(name):
    if pd.isna(name):
        return name
    name = str(name).lower().strip()
    # Remove parentheses
    name = re.sub(r'\(.*?\)', '', name)
    # Remove dosage forms, units, tabs, caps, vials, IV/PO, PRN, etc.
    noise_patterns = [
        r'\d+\.?\d*\s?mg', r'\d+\.?\d*\s?mcg', r'\d+\.?\d*\s?%', 
        r'\d+\s?unit', r'tab', r'cap', r'liquid', r'vial', 
        r'syringe', r'iv', r'po', r'prn', r'ext', r'ec'
    ]
    for pattern in noise_patterns:
        name = re.sub(pattern, '', name)
    # Collapse multiple spaces
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

df_pharm['medication_base'] = df_pharm['medication'].apply(get_base_name)

# -----------------------------
# 6. Clean and normalize NDCs
# -----------------------------
# Convert to string
df_pharm['ndc'] = df_pharm['ndc'].astype(str)
# Remove trailing '.0' from floats
df_pharm['ndc'] = df_pharm['ndc'].str.replace(r'\.0$', '', regex=True)
# Remove non-digit characters (dashes, spaces)
df_pharm['ndc'] = df_pharm['ndc'].str.replace(r'\D', '', regex=True)
# Pad to 11 digits
df_pharm['ndc'] = df_pharm['ndc'].str.zfill(11)
# Drop invalid NDCs
df_pharm = df_pharm[df_pharm['ndc'].str.strip('0') != '']

# -----------------------------
# 7. Extract unique NDCs for mapping
# -----------------------------
ndc_list = df_pharm['ndc'].dropna().unique().tolist()
print(f"Unique NDCs to map: {len(ndc_list)}")
print("Sample NDCs:", ndc_list[:10])

# -----------------------------
# 8. Define RxNav mapping functions
# -----------------------------
RXNAV_BASE = "https://rxnav.nlm.nih.gov/REST"

def ndc_to_rxcui(ndc, timeout=10):
    """Map a single NDC to RxCUI using RxNav."""
    url = f"{RXNAV_BASE}/rxcui.json"
    params = {"idtype": "NDC", "id": ndc}
    try:
        r = requests.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        rxnorm_ids = data.get("idGroup", {}).get("rxnormId", [])
        return rxnorm_ids[0] if rxnorm_ids else None
    except requests.RequestException as e:
        print(f"Request failed for NDC {ndc}: {e}")
        return None

def map_ndcs_to_rxnorm(ndc_list, sleep=0.1):
    """Map a list of NDCs to RxCUIs."""
    mapping = {}
    for i, ndc in enumerate(ndc_list, start=1):
        print(f"[{i}/{len(ndc_list)}] Mapping NDC {ndc}")
        mapping[ndc] = ndc_to_rxcui(ndc)
        time.sleep(sleep)  # polite to RxNav
    return mapping

# -----------------------------
# 9. Map all NDCs to RxCUI
# -----------------------------
ndc_rxnorm_map = map_ndcs_to_rxnorm(ndc_list)

# -----------------------------
# 10. Count unmapped NDCs
# -----------------------------
unmapped_count = sum(1 for rxcui in ndc_rxnorm_map.values() if rxcui is None)
print(f"Unmapped NDCs: {unmapped_count}")

# -----------------------------
# 11. Attach RxCUIs back to the main DataFrame
# -----------------------------
df_pharm['rxcui'] = df_pharm['ndc'].map(ndc_rxnorm_map)

# -----------------------------
# 12. Optional: Save preprocessed dataframe
# -----------------------------
df_pharm.to_csv("pharmacy_preprocessed.csv", index=False)
print("Preprocessed DataFrame saved as 'pharmacy_preprocessed.csv'")


Initial Pharmacy Load: 17847567 rows.
After filling, row count: 17779115
Unique NDCs to map: 5668
Sample NDCs: ['51079007320', '00487980125', '51079007220', '00245004101', '00006022761', '63739054410', '00904198861', '19515089452', '00173068224', '61958070101']
[1/5668] Mapping NDC 51079007320
[2/5668] Mapping NDC 00487980125
[3/5668] Mapping NDC 51079007220
[4/5668] Mapping NDC 00245004101
[5/5668] Mapping NDC 00006022761
[6/5668] Mapping NDC 63739054410
[7/5668] Mapping NDC 00904198861
[8/5668] Mapping NDC 19515089452
[9/5668] Mapping NDC 00173068224
[10/5668] Mapping NDC 61958070101
[11/5668] Mapping NDC 63323026201
[12/5668] Mapping NDC 00135019502
[13/5668] Mapping NDC 65649030303
[14/5668] Mapping NDC 00054812025
[15/5668] Mapping NDC 67467064301
[16/5668] Mapping NDC 00597007575
[17/5668] Mapping NDC 00121457735
[18/5668] Mapping NDC 00904272561
[19/5668] Mapping NDC 46287000660
[20/5668] Mapping NDC 60505260400
[21/5668] Mapping NDC 00173071920
[22/5668] Mapping NDC 57664037708

In [114]:
df=pd.read_csv('pharmacy_preprocessed.csv')

  df=pd.read_csv('pharmacy_preprocessed.csv')


In [116]:
df.columns

Index(['subject_id', 'hadm_id', 'pharmacy_id', 'poe_id', 'starttime',
       'stoptime', 'medication', 'proc_type', 'status', 'entertime',
       'verifiedtime', 'route', 'frequency', 'disp_sched', 'infusion_type',
       'sliding_scale', 'lockout_interval', 'basal_rate', 'one_hr_max',
       'doses_per_24_hrs', 'duration', 'duration_interval', 'expiration_value',
       'expiration_unit', 'expirationdate', 'dispensation', 'fill_quantity',
       'ndc', 'medication_base', 'rxcui'],
      dtype='object')

In [118]:
empty_nan = df['rxcui'].isnull()


In [120]:
print("Number of NaN empty cells:", empty_nan.sum())


Number of NaN empty cells: 4988349


In [122]:

# Total number of rows
total_rows = len(df)

# Count empty cells (NaN or empty string)
empty_cells = df['rxcui'].isnull() | (df['rxcui'] == "")
num_empty = empty_cells.sum()

# Fraction or percentage of empty cells
fraction_empty = num_empty / total_rows
percent_empty = fraction_empty * 100

print(f"Empty cells: {num_empty} out of {total_rows} rows")
print(f"Fraction empty: {fraction_empty:.2f}")
print(f"Percentage empty: {percent_empty:.2f}%")


Empty cells: 4988349 out of 15264080 rows
Fraction empty: 0.33
Percentage empty: 32.68%


In [126]:
import pandas as pd

df = df_pharm.copy()  # your dataframe

drug_nodes = (
    df[['ndc','medication_base','rxcui','medication','route','proc_type']]
      .dropna(subset=['ndc'])
      .drop_duplicates()
      .groupby('ndc', as_index=False)
      .first()
      .rename(columns={'ndc': 'ndc:ID(Drug)'})
)

drug_nodes.to_csv("drug_nodes.csv", index=False)
print("âœ… drug_nodes.csv", len(drug_nodes))


âœ… drug_nodes.csv 5668


In [127]:
import pandas as pd

df = df_pharm.copy()

adm_drug_edges = (
    df[['hadm_id','ndc','starttime','stoptime','status','route','frequency','dispensation','fill_quantity','pharmacy_id','poe_id']]
      .dropna(subset=['hadm_id','ndc'])
      .drop_duplicates(subset=['hadm_id','ndc'])   # keep ONE edge per admission+ndc (simple)
      .rename(columns={
          'hadm_id': ':START_ID(Admission)',
          'ndc': ':END_ID(Drug)'
      })
)

adm_drug_edges.to_csv("admission_administered_drug.csv", index=False)
print("âœ… admission_administered_drug.csv", len(adm_drug_edges))


âœ… admission_administered_drug.csv 10136919
