In [2]:
import os
print(os.getcwd())

/Users/fariham/Downloads/mimic-iv-3.1/hosp


In [4]:
import pandas as pd
import numpy as np

# 1. Load the pharmacy table (Original 17.8M rows)
df_pharm = pd.read_csv('pharmacy.csv', low_memory=False)
print(f"Initial Pharmacy Load: {len(df_pharm)} rows.")

# 2. Load the prescriptions table (The "Backup" data)
df_presc = pd.read_csv('prescriptions.csv', usecols=['pharmacy_id', 'drug', 'ndc'], low_memory=False)

# 3. Create 'Lookups' from prescriptions
# We drop duplicates to ensure 1 pharmacy_id only gives us 1 answer
name_lookup = df_presc.dropna(subset=['drug']).drop_duplicates('pharmacy_id').set_index('pharmacy_id')['drug']
ndc_lookup = df_presc.dropna(subset=['ndc']).drop_duplicates('pharmacy_id').set_index('pharmacy_id')['ndc']

# 4. Ensure the 'ndc' column exists in pharmacy before filling
if 'ndc' not in df_pharm.columns:
    df_pharm['ndc'] = np.nan

# 5. Fill the missing values WITHOUT adding new rows
# .map() looks up the value in the dictionary and plugs it into the empty spot
df_pharm['medication'] = df_pharm['medication'].fillna(df_pharm['pharmacy_id'].map(name_lookup))
df_pharm['ndc'] = df_pharm['ndc'].fillna(df_pharm['pharmacy_id'].map(ndc_lookup))

# 6. Final Cleanup
# Drop rows where we couldn't find a name in either table
df_pharm.dropna(subset=['medication'], inplace=True)

print(f"Processing Complete. Final Row Count: {len(df_pharm)}")

Initial Pharmacy Load: 17847567 rows.
Processing Complete. Final Row Count: 17779115


In [8]:
import re
def get_base_name(name):
    if pd.isna(name): return name
    name = str(name).lower().strip()
    
    # 1. Remove parentheses content
    name = re.sub(r'\(.*?\)', '', name)
    
    # 2. Improved Noise Patterns (handles spaces and decimals)
    noise_patterns = [
        r'\d+\.?\d*\s?mg', r'\d+\.?\d*\s?mcg', r'\d+\.?\d*\s?%', # 500mg, 0.9 %, 10mcg
        r'\d+\s?unit', r'tab', r'cap', r'liquid', r'vial', 
        r'syringe', r'iv', r'po', r'prn', r'ext', r'ec'
    ]
    
    for pattern in noise_patterns:
        name = re.sub(pattern, '', name)
    
    # 3. Clean up extra spaces left behind
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

# Now apply it to your stable 17.8M row dataframe
df_pharm['medication_base'] = df_pharm['medication'].apply(get_base_name)

In [10]:
# Filter for rows with no NDC, then count the medication names
missing_ndc_list = df_pharm[df_pharm['ndc'].isna()]['medication_base'].value_counts()

print("Top 20 Medications missing NDC codes:")
print(missing_ndc_list.head(20))

Top 20 Medications missing NDC codes:
medication_base
insulin pump           2882
sodium chloride        1743
profol                  528
symbicort               516
phenylephrine           335
norepinephrine          276
venetoclax              249
ibrutinib               240
tassium chloride        237
rytary                  220
fentanyl                218
ruxolitinib             206
profol /100ml 100ml     175
combigan                164
melatonin               161
midazolam               161
levemir                 157
lumigan                 154
sodium bicarbonate      144
acetaminophen           142
Name: count, dtype: int64


In [12]:
# 1. Total count of missing NDCs
missing_count = df_pharm['ndc'].isna().sum()
total_rows = len(df_pharm)
percent_missing = (missing_count / total_rows) * 100

print(f"Total Rows: {total_rows:,}")
print(f"Rows missing NDC: {missing_count:,}")
print(f"Percentage missing: {percent_missing:.2f}%")

# 2. Top medications missing codes
missing_ndc_by_med = df_pharm[df_pharm['ndc'].isna()]['medication_base'].value_counts()
print("\nTop 10 medications without barcodes:")
print(missing_ndc_by_med.head(10))

Total Rows: 17,779,115
Rows missing NDC: 33,523
Percentage missing: 0.19%

Top 10 medications without barcodes:
medication_base
insulin pump        2882
sodium chloride     1743
profol               528
symbicort            516
phenylephrine        335
norepinephrine       276
venetoclax           249
ibrutinib            240
tassium chloride     237
rytary               220
Name: count, dtype: int64


# This step recovers missing barcodes (NDCs) by looking at other rows where the same medication did have a code.

In [14]:
# 1. Create a map of Base Name -> Most Common NDC
ndc_ref = df_pharm.dropna(subset=['ndc']).drop_duplicates('medication_base').set_index('medication_base')['ndc']

# 2. Fill the remaining blanks using the clean names
df_pharm['ndc'] = df_pharm['ndc'].fillna(df_pharm['medication_base'].map(ndc_ref))

print(f"Remaining missing NDCs: {df_pharm['ndc'].isna().sum()}")

Remaining missing NDCs: 10747
