# Age Distribution by Therapy Area in MIMIC3 Demo Dataset

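This notebook analyzes the distribution of patient age (at first admission) across therapy areas in the MIMIC-III demo dataset. Therapy areas are derived by mapping each patient's ICD-9-CM diagnosis codes to CCS categories and grouping those categories into broader areas.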

In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyhealth.datasets import MIMIC3Dataset
from pyhealth.medcode import CCSCM, CrossMap, ICD9CM
warnings.filterwarnings('ignore')

# Set style for plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Load the MIMIC3 demo dataset (dev mode keeps only a small subset)
print("Loading MIMIC3 demo dataset...")

demo_root = "https://physionet.org/files/mimiciii-demo/1.4/"
demo_tables = ["DIAGNOSES_ICD", "PATIENTS", "ADMISSIONS"]
mimic3_demo = MIMIC3Dataset(root=demo_root, tables=demo_tables, dev=True)

# Report how much was loaded before printing the library's own summary.
n_loaded = len(mimic3_demo.patients)
print(f"Loaded {n_loaded} patients")
print("Dataset statistics:")
mimic3_demo.stat()

In [None]:
# Extract patient data: age at first admission and diagnosis codes
print("Extracting patient data...")

# MIMIC-III de-identification shifts the date of birth of patients older than
# 89 so that they appear to be roughly 300 years old. Those ages are not real,
# so they are replaced by a single placeholder value.
MAX_RECORDED_AGE = 89
DEIDENTIFIED_AGE = 90.0
DAYS_PER_YEAR = 365.25


def _as_datetime(value):
    """Return `value` as a plain `datetime.datetime`, or None when it is missing.

    Accepts anything `pd.Timestamp` can parse (strings, datetimes, Timestamps).
    """
    if value is None or pd.isna(value) or value == "":
        return None
    parsed = pd.Timestamp(value)
    if parsed is pd.NaT:
        return None
    return parsed.to_pydatetime()


def _age_at_admission(dob, admittime):
    """Return the age in years at `admittime`, capping de-identified ages.

    Both arguments are plain datetimes. Subtracting plain datetimes (rather
    than pandas Timestamps) avoids the overflow that a nanosecond-resolution
    pandas Timedelta can hit on the ~300-year gap of de-identified patients.
    """
    age = (admittime - dob).days / DAYS_PER_YEAR
    return DEIDENTIFIED_AGE if age > MAX_RECORDED_AGE else age


patient_data = []

for patient in mimic3_demo.iter_patients():
    # Demographics and admissions are both required to compute an age.
    patient_info = patient.get_events(event_type="patients")
    admissions = patient.get_events(event_type="admissions")
    if not patient_info or not admissions:
        continue

    dob = _as_datetime(getattr(patient_info[0], 'dob', None))

    # Use the earliest admission explicitly instead of relying on the order in
    # which the admission events happen to be returned.
    admit_times = [
        parsed
        for parsed in (
            _as_datetime(getattr(admission, 'admittime', None))
            for admission in admissions
        )
        if parsed is not None
    ]
    if dob is None or not admit_times:
        continue

    age = _age_at_admission(dob, min(admit_times))

    # Collect all non-empty ICD codes recorded for this patient.
    diagnoses = patient.get_events(event_type="diagnoses_icd")
    icd_codes = [
        code
        for code in (getattr(diagnosis, 'icd9_code', '') for diagnosis in diagnoses)
        if code
    ]
    if not icd_codes:  # Only include patients with diagnoses
        continue

    patient_data.append({
        'patient_id': patient.patient_id,
        'age': age,
        'icd_codes': icd_codes
    })

print(f"Extracted data for {len(patient_data)} patients")

# Convert to DataFrame; explicit columns keep the schema stable even when no
# patient qualified, so later cells can still select columns by name.
df = pd.DataFrame(patient_data, columns=['patient_id', 'age', 'icd_codes'])
print(f"DataFrame shape: {df.shape}")
print(df.head())

In [None]:
# Define therapy area mapping based on CCS categories
print("Setting up therapy area mappings...")

# Load mapping from ICD9CM to CCSCM (CrossMap is imported in the import cell)
icd9_to_ccs = CrossMap.load("ICD9CM", "CCSCM")

# Mapping from therapy area to the single-level CCS category numbers it covers.
# The ranges follow the chapter boundaries of the HCUP single-level CCS for
# ICD-9-CM diagnoses.
# NOTE(review): verify these boundaries against the CCS category list shipped
# with the installed PyHealth version.
therapy_area_mapping = {
    # Oncology: neoplasms
    'Oncology': list(range(11, 48)),  # 11-47: Neoplasms

    # Cardiovascular
    'Cardiovascular': list(range(96, 122)),  # 96-121: Circulatory system

    # Neurology
    'Neurology': list(range(76, 96)),  # 76-95: Nervous system and sense organs

    # Metabolic/Endocrine
    'Metabolic/Endocrine': list(range(48, 59)),  # 48-58: Endocrine/nutritional/metabolic

    # Rare Diseases (Congenital anomalies)
    'Rare Diseases': list(range(213, 218)),  # 213-217: Congenital anomalies

    # Autoimmune/Inflammatory (Blood diseases)
    'Autoimmune/Inflammatory': list(range(59, 65)),  # 59-64: Blood diseases

    # Infections Diseases
    'Infections Diseases': list(range(1, 10)),  # 1-9: Infectious diseases

    # Pulmonary
    'Pulmonary': list(range(122, 135)),  # 122-134: Respiratory system

    # Renal (kidney-related part of Genitourinary, 156-175)
    'Renal': [156, 157, 158, 161],  # nephritis, acute renal failure, CKD, other kidney

    # Gastrointestinal
    'Gastrointestinal': list(range(135, 156)),  # 135-155: Digestive system
}

def map_icd_to_therapy_area(icd_codes, crossmap=None, area_mapping=None):
    """Map a list of ICD codes to therapy areas using CCS.

    Args:
        icd_codes: Iterable of ICD code strings for one patient.
        crossmap: Object with a `map(code)` method returning the CCS codes of
            an ICD code. Defaults to the notebook-level `icd9_to_ccs`.
        area_mapping: Dict of therapy area name -> container of integer CCS
            codes. Defaults to the notebook-level `therapy_area_mapping`.

    Returns:
        Alphabetically sorted list of the therapy areas hit by at least one
        code, or `['Other']` when no code maps to any configured area.
    """
    if crossmap is None:
        crossmap = icd9_to_ccs
    if area_mapping is None:
        area_mapping = therapy_area_mapping

    areas = set()
    for code in icd_codes:
        try:
            ccs_codes = crossmap.map(code)
        except Exception:
            # Best effort: a code the cross-map cannot handle is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

        for ccs_code in ccs_codes or ():
            if not ccs_code:
                continue
            try:
                ccs_num = int(ccs_code)
            except (TypeError, ValueError):
                # Skip only this CCS value; the remaining CCS codes of the
                # same ICD code are still considered.
                continue
            # Record the first therapy area whose CCS codes contain this one.
            for area, ccs_range in area_mapping.items():
                if ccs_num in ccs_range:
                    areas.add(area)
                    break

    # Sorted so the result does not depend on set iteration order.
    return sorted(areas) if areas else ['Other']

# Attach each patient's therapy areas as a new list-valued column
print("Mapping ICD codes to therapy areas...")
df['therapy_areas'] = df['icd_codes'].map(map_icd_to_therapy_area)

# A patient may fall into several areas, so unroll the list column:
# each (patient, therapy area) pair gets its own row.
df_exploded = df.explode(column='therapy_areas')

exploded_shape = df_exploded.shape
print(f"After exploding: {exploded_shape}")
print(df_exploded.head())

In [None]:
# Create age groups
print("Creating age groups...")

# Left-closed bins [lower, upper). The last edge is infinite so that the
# '75+' label really is open-ended; with an upper edge of 100, ages of 100 or
# more would silently become NaN.
age_bins = [0, 18, 30, 45, 60, 75, float('inf')]
age_labels = ['0-17', '18-29', '30-44', '45-59', '60-74', '75+']

df_exploded['age_group'] = pd.cut(df_exploded['age'], bins=age_bins, labels=age_labels, right=False)

# Surface any rows that still fall outside the bins (e.g. negative ages)
# instead of letting them disappear from the tables below.
n_unbinned = int(df_exploded['age_group'].isna().sum())
if n_unbinned:
    print(f"Warning: {n_unbinned} rows have an age outside the defined bins")

print("Age group distribution:")
print(df_exploded['age_group'].value_counts().sort_index())

print("\nTherapy area distribution:")
print(df_exploded['therapy_areas'].value_counts())

In [None]:
# Plot histogram of age distribution by therapy area
print("Creating histogram plot...")

# Filter out 'Other' category for cleaner visualization
plot_df = df_exploded[df_exploded['therapy_areas'] != 'Other']

# Explicit figure/axes so every call below targets this plot.
fig, ax = plt.subplots(figsize=(14, 10))

# Stacked histogram with one colour per therapy area
g = sns.histplot(
    data=plot_df,
    x='age',
    hue='therapy_areas',
    multiple='stack',
    bins=30,
    alpha=0.7,
    ax=ax
)

ax.set_title('Age Distribution by Therapy Area in MIMIC3 Demo Dataset', fontsize=16, fontweight='bold')
ax.set_xlabel('Age (years)', fontsize=14)
ax.set_ylabel('Number of Patients', fontsize=14)

# seaborn builds the hue legend itself; calling plt.legend() here would find
# no labelled artists and replace it with an empty legend. Reposition the
# existing legend instead (requires seaborn >= 0.11.2). The guard covers the
# case where nothing was plotted and therefore no legend exists.
if ax.get_legend() is not None:
    sns.move_legend(ax, 'upper left', bbox_to_anchor=(1.05, 1), title='Therapy Area')

fig.tight_layout()
plt.show()

In [None]:
# Print counts for each therapy area and age group
print("Counts for each therapy area and age group:")
print("=" * 60)

# Cross-tabulate therapy area (rows) against age group (columns), with totals
cross_tab = pd.crosstab(
    df_exploded['therapy_areas'],
    df_exploded['age_group'],
    margins=True,
    margins_name='Total'
)

print(cross_tab)

print("\n" + "=" * 60)
print("Summary:")

n_patients = len(df)
n_combinations = len(df_exploded)
print(f"Total patients: {n_patients}")
print(f"Total patient-therapy area combinations: {n_combinations}")

if n_patients:
    print(f"Average therapy areas per patient: {n_combinations / n_patients:.2f}")
    # Age statistics use the per-patient frame: the exploded frame repeats a
    # patient once per therapy area, which would bias the median towards
    # patients with many areas.
    patient_ages = df['age']
    print(f"Age range: {patient_ages.min():.1f} - {patient_ages.max():.1f} years")
    print(f"Median age: {patient_ages.median():.1f} years")
else:
    # Avoid a ZeroDivisionError and NaN statistics on an empty cohort.
    print("No patients available; per-patient statistics skipped.")