# Subgroup Analysis Example

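This notebook demonstrates a cost-effectiveness subgroup analysis. Using mock PSA draws, it compares treatment strategies (ECT, IV-KA, PO-KA) on mean cost, mean effect, ICER, and net monetary benefit across patient subgroups defined by age group and severity.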

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the project's helper modules (scripts/, scripts/models/, scripts/core/)
# importable; the entries are relative paths built from os.pardir
sys.path.extend(
    os.path.join(os.pardir, *parts)
    for parts in (('scripts',), ('scripts', 'models'), ('scripts', 'core'))
)

In [None]:
# Define a function for subgroup analysis
def perform_subgroup_analysis(psa_data, subgroups, treatment_cols=('strategy', 'cost', 'effect'), wtp=50000):
    """
    Summarise cost, effect and net monetary benefit per subgroup and strategy.

    Rows of `psa_data` are grouped by every subgroup column plus the strategy
    column; each group is reduced to its mean and sample standard deviation of
    cost and effect, and a net monetary benefit computed from the two means.

    Args:
        psa_data: DataFrame with PSA results including subgroup indicators
        subgroups: Subgroup column name(s); a single name or any iterable of
            names. May be empty, in which case rows are grouped by strategy only.
        treatment_cols: Exactly three column names, in the order
            (strategy, cost, effect)
        wtp: Willingness-to-pay value used in
            net_monetary_benefit = mean_effect * wtp - mean_cost

    Returns:
        DataFrame with one row per (subgroups..., strategy) combination and the
        columns: the grouping columns, 'mean_cost', 'mean_effect', 'cost_std',
        'effect_std', 'net_monetary_benefit'

    Raises:
        ValueError: if `treatment_cols` does not contain exactly three names
        KeyError: if a requested column is missing (raised by pandas)
    """
    # Accept a bare column name as well as a list/tuple of names
    if isinstance(subgroups, str):
        subgroups = [subgroups]

    treatment_cols = list(treatment_cols)
    if len(treatment_cols) != 3:
        raise ValueError(
            "treatment_cols must name exactly three columns: (strategy, cost, effect)"
        )
    strategy_col, cost_col, effect_col = treatment_cols

    # Build a new list so the caller's `subgroups` object is never modified
    grouping_cols = list(subgroups) + [strategy_col]

    # Named aggregation keeps the group keys as real columns after reset_index,
    # so labels stay correct no matter how pandas represents a single-column key
    summary = (
        psa_data.groupby(grouping_cols)
        .agg(
            mean_cost=(cost_col, 'mean'),
            mean_effect=(effect_col, 'mean'),
            cost_std=(cost_col, 'std'),
            effect_std=(effect_col, 'std'),
        )
        .reset_index()
    )

    # Calculate net monetary benefit from the group means
    summary['net_monetary_benefit'] = summary['mean_effect'] * wtp - summary['mean_cost']

    return summary

In [None]:
# Create mock PSA data with subgroups
np.random.seed(42)
n_draws = 1000
strategies = ['ECT', 'IV-KA', 'PO-KA']

# Base (cost, effect) for each strategy before any subgroup adjustment
base_params = {'ECT': (5000, 0.6), 'IV-KA': (7000, 0.8), 'PO-KA': (6000, 0.7)}

# (effect_multiplier, cost_multiplier) for each level of each subgroup dimension
age_adjustments = {
    'young': (1.1, 0.95),   # Younger patients have better response
    'old': (0.9, 1.05),     # Older patients have reduced response
}
severity_adjustments = {
    'low': (1.0, 0.9),      # Less severe cases are cheaper to treat
    'high': (0.95, 1.1),    # More severe cases cost more
}

# Every patient belongs to one age group AND one severity level, so draws are
# generated for the full age x severity cross. Generating one-dimensional
# labels and filling the other column with a fallback would leave the
# (young, low) cell empty and mix two distributions into (old, high).
subgroups = [(age_group, severity)
             for age_group in age_adjustments
             for severity in severity_adjustments]

mock_data = []

# Generate data for each strategy and subgroup
for strategy in strategies:
    base_cost, base_effect = base_params[strategy]

    for age_group, severity in subgroups:
        age_effect_mult, age_cost_mult = age_adjustments[age_group]
        sev_effect_mult, sev_cost_mult = severity_adjustments[severity]

        # Age and severity adjustments combine multiplicatively
        effect_multiplier = age_effect_mult * sev_effect_mult
        cost_multiplier = age_cost_mult * sev_cost_mult

        # Generate all draws for this cell at once, with some variance
        cost = np.random.normal(base_cost * cost_multiplier, base_cost * 0.1, size=n_draws)
        effect = np.random.normal(base_effect * effect_multiplier, base_effect * 0.05, size=n_draws)

        mock_data.append(pd.DataFrame({
            'draw': np.arange(n_draws),
            'strategy': strategy,
            'age_group': age_group,
            'severity': severity,
            # Floor the draws so cost and effect stay strictly positive
            'cost': np.maximum(cost, 100),
            'effect': np.maximum(effect, 0.01),
        }))

# Concatenate once instead of growing a frame row by row
psa_df = pd.concat(mock_data, ignore_index=True)
psa_df['perspective'] = 'health_system'

# Summarise cost, effect and NMB for each age group / severity / strategy cell
subgroup_results = perform_subgroup_analysis(
    psa_data=psa_df,
    subgroups=['age_group', 'severity'],
)
print(subgroup_results.head(10))

In [None]:
# Calculate relative effectiveness and cost-effectiveness within subgroups
ref_strategy = 'ECT'  # Reference strategy
subgroup_keys = ['age_group', 'severity']

# Lookup table: one row of reference values per subgroup (first match wins)
ref_values = (
    subgroup_results.loc[
        subgroup_results['strategy'] == ref_strategy,
        subgroup_keys + ['mean_cost', 'mean_effect', 'net_monetary_benefit'],
    ]
    .drop_duplicates(subset=subgroup_keys)
    .rename(columns={
        'mean_cost': 'ref_cost',
        'mean_effect': 'ref_effect',
        'net_monetary_benefit': 'ref_nmb',
    })
)

# A single left join attaches the reference values to every row; this replaces
# a per-row scan of the whole frame. Only the needed columns are selected so
# the cell can be re-run after the result columns already exist.
paired = subgroup_results[
    subgroup_keys + ['strategy', 'mean_cost', 'mean_effect', 'net_monetary_benefit']
].merge(ref_values, on=subgroup_keys, how='left', indicator=True)
paired.index = subgroup_results.index  # merge resets the index; restore alignment

# Incremental values exist only for non-reference rows whose subgroup has a reference row
has_comparison = (paired['strategy'] != ref_strategy) & (paired['_merge'] == 'both')

# Rows without a comparison keep the defaults: 0.0 increments and an infinite ICER
inc_cost = (paired['mean_cost'] - paired['ref_cost']).where(has_comparison, 0.0)
inc_effect = (paired['mean_effect'] - paired['ref_effect']).where(has_comparison, 0.0)
nmb_vs_ref = (paired['net_monetary_benefit'] - paired['ref_nmb']).where(has_comparison, 0.0)

# ICER is undefined when there is no incremental effect; report it as infinite
icer = (inc_cost / inc_effect).where(has_comparison & (inc_effect != 0), np.inf)

subgroup_results['inc_cost'] = inc_cost
subgroup_results['inc_effect'] = inc_effect
subgroup_results['icer'] = icer
subgroup_results['nmb_vs_ref'] = nmb_vs_ref

print(subgroup_results[['strategy', 'age_group', 'severity', 'mean_cost', 'mean_effect', 'inc_cost', 'inc_effect', 'icer', 'nmb_vs_ref']].head(10))

In [None]:
# Visualize subgroup analysis
fig, ax = plt.subplots(2, 2, figsize=(16, 12))


def _pivot_by_subgroup(data, value_col):
    """Return a strategy x (age_group, severity) table of the mean of `value_col`."""
    return data.pivot_table(
        values=value_col,
        index='strategy',
        columns=['age_group', 'severity'],
        aggfunc='mean'
    )


# 1. Cost by strategy and subgroup
cost_pivot = _pivot_by_subgroup(subgroup_results, 'mean_cost')
sns.heatmap(cost_pivot, annot=True, fmt='.0f', cbar_kws={'label': 'Cost ($AUD)'}, ax=ax[0,0])
ax[0,0].set_title('Mean Cost by Strategy and Subgroup')

# 2. Effect by strategy and subgroup
effect_pivot = _pivot_by_subgroup(subgroup_results, 'mean_effect')
sns.heatmap(effect_pivot, annot=True, fmt='.3f', cbar_kws={'label': 'Effect (QALYs)'}, ax=ax[0,1])
ax[0,1].set_title('Mean Effect by Strategy and Subgroup')

# 3. ICER by strategy and subgroup (excluding reference)
icer_data = subgroup_results[subgroup_results['strategy'] != ref_strategy]
# Infinite ICERs (no incremental effect) become NaN so they are left blank
# instead of stretching the colour scale to infinity
icer_pivot = _pivot_by_subgroup(icer_data, 'icer').replace([np.inf, -np.inf], np.nan)
# cbar_kws is passed exactly once; repeating a keyword argument is a SyntaxError
sns.heatmap(icer_pivot, annot=True, fmt='.0f', cbar_kws={'label': 'ICER ($/QALY)'}, ax=ax[1,0])
ax[1,0].set_title(f'ICER vs {ref_strategy} by Strategy and Subgroup')

# 4. NMB vs reference by strategy and subgroup
nmb_pivot = _pivot_by_subgroup(icer_data, 'nmb_vs_ref')
# Diverging palette centred on 0 separates gains from losses relative to the reference
sns.heatmap(nmb_pivot, annot=True, fmt='.0f', cbar_kws={'label': 'NMB vs Ref ($AUD)'}, ax=ax[1,1],
            center=0, cmap='RdBu_r')
ax[1,1].set_title(f'Net Monetary Benefit vs {ref_strategy} by Strategy and Subgroup')

plt.tight_layout()
plt.show()

In [None]:
# Create forest plot for ICER by subgroup
forest_wtp = 50000  # position of the vertical willingness-to-pay reference line

# Prepare data for forest plot
non_ref_data = subgroup_results[subgroup_results['strategy'] != ref_strategy]
non_ref_data = non_ref_data.sort_values(['age_group', 'severity', 'strategy'])

# Infinite or missing ICERs cannot be placed on a numeric axis, so they are left out
plottable = non_ref_data[np.isfinite(non_ref_data['icer'])]
n_skipped = len(non_ref_data) - len(plottable)
if n_skipped:
    print(f"Note: {n_skipped} comparison(s) without a finite ICER are not shown")

# One label, ICER and strategy per plotted point. The strategy array has its
# own name so the `strategies` list from the data-generation cell is not overwritten.
labels = [
    f"{strategy} ({age_group}, {severity})"
    for strategy, age_group, severity in zip(
        plottable['strategy'], plottable['age_group'], plottable['severity']
    )
]
icers = plottable['icer'].to_numpy()
point_strategies = plottable['strategy'].to_numpy()
y_pos = np.arange(len(labels))

# Create the forest plot
fig, forest_ax = plt.subplots(figsize=(14, 10))
forest_ax.hlines(y_pos, 0, icers, colors='lightgray', alpha=0.5)

# One scatter call per strategy so each colour gets its own legend entry
for strategy_name in pd.unique(point_strategies):
    is_strategy = point_strategies == strategy_name
    forest_ax.scatter(
        icers[is_strategy], y_pos[is_strategy],
        c='blue' if strategy_name == 'IV-KA' else 'green',
        s=100, alpha=0.7, edgecolors='black', label=strategy_name,
    )

forest_ax.axvline(x=forest_wtp, color='red', linestyle='--',
                  label=f'WTP Threshold (${forest_wtp:,.0f}/QALY)', alpha=0.7)

forest_ax.set_yticks(y_pos)
forest_ax.set_yticklabels(labels)
forest_ax.set_xlabel('ICER ($/QALY)')
forest_ax.set_title('ICER by Strategy and Subgroup (Forest Plot)')
forest_ax.grid(True, alpha=0.3)
forest_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Calculate subgroup-specific recommendations
wtp_threshold = 50000

print("Subgroup-Specific Recommendations")
print("="*50)

# sort=False keeps the subgroups in their order of first appearance
for (age, severity), sub_data in subgroup_results.groupby(['age_group', 'severity'], sort=False):
    print(f"\nSubgroup: Age Group={age}, Severity={severity}")

    comparators = sub_data[sub_data['strategy'] != ref_strategy].copy()

    # Decide on incremental net monetary benefit at the threshold rather than on
    # the ICER alone: a negative ICER can mean "dominant" (cheaper and better) or
    # "dominated", and a small positive ICER can belong to a cheaper-but-worse
    # strategy, so the bare ratio misclassifies those cases.
    comparators['inc_nmb'] = (
        comparators['inc_effect'] * wtp_threshold - comparators['inc_cost']
    )

    # Identify cost-effective strategies
    ce_strategies = comparators[comparators['inc_nmb'] > 0]

    if not ce_strategies.empty:
        print(f"  Cost-effective strategies compared to {ref_strategy}:")
        for _, row in ce_strategies.iterrows():
            if row['inc_cost'] <= 0 and row['inc_effect'] >= 0:
                # Cheaper and at least as effective: the ratio itself is not informative
                icer_text = "dominant"
            elif row['inc_effect'] < 0:
                # Cheaper but less effective: the ratio is savings per QALY given up
                icer_text = f"saves ${row['icer']:,.0f} per QALY forgone"
            else:
                icer_text = f"ICER = ${row['icer']:,.0f}/QALY"
            print(f"    - {row['strategy']}: {icer_text}, NMB vs {ref_strategy} = ${row['inc_nmb']:,.0f}")

        # Find the most cost-effective option
        best_option = ce_strategies.loc[ce_strategies['inc_nmb'].idxmax()]
        print(f"  Recommended option: {best_option['strategy']} (NMB = ${best_option['inc_nmb']:,.0f})")
    else:
        print(f"  No strategies are cost-effective compared to {ref_strategy} at the given WTP threshold")
        # No comparator has a positive incremental NMB, so the reference
        # (incremental NMB of 0 against itself) is at least as good as any of them
        if (sub_data['strategy'] == ref_strategy).any():
            print(f"  {ref_strategy} remains the best option (NMB = ${0:,.0f})")

## Next Steps

1. Integrate with real patient-level data
2. Apply appropriate statistical methods for subgroup identification
3. Consider multiple comparison corrections
4. Evaluate interaction effects between subgroups