#   Chronic Disease in America
####  Milestone Submission (Nov 18, 2024)

Applied Data Visualization (COMP5960 - Fall 2024)

Team Members: Chetan Elenki, Kalpana Simhadri, Nathaniel Masson

### 1. America's Health Landscape - National Overview

- Opening with a national choropleth map showing overall chronic disease burden
- Key narrative points:
     * Distribution of major chronic conditions across the country
     * Identification of "hot spots" and "cold spots"
     * Initial patterns that raise questions for more exploration
- Visualization strategy:
     * Interactive choropleth - lets you explore national level patterns for different health outcomes
     * An accompanying box plot to show prevalence for each state, sorted from highest to lowest
     * An accompanying line chart to show the trend line of the given health outcome over the past 5 years.

### 2. Regional Stories: The Geography of Health Disparities
- Diving into distinct regional patterns
- Key narrative elements:
    * The "Stroke Belt" in the Southeast
    * Diabetes patterns in the Southwest
    * Heart disease clusters in the Rust Belt
    * Respiratory health issues in urban corridors
- Visualization strategy:
    * Interactive scatterplot - lets you explore comparative prevalence among different groups of states

### 3. The Urban-Rural Divide: Two Americas?
- Exploring how health patterns shift across the urban-rural continuum
- Key narrative points:
    * Access to healthcare differences
    * Lifestyle-related health outcomes
    * Economic factors and health correlations
- Visualization strategy:
    * Interactive parallel coordinates plot - surfaces trends and disparities between urban and rural areas

In [4]:
import sys
from pathlib import Path

# Setup paths
# Path() is the current working directory; parents[0] of its resolved form is
# the directory one level above it.
PROJECT_ROOT = Path().resolve().parents[0]
# Put that directory on the import search path — presumably so the later
# `from config import ...` cells can find config.py there (confirm its location).
sys.path.append(str(PROJECT_ROOT))

In [7]:
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from config import *

# Measures plotted when the caller does not supply any.
_DEFAULT_CHOROPLETH_MEASURES = (
    'DIABETES_CrudePrev',
    'OBESITY_CrudePrev',
    'BPHIGH_CrudePrev',
    'STROKE_CrudePrev',
)

# Human-readable labels keyed by measure code (column name minus '_CrudePrev').
_MEASURE_DISPLAY_NAMES = {
    'DIABETES': 'Diabetes',
    'OBESITY': 'Obesity',
    'BPHIGH': 'High Blood Pressure',
    'STROKE': 'Stroke',
}


def _measure_display_name(measure: str) -> str:
    """Return the readable label for a measure column, or its bare code if unknown."""
    code = measure.replace('_CrudePrev', '')
    return _MEASURE_DISPLAY_NAMES.get(code, code)


def create_health_choropleth(
    df: pd.DataFrame,
    measures: Optional[List[str]] = None
) -> go.Figure:
    """
    Creates an interactive choropleth map of health outcomes using Plotly

    One choropleth trace is added per measure, coloured by the per-state mean
    of that measure's column. Only the first trace is visible initially; a
    dropdown toggles which trace is shown and updates the figure title.

    Parameters:
    -----------
    df : pd.DataFrame
        PLACES dataset. Must contain 'StateDesc', 'StateAbbr' and every column
        named in ``measures``.
    measures : List[str], optional
        List of health measures (column names) to visualize. Defaults to the
        diabetes, obesity, high blood pressure and stroke '_CrudePrev' columns.
        Duplicate entries are ignored.

    Returns:
    --------
    fig : go.Figure
        Interactive Plotly figure

    Raises:
    -------
    ValueError
        If ``measures`` is empty.
    KeyError
        If a required column is missing from ``df``.
    """
    # None sentinel instead of a mutable list default; drop duplicates while
    # keeping order so each measure maps to exactly one trace and one button.
    if measures is None:
        measures = _DEFAULT_CHOROPLETH_MEASURES
    measures = list(dict.fromkeys(measures))
    if not measures:
        raise ValueError("measures must contain at least one column name")

    missing = [
        col for col in ['StateDesc', 'StateAbbr', *measures]
        if col not in df.columns
    ]
    if missing:
        raise KeyError(f"Columns missing from df: {missing}")

    # Calculate state-level means (the only statistic the map uses)
    state_means = (
        df.groupby(['StateDesc', 'StateAbbr'])[measures]
        .mean()
        .reset_index()
    )

    labels = [_measure_display_name(measure) for measure in measures]

    # Create figure
    fig = go.Figure()

    # Add choropleth traces for each measure
    for index, (measure, label) in enumerate(zip(measures, labels)):
        prevalence = state_means[measure]
        fig.add_trace(
            go.Choropleth(
                locations=state_means['StateAbbr'],
                z=prevalence,
                locationmode='USA-states',
                colorscale='Reds',
                name=label,
                # Each trace gets its own colour range, so colours are not
                # comparable across measures.
                zmin=prevalence.min(),
                zmax=prevalence.max(),
                visible=(index == 0),
                colorbar_title="Prevalence (%)",
                hovertemplate=(
                    "<b>%{location}</b><br>" +
                    "Prevalence: %{z:.1f}%<br>" +
                    "<extra></extra>"
                )
            )
        )

    # One dropdown entry per trace. Only 'title.text' is updated so the
    # centring and font size configured below survive a selection.
    buttons = [
        {
            'method': 'update',
            'label': label,
            'args': [
                {'visible': [j == index for j in range(len(labels))]},
                {'title.text': f'{label} Prevalence by State'}
            ]
        }
        for index, label in enumerate(labels)
    ]

    # Update layout
    fig.update_layout(
        title={
            'text': 'Chronic Disease Prevalence Across the United States',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 24}
        },
        geo=dict(
            scope='usa',
            projection_type='albers usa',
            showlakes=True,
            lakecolor='rgb(255, 255, 255)'
        ),
        width=1000,
        height=600,
        updatemenus=[{
            'buttons': buttons,
            'direction': 'down',
            'showactive': True,
            'x': 0.1,
            'y': 1.1
        }]
    )

    return fig

def main():
    """Read the CSV at get_file_path(2024) and return its state-level choropleth figure."""
    places_df = pd.read_csv(get_file_path(2024))
    return create_health_choropleth(places_df)

# Create and display visualizations
# main() builds the figure; it stays bound to the module-level name
# `choropleth`, and show() renders it.
choropleth = main()
choropleth.show()

## County Level Choropleth Maps

![County Map](county_map.png "County Choropleth")

## PCP Plot (Urban/Rural analysis)
#### (Work In Progress)

![PCP Plot](pcp_plot.png "Urban-Rural patterns")

In [6]:
import altair as alt
import pandas as pd
import numpy as np
from vega_datasets import data
from typing import Dict, List, Tuple

from config import HEALTH_MEASURES_CONFIG, get_file_path

def _summarize_measures(
    df: pd.DataFrame,
    group_cols: List[str],
    measures: List[str]
) -> pd.DataFrame:
    """
    Aggregates each '<measure>_CrudePrev' column over ``group_cols`` and returns
    a flat frame with the group columns plus '<measure>_mean', '<measure>_std',
    '<measure>_min' and '<measure>_max' for every measure.
    """
    source_cols = {f'{measure}_CrudePrev': measure for measure in measures}
    summary = df.groupby(group_cols)[list(source_cols)].agg(['mean', 'std', 'min', 'max'])
    # Flatten the (column, stat) MultiIndex into '<measure>_<stat>' names
    summary.columns = [f'{source_cols[col]}_{stat}' for col, stat in summary.columns]
    return summary.reset_index()


def create_regional_health_analysis(
    df: pd.DataFrame,
    health_measures_config: Dict[str, Dict[str, str]] = HEALTH_MEASURES_CONFIG
) -> alt.Chart:
    """
    Creates an interactive visualization of regional health disparities focusing on
    specific patterns and regional clusters.

    The result is a vertical concatenation of a regional bar chart and a
    state-level bubble chart, both driven by one dropdown that selects the
    health measure. Bubble size and the "Std Dev" tooltip show the standard
    deviation of the selected measure.

    Parameters:
    -----------
    df : pd.DataFrame
        Must contain 'StateAbbr', 'StateDesc' and the 'STROKE_CrudePrev',
        'DIABETES_CrudePrev', 'CHD_CrudePrev' and 'COPD_CrudePrev' columns.
        The frame is not modified.
    health_measures_config : Dict[str, Dict[str, str]]
        Accepted for interface compatibility; not currently used.

    Returns:
    --------
    alt.Chart
        The combined Altair chart with the measure dropdown attached.
    """
    # Measure codes and the dropdown label for each
    measure_labels = {
        'STROKE': 'Stroke Prevalence',
        'DIABETES': 'Diabetes Prevalence',
        'CHD': 'Heart Disease Prevalence',
        'COPD': 'COPD Prevalence'
    }
    measures = list(measure_labels)

    # Define regions
    regions = {
        'Southeast': ['AL', 'GA', 'NC', 'SC', 'TN', 'MS', 'AR', 'LA'],  # Stroke Belt
        'Southwest': ['AZ', 'NM', 'TX', 'OK'],
        'Rust_Belt': ['PA', 'OH', 'MI', 'IN', 'IL', 'WI'],
        'Urban_Corridor': ['NY', 'NJ', 'MA', 'CT', 'RI', 'MD', 'DC']
    }
    state_to_region = {
        state: region
        for region, states in regions.items()
        for state in states
    }

    # Work on a narrow copy so the caller's DataFrame is left untouched.
    # States outside the four regions get a missing Region and are dropped
    # by the group-bys below.
    measure_cols = [f'{measure}_CrudePrev' for measure in measures]
    tagged = df[['StateAbbr', 'StateDesc', *measure_cols]].assign(
        Region=df['StateAbbr'].map(state_to_region)
    )

    # Calculate regional and state-level statistics
    regional_data = _summarize_measures(tagged, ['Region'], measures)
    state_data = _summarize_measures(
        tagged, ['StateAbbr', 'StateDesc', 'Region'], measures
    )

    # Create measure selection
    measure_select = alt.binding_select(
        options=[f'{measure}_mean' for measure in measures],
        labels=list(measure_labels.values()),
        name="Health Measure: "
    )

    selection = alt.param(
        name='health_measure',
        value='STROKE_mean',
        bind=measure_select
    )

    # The parameter holds a column name such as 'STROKE_mean'. The matching
    # std column has to be derived inside the Vega expression, at render time,
    # from the parameter's current value; rewriting the parameter's *name* in
    # Python leaves the expression pointing at the mean column.
    value_expr = f'datum[{selection.name}]'
    std_expr = f"datum[replace({selection.name}, 'mean', 'std')]"

    # Create regional comparison chart
    regional_chart = alt.Chart(regional_data).mark_bar().encode(
        x=alt.X('value:Q',
                title='Prevalence (%)',
                scale=alt.Scale(zero=False)),
        y=alt.Y('Region:N',
                title='Region',
                sort='-x'),
        color=alt.Color('Region:N',
                        legend=None,
                        scale=alt.Scale(scheme='category10')),
        tooltip=[
            alt.Tooltip('Region:N', title='Region'),
            alt.Tooltip('value:Q', title='Prevalence (%)', format='.1f'),
            alt.Tooltip('std:Q', title='Std Dev', format='.2f')
        ]
    ).transform_calculate(
        value=value_expr,
        std=std_expr
    ).properties(
        width=600,
        height=200,
        title='Regional Comparison'
    )

    # Create state-level detail chart
    state_chart = alt.Chart(state_data).mark_circle(size=100).encode(
        x=alt.X('value:Q',
                title='Prevalence (%)',
                scale=alt.Scale(zero=False)),
        y=alt.Y('StateDesc:N',
                title='State',
                sort=alt.EncodingSortField(field='value', op='mean', order='descending')),
        color=alt.Color('Region:N',
                        scale=alt.Scale(scheme='category10')),
        size=alt.Size('std:Q',
                      title='Variation',
                      scale=alt.Scale(range=[50, 300])),
        tooltip=[
            alt.Tooltip('StateDesc:N', title='State'),
            alt.Tooltip('Region:N', title='Region'),
            alt.Tooltip('value:Q', title='Prevalence (%)', format='.1f'),
            alt.Tooltip('std:Q', title='Std Dev', format='.2f')
        ]
    ).transform_calculate(
        value=value_expr,
        std=std_expr
    ).properties(
        width=600,
        height=400,
        title='State-Level Detail'
    )

    # Combine visualizations
    final_viz = alt.vconcat(
        regional_chart,
        state_chart,
        spacing=20
    ).properties(
        title={
            'text': 'Regional Health Disparities in the United States',
            'subtitle': [
                'Exploring patterns in the Stroke Belt, Southwest, Rust Belt, and Urban Corridors',
                'Use dropdown to explore different health measures'
            ],
            'fontSize': 20,
            'anchor': 'middle'
        }
    ).add_params(
        selection
    )

    return final_viz

def main():
    """
    Build and display the regional health analysis chart.

    Selects Altair's 'default' renderer, reads the CSV at get_file_path(2024),
    builds the chart, shows it with display(), and returns it.
    """
    # Renderer must be selected before the chart is displayed
    alt.renderers.enable('default')

    places_df = pd.read_csv(get_file_path(2024))
    regional_chart = create_regional_health_analysis(places_df)

    # Render inline in the notebook
    display(regional_chart)
    return regional_chart

# For Jupyter notebook
# Select Altair's 'default' renderer, then build and display the chart via
# main(); the result stays bound to the module-level name `chart`.
alt.renderers.enable('default')
chart = main()