# Data Visualisation

The goal of this notebook is to visualize the target variable as a function of an observation variable, both on a world map and as a time series. It uses the data in the `Data/1021/` folder, with `incidence` and `prevalence` as target variables. The visualization should be dynamic. 

In [8]:
import pandas as pd
import numpy as np
import plotly.express as px
import os
import glob
from ipywidgets import Dropdown, HBox, VBox, SelectMultiple, Button, Accordion, Output
import IPython.display as display

In [9]:
# Load every CSV in the data folder into a dict keyed by file name without its extension
data_folder = 'Data/1021/'
datasets = {
    # splitext removes only the trailing extension; str.replace would also strip
    # '.csv' occurring in the middle of a file name
    os.path.splitext(os.path.basename(path))[0]: pd.read_csv(path)
    # glob gives no ordering guarantee; sorting keeps the dict order (and any later
    # "first matching dataset" lookup) reproducible across machines and runs
    for path in sorted(glob.glob(os.path.join(data_folder, '*.csv')))
}

# Unique years and countries across all datasets that have the column.
# Missing values are dropped first so that sorting cannot fail or misorder on NaN.
all_years = sorted({year for df in datasets.values() if 'Year' in df.columns for year in df['Year'].dropna().unique()})
all_countries = sorted({country for df in datasets.values() if 'Country Name' in df.columns for country in df['Country Name'].dropna().unique()})

In [10]:
# Columns that are never offered as observation variables
standard_cols = {'Year', 'Value', 'Measure'}

# Union of the numeric column names of every dataset, minus the standard columns
numeric_columns = set().union(
    *(frame.select_dtypes(include=[np.number]).columns for frame in datasets.values())
)
observation_variables = sorted(numeric_columns - standard_cols)

# Dataset names are split at the first underscore into a disease part and a metric part;
# names without an underscore are skipped
name_parts = [name.split('_', 1) for name in datasets if '_' in name]
disease_types = sorted({disease for disease, _ in name_parts})
metric_types = sorted({metric for _, metric in name_parts})

# Axis choices: 'Year' first, then metrics and observation variables merged and sorted
all_variables = ['Year', *sorted({*metric_types, *observation_variables})]

In [11]:
# Factory for the labelled dropdown widgets used in the control panel
def create_dropdown(description, options, width='100px'):
    """Return a Dropdown showing `options`, labelled `description`, with label width `width`."""
    label_style = {'description_width': width}
    dropdown = Dropdown(description=description, options=options, style=label_style)
    return dropdown

def create_button_with_action(description, action):
    """Return a Button labelled `description` with `action` registered as its click handler."""
    btn = Button(description=description)
    btn.on_click(action)
    return btn

# Selection controls; the X and Y axis dropdowns offer the same list of variables
disease_dropdown = create_dropdown('Disease:', disease_types)
viz_type_dropdown = create_dropdown('Viz Type:', ['World Map', 'Comparison'])
y_var_dropdown = create_dropdown('Y-Axis Var:', all_variables)
x_var_dropdown = create_dropdown('X-Axis Var:', all_variables)
color_by_dropdown = create_dropdown('Color by:', ['Country Name', 'Year'])

# Both country dropdowns start with a 'None' entry meaning "no country chosen"
country_options = ['None', *all_countries]
country1_dropdown = create_dropdown('Country 1:', country_options)
country2_dropdown = create_dropdown('Country 2:', country_options)

# Multi-select list of years
years_selector = SelectMultiple(
    description='Years:',
    options=all_years,
    style={'description_width': '100px'},
)


def _select_all_years(_button):
    """Click handler: select every available year."""
    years_selector.value = tuple(all_years)


def _clear_years(_button):
    """Click handler: empty the year selection."""
    years_selector.value = ()


all_years_button = create_button_with_action('Select All Years', _select_all_years)
clear_years_button = create_button_with_action('Clear Years', _clear_years)

# Collapsible panel (initially closed) holding the year list and its two buttons
years_panel = VBox([years_selector, HBox([all_years_button, clear_years_button])])
accordion = Accordion(
    children=[years_panel],
    titles=('Filter by Years',),
    selected_index=None,
)

In [12]:
# Define the visualization function
def create_visualization(disease, y_var, x_var, viz_type, country1, country2, filter_years, color_by,
                         map_year_range=(2010, 2021)):
    """Build the Plotly figure for the current selection.

    Args:
        disease: Disease prefix used to look up datasets keyed "<disease>_<metric>".
        y_var: Y-axis variable. A name listed in ``metric_types`` is read from the
            ``Value`` column of the matching dataset; any other name is used as a
            column name as-is.
        x_var: X-axis variable, resolved the same way as ``y_var``.
        viz_type: 'World Map' draws an animated choropleth of ``y_var``; any other
            value draws the comparison chart (line chart when ``x_var`` is 'Year',
            scatter plot otherwise).
        country1: Country name, or the string "None" for no selection.
        country2: Country name, or the string "None" for no selection. When neither
            country is selected, every country in the data is used.
        filter_years: Years to keep in the comparison chart; an empty value keeps
            all years. It is not applied to the world map.
        color_by: Column used to colour the comparison chart.
        map_year_range: Inclusive ``(first_year, last_year)`` window shown on the
            world map. Defaults to ``(2010, 2021)``.

    Returns:
        The Plotly figure, or ``None`` when the required dataset or one of the
        required columns is not available for this selection.
    """
    y_is_metric = y_var in metric_types
    x_is_metric = x_var in metric_types

    # Load dataset(s) and determine column names
    if x_is_metric and y_is_metric and x_var != y_var:
        # Two different metrics: join their datasets on country and year
        y_dataset_key = f"{disease}_{y_var}"
        x_dataset_key = f"{disease}_{x_var}"

        if x_dataset_key not in datasets or y_dataset_key not in datasets:
            return None

        df = datasets[y_dataset_key].merge(
            datasets[x_dataset_key],
            on=['Country Name', 'Country Code', 'Year'],
            suffixes=('_y', '_x')
        )
        y_col, x_col = 'Value_y', 'Value_x'
    else:
        # Single dataset: the one for whichever variable is a metric, otherwise the
        # first dataset whose key starts with this disease
        dataset_key = (f"{disease}_{y_var}" if y_is_metric
                      else f"{disease}_{x_var}" if x_is_metric
                      else next((k for k in datasets.keys() if k.startswith(f"{disease}_")), None))

        if not dataset_key or dataset_key not in datasets:
            return None

        df = datasets[dataset_key].copy()
        y_col = 'Value' if y_is_metric else y_var
        x_col = 'Value' if x_is_metric else x_var

    # The offered variables are collected across all datasets, so the chosen column
    # may be absent from this particular one. Return None (like the dataset guards
    # above) instead of failing with a KeyError further down. The world map only
    # reads the y column.
    is_world_map = viz_type == 'World Map'
    required_cols = [y_col] if is_world_map else [x_col, y_col]
    if any(col not in df.columns for col in required_cols):
        return None

    # Select countries; de-duplicate so choosing the same country twice still counts
    # as a single-country selection (title count and trendline depend on the count)
    selected_countries = list(dict.fromkeys(c for c in (country1, country2) if c != "None"))
    if not selected_countries:
        selected_countries = df['Country Name'].unique().tolist()

    # World Map visualization
    if is_world_map:
        first_year, last_year = map_year_range
        df_map = df[(df['Year'] >= first_year) & (df['Year'] <= last_year)].sort_values('Year')

        fig = px.choropleth(
            df_map, locations='Country Code', color=y_col, hover_name='Country Name',
            animation_frame='Year', color_continuous_scale='Viridis',
            title=f"{disease.capitalize()} - {y_var} by Country ({first_year}-{last_year})"
        )
        fig.update_layout(
            height=600,
            geo=dict(showland=True),
            # Fix the colour range over all frames so colours are comparable between years
            coloraxis=dict(
                cmin=df_map[y_col].min(),
                cmax=df_map[y_col].max(),
                colorbar=dict(title=y_var)
            )
        )
        return fig

    # Comparison visualization
    df_filtered = df.dropna(subset=[x_col, y_col])
    df_filtered = df_filtered[df_filtered['Country Name'].isin(selected_countries)]

    if filter_years:
        df_filtered = df_filtered[df_filtered['Year'].isin(filter_years)]

    # Create visualization based on x_var type
    if x_var == 'Year':
        df_filtered = df_filtered.sort_values('Year')
        fig = px.line(
            df_filtered, x='Year', y=y_col, color=color_by, markers=True,
            title=f"{disease.capitalize()} - {y_var} over Time ({len(selected_countries)} countries)"
        )
    else:
        # With several countries, colour by country unless 'Year' was requested;
        # the OLS trendline is only drawn for a single country
        color_col = color_by if len(selected_countries) == 1 or color_by == 'Year' else 'Country Name'
        trendline = 'ols' if len(selected_countries) == 1 else None
        fig = px.scatter(
            df_filtered, x=x_col, y=y_col, color=color_col, trendline=trendline,
            title=f"{disease.capitalize()} - {y_var} vs {x_var} ({len(selected_countries)} countries)"
        )

    fig.update_layout(height=600, hovermode='closest', xaxis_title=x_var, yaxis_title=y_var)
    return fig

In [13]:
# Output widget that update_plot() clears and renders the figure into
output_plot = Output()

def update_plot(change=None):
    """Rebuild the figure from the current widget values and show it in ``output_plot``.

    Args:
        change: Change notification passed by a widget observer; it is not read.
            Defaults to ``None`` so the function can also be called directly.
    """
    output_plot.clear_output(wait=True)
    with output_plot:
        fig = create_visualization(
            disease_dropdown.value, y_var_dropdown.value, x_var_dropdown.value,
            viz_type_dropdown.value, country1_dropdown.value, country2_dropdown.value,
            years_selector.value, color_by_dropdown.value
        )
        if fig is not None:
            fig.show()
        else:
            # clear_output(wait=True) only clears once new output arrives, so without
            # this message the previous (now stale) figure would stay on screen
            print("No data available for the current selection.")

In [14]:
# Re-render the plot whenever the value of any control changes
controls = (
    disease_dropdown, y_var_dropdown, x_var_dropdown, viz_type_dropdown,
    country1_dropdown, country2_dropdown, years_selector, color_by_dropdown,
)
for control in controls:
    control.observe(update_plot, names='value')

# Show two rows of controls, then the year filter, then the plot area
layout_rows = (
    HBox([disease_dropdown, y_var_dropdown, x_var_dropdown, viz_type_dropdown]),
    HBox([country1_dropdown, country2_dropdown, color_by_dropdown]),
    accordion,
    output_plot,
)
for row in layout_rows:
    display.display(row)

# Draw the first figure from the initial widget values
update_plot()

HBox(children=(Dropdown(description='Disease:', options=('COPD', 'asthma', 'tuberculosis'), style=DescriptionS…

HBox(children=(Dropdown(description='Country 1:', options=('None', 'Albania', 'Australia', 'Austria', 'Belarus…

Accordion(children=(VBox(children=(SelectMultiple(description='Years:', options=(np.int64(2010), np.int64(2011…

Output()