In [1]:
import pandas as pd
import altair as alt
from altair import datum

# Enable VegaFusion so that larger DataFrames can be charted without Altair
# raising MaxRowsError.
import vegafusion as vf
vf.enable()

vegafusion.enable(mimetype='html', row_limit=10000, embed_options=None)

# Polio cases by state from 1928-1969

**Note: Data is normalized cases per 100,000 people for each state**

- [Downloaded from visdatasets](https://visdatasets.github.io/)
- Originally retrieved from [Project Tycho](https://www.tycho.pitt.edu/); aggregated into yearly values.
- [Good article on visualizations of this data](http://www.randalolson.com/2016/03/04/revisiting-the-vaccine-visualizations/)


In [2]:
# Read the polio incidence worksheet from the Excel workbook into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a saved row index; passing index_col=0 would likely drop it -- confirm first.
polio = pd.read_excel(
    'data/polio_incidence_rates_united_states.xlsx',
    sheet_name='polio_incidence_rates',
)

# Preview the first rows to check which columns were loaded.
polio.head()

Unnamed: 0,year,state,cases
0,1928,Alabama,2.39
1,1928,Alaska,0.0
2,1928,Arizona,2.61
3,1928,Arkansas,0.52
4,1928,California,5.04


## Timeline of total incidence per year (summed over all states)

In [3]:
# Line chart: one point per year, where y is the sum of `cases` over all rows
# (i.e. all states) for that year.
alt.Chart(polio, width=600).mark_line().encode(
    alt.X('year:O'),
    alt.Y('sum(cases):Q'),
)

## Timeline showing (detail) all states

In [4]:
# One semi-transparent line per state: the `detail` channel splits the data by
# state without mapping it to a visual property, and the tooltip names the
# state under the cursor.
alt.Chart(polio, width=600).mark_line(opacity=0.5).encode(
    alt.X('year:O'),
    alt.Y('cases:Q'),
    alt.Detail('state:N'),
    alt.Tooltip('state:N'),
)

## Making a simple DataFrame to hold the year of the polio vaccine introduction

We'll use this DataFrame for a rule to annotate some charts

In [5]:
# Single-row frame holding the vaccine introduction year; used as the data
# source for the vertical rule drawn on later charts.
vacc = pd.DataFrame({"Introduction": [1955]})

## Timeline of all states overlaid with mean cases across states

Also adding a rule at 1955, the year the vaccine was introduced

#### Note that we can layer visualizations from different DataFrames!

In [6]:
# Background layer: a thin light-gray line for every state.
state_lines = (
    alt.Chart(polio, width=500)
    .mark_line(strokeWidth=0.5, color='lightgray')
    .encode(
        alt.X('year:O'),
        alt.Y('cases:Q'),
        alt.Detail('state:N'),
    )
)

# Foreground layer: a thick line for the per-year mean of `cases`.
mean_line = (
    alt.Chart(polio)
    .mark_line(strokeWidth=3)
    .encode(
        alt.X('year:O'),
        alt.Y('mean(cases):Q'),
    )
)

# Vertical rule at the vaccine introduction year; it is built from the separate
# `vacc` DataFrame, so the layers draw from two different data sources.
rule = (
    alt.Chart(vacc)
    .mark_rule()
    .encode(
        alt.X('Introduction:O'),
        size=alt.value(2),
    )
)

# Layer order is draw order: state lines first, then the mean, then the rule.
state_lines + mean_line + rule

## Median line with upper and lower quartile boundaries

In [7]:
# Shared base chart so the median line and the quartile band use the same data
# and width.
base = alt.Chart(polio).properties(width=500)

# Per-year median of `cases`.
line = base.mark_line().encode(
    x='year:O',
    y='median(cases):Q'
)

# Shaded band between the lower quartile (q1) and the upper quartile (q3) of
# `cases` for each year. This is the interquartile range, not a confidence
# interval, so it is named accordingly. The type on `y` is stated explicitly
# (matching the mean/CI chart) rather than left to dtype inference.
iqr_band = base.mark_area(opacity=0.3).encode(
    x='year:O',
    y='q1(cases):Q',
    y2='q3(cases)'
)

# Vertical rule at the vaccine introduction year.
rule = alt.Chart(vacc).mark_rule().encode(
    x='Introduction:O'
)

# Band at the back, median line over it, rule on top.
iqr_band + line + rule

## Mean line with 95% confidence intervals

In [8]:
# Shared base chart for the mean line and its confidence band.
base = alt.Chart(polio, width=500)

# Per-year mean of `cases`.
line = base.mark_line().encode(
    alt.X('year:O'),
    alt.Y('mean(cases):Q'),
)

# Shaded band spanning the ci0..ci1 aggregates of `cases` for each year
# (the lower and upper bounds of the confidence interval of the mean).
confidence_interval = base.mark_area(opacity=0.3).encode(
    alt.X('year:O'),
    alt.Y('ci0(cases):Q'),
    alt.Y2('ci1(cases):Q'),
)

# Vertical rule at the vaccine introduction year.
rule = alt.Chart(vacc).mark_rule().encode(
    alt.X('Introduction:O'),
)

# Band at the back, mean line over it, rule on top.
confidence_interval + line + rule

## Heatmap of cases by state and year

[Vega-Lite color schemes](https://vega.github.io/vega/docs/schemes/)

*(Note: To see the trend more clearly, limit the color scale domain to 0–50.)*

In [12]:
# Heatmap: one rectangle per (year, state), colored by `cases`. The color
# domain is fixed to [0, 50] instead of the data extent, per the note above.
alt.Chart(polio, width=500, height=500).mark_rect().encode(
    alt.X('year:O'),
    alt.Y('state:N'),
    alt.Color('cases:Q', scale=alt.Scale(scheme='reds', domain=[0, 50])),
)

## Heatmap with states sorted by sum of cases

Remember that we can sort a Y axis by chaining `.sort()` onto `alt.Y()`:

```
alt.Y(__).sort(field="__", op="__", order="__")
```

In [14]:
# Heatmap of `cases` by year and state, with the states on the Y axis ordered
# by their summed `cases` in descending order. The `:N` type is stated
# explicitly, matching the unsorted heatmap, instead of relying on dtype
# inference from the DataFrame.
heatmap = alt.Chart(polio).mark_rect().encode(
    x = 'year:O',
    y = alt.Y('state:N').sort(field="cases", op="sum", order="descending"),
    color = alt.Color('cases:Q').scale(scheme='reds', domain=[0,50])
).properties(
    width = 500,
    height = 500
)

# Vertical rule at the vaccine introduction year, drawn 2px wide so it stands
# out against the colored cells.
rule = alt.Chart(vacc).mark_rule().encode(
    x='Introduction:O',
    size=alt.value(2)
)

# Draw the rule on top of the heatmap.
heatmap + rule