# Comparing State Trajectories to USA Average - Death Rate

In [15]:
import pandas as pd
import altair as alt
from IPython.display import HTML

In [20]:
#hide
# Build the state population table and download the death-count time series.
# 'Estimated_Pop' values are listed in the same order as the names in 'State'.
raw_data = {'State': ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'], 'Estimated_Pop': [4903185, 731545, 7278717, 3017804, 39512223, 5758736, 3565287, 973764, 705749, 21477737, 10617423, 1415872, 1787065, 12671821, 6732219, 3155070, 2913314, 4467673, 4648794, 1344212, 6045680, 6892503, 9986857, 5639632, 2976149, 6137428, 1068778, 1934408, 3080156, 1359711, 8882190, 2096829, 19453561, 10488084, 762062, 11689100, 3956971, 4217737, 12801989, 1059361, 5148714, 884659, 6829174, 28995881, 3205958, 623989, 8535519, 7614893, 1792147, 5822434, 578759, 3193694]}
pops = pd.DataFrame(raw_data, columns = ['State', 'Estimated_Pop'])
# Population expressed in millions of residents.
pops["Population_millions"] = pops["Estimated_Pop"]/1000000

# NOTE(review): this points at the repository's master branch, so the file
# layout can change underneath the notebook -- confirm the URL still resolves.
url = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/'
       'csse_covid_19_time_series/time_series_19-covid-Deaths.csv')
df0 = pd.read_csv(url)

# Keep only the rows whose country is the US ...
is_us = df0['Country/Region'] == "US"
df1 = df0[is_us]
# ... and drop the row(s) labelled 'Cruise Ship'.
is_ship = df1['Province/State'].isin(['Cruise Ship'])
df2 = df1[~is_ship]

In [21]:
#hide
# Join the death counts to the state population table. The inner join keeps
# only rows whose 'Province/State' matches a name in pops['State'].
df3 = df2.merge(pops, how='inner', right_on="State", left_on="Province/State")

# Every column that is not metadata (from either table) is a daily date column.
# Selecting by name avoids depending on the exact column positions after the merge.
dt_cols = df3.columns[~df3.columns.isin(['Province/State', 'Country/Region', 'Lat', 'Long', "State", "Population", "Estimated_Pop","Population_millions"])]

# Deaths per 100,000 residents = deaths / population * 100,000.
# The previous expression (deaths / Population_millions * 10) produced deaths
# per 10 million, which is 100x larger than the "per 100,000" unit used by all
# downstream column names, thresholds and chart labels.
PER_CAPITA_BASE = 100000
df = df3[dt_cols].div(df3['Estimated_Pop'], axis=0) * PER_CAPITA_BASE
df['Province/State'] = df3['Province/State']

In [22]:
#hide
# Reshape from wide (one column per date) to long (one row per state and date).
state_totals = df.groupby('Province/State')[dt_cols].sum()
long_rates = state_totals.stack().reset_index(name='Confirmed Cases per 100,000')
dff = long_rates.rename(columns={'level_1': 'Date', 'Province/State': 'State'})
# The date column headers are strings such as month/day/two-digit-year.
dff['Date'] = pd.to_datetime(dff['Date'], format='%m/%d/%y')

In [23]:
#hide
MIN_CASES = 1
# Scan the date columns from newest to oldest and take the first one holding
# any non-zero value; if none does, fall back to the final column.
LAST_DATE = next(
    (col for col in dt_cols[::-1] if df[col].fillna(0).ne(0).any()),
    dt_cols[-1],
)
# States at or above the threshold on that date, highest rate first.
on_last_date = dff['Date'].eq(LAST_DATE)
meets_minimum = dff['Confirmed Cases per 100,000'].ge(MIN_CASES)
ranked = dff[on_last_date & meets_minimum].sort_values(
    by='Confirmed Cases per 100,000', ascending=False)
states = ranked['State'].values

In [24]:
#hide
SINCE_CASES_NUM = 1
COL_X = f'Days since {SINCE_CASES_NUM} death per 100,000 residents'
# Work on a copy restricted to the selected states.
dff2 = dff[dff['State'].isin(states)].copy()

# For every row, look up the first date on which its state reached the
# threshold: flag the rows at/above the threshold, index by date, and take the
# index label of the first True flag within each state.
reached_threshold = dff2['Confirmed Cases per 100,000'].ge(SINCE_CASES_NUM)
flags_by_date = dff2.assign(F=reached_threshold).set_index('Date')
days_since = flags_by_date.groupby('State')['F'].transform('idxmax')

# Whole days between each row's date and that first date; .values is used so
# the subtraction is positional rather than aligned on the differing indexes.
elapsed = dff2['Date'] - days_since.values
dff2[COL_X] = elapsed.dt.days.values
# Discard rows that precede the state's threshold date.
dff2 = dff2[dff2[COL_X].ge(0)]

In [25]:
#hide
#Creating USA average
# The average is the unweighted mean of the per-state rates at each day offset
# (it is not weighted by state population).
USA_LABEL = "USA avg."
RATE_COL = 'Confirmed Cases per 100,000'

# Leave out any average rows added by an earlier run of this cell so that
# re-running it neither averages the average nor duplicates its rows.
state_rows = dff2[dff2['State'] != USA_LABEL]

# Aggregate only the columns that are needed: averaging the whole frame fails
# on the string 'State' column in pandas >= 2.0. For the date, use the earliest
# date observed at each offset, i.e. the timeline of whichever state crossed
# the threshold first, instead of assuming one named state has every offset.
USA_avg = (state_rows
           .groupby(COL_X, as_index=False)
           .agg({RATE_COL: 'mean', 'Date': 'min'}))
USA_avg["State"] = USA_LABEL

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
dff2 = pd.concat([state_rows, USA_avg], sort=False)

In [26]:
#hide_input
baseline_states = ['California', 'Washington', 'New York', 'USA avg.']
max_date = dff2['Date'].max()

def make_since_chart(highlight_states=None, baseline_states=baseline_states):
    """Build the layered chart of death trajectories since the threshold day.

    Layers, in order: a dashed reference curve growing 33% per day, a text
    label for that curve, one line per state (faded unless selected through
    the legend), and a state-name label on each line's latest point.

    Parameters
    ----------
    highlight_states : sequence of str, optional
        States pre-selected in the legend in addition to the baselines.
        Defaults to none.
    baseline_states : sequence of str
        States that are always pre-selected. Defaults to the module-level
        ``baseline_states`` list.

    Returns
    -------
    The combined Altair chart. Reads the module-level ``dff2``, ``COL_X``,
    ``SINCE_CASES_NUM`` and ``max_date``.
    """
    # Avoid a shared mutable default and accept any sequence type.
    highlight_states = list(highlight_states) if highlight_states is not None else []
    baseline_states = list(baseline_states)

    selection = alt.selection_multi(fields=['State'], bind='legend', 
                                    init=[{'State': x} for x in highlight_states + baseline_states])

    base = alt.Chart(dff2, width=550).encode(
        x=f'{COL_X}:Q',
        y=alt.Y('Confirmed Cases per 100,000:Q', scale=alt.Scale(type='log'), axis=alt.Axis(title='Cumulative Deaths per 100,000 residents')),
        color=alt.Color(
            'State:N',
            sort=["USA avg.", 'Washington'],
            legend=alt.Legend(columns=2)),
        tooltip=list(dff2),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.05))
    )
    # Plain int so both range() and the filter expression get a native value
    # rather than a numpy scalar.
    max_day = int(dff2[COL_X].max())
    # Reference curve: starts at the threshold and grows 33% per day.
    ref = pd.DataFrame([[x, SINCE_CASES_NUM*1.33**x] for x in range(max_day+1)], columns=[COL_X, 'Confirmed Cases'])
    base_ref = alt.Chart(ref).encode(x=f'{COL_X}:Q', y='Confirmed Cases:Q')

    # With no highlighted states the original template left a gap
    # ("Compare  death trajectory with ..."), so use a generic subject instead.
    baseline_label = ', '.join(baseline_states)
    if highlight_states:
        title = f"Compare {', '.join(highlight_states)} death trajectory with {baseline_label}"
    else:
        title = f"Compare state death trajectories with {baseline_label}"

    return (
        base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]) +
        # Label the reference curve at its final point only.
        base_ref.transform_filter(
            alt.datum[COL_X] >= max_day
        ).mark_text(dy=-6, align='right', fontSize=10, text='33% Daily Growth') +
        base.mark_line(point=True).add_selection(selection) + 
        # Label each state at its latest date (compared as epoch milliseconds).
        base.transform_filter(
            alt.datum['Date'] >= int(max_date.timestamp() * 1000)
        ).mark_text(dy=-8, align='right', fontWeight='bold').encode(text='State:N')
    ).properties(
        title=title
    )

In [27]:
# Render the chart using the function's default arguments.
make_since_chart()

Click (Shift+Click for multiple) on the State legend to filter the visualization.

95% of this work is by [Pratap Vardhan](https://twitter.com/PratapVardhan)[^1]
State-level adaptation and calculation of USA average by [Jared Valdron](https://www.linkedin.com/in/jaredvaldron/)

["2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE"](https://systems.jhu.edu/research/public-health/ncov/) [GitHub repository](https://github.com/CSSEGISandData/COVID-19).

["Estimated State Populations by US Census"](https://www.census.gov/data/tables/time-series/demo/popest/2010s-state-total.html#par_textimage_1574439295) 