In [1]:
%pylab inline
import altair as alt
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

alt.data_transformers.disable_max_rows()

def days_between(d1, d2):
    if (isinstance(d1, str) and isinstance(d2, str)):
        d1 = datetime.strptime(d1, "%m-%d-%Y")
        d2 = datetime.strptime(d2, "%m-%d-%Y")
        return (d2 - d1).days
    return None;

Populating the interactive namespace from numpy and matplotlib


In [81]:
total_cases_df       = pd.read_csv("../91-DIVOC/pages/covid-visualization/jhu-data.csv",index_col=0)
empty_rows           = total_cases_df[(total_cases_df["Confirmed"]==0)&(total_cases_df["Recovered"]==0)&(total_cases_df["Active"]==0)&(total_cases_df["Deaths"]==0)].index
total_cases_df       = total_cases_df.drop(empty_rows, axis=0)

# Tabulate the top K countries with the highest total number of Confirmed cases
K              = 10  
topKcountries  = list(total_cases_df.groupby("Country_Region").sum()["Confirmed"].nlargest(K).index)

quarantine_df   = pd.read_csv("quarantine-activity.csv",index_col=0).set_index('Country_Region')
full_lockdown   = list(quarantine_df[quarantine_df['Lockdown Type'] == 'Full'].groupby("Country_Region")['Date Enacted'].min().index)

full_lockdown  = list (set(full_lockdown) - set(topKcountries))[4:]
topKcountries  = topKcountries[5:]
topKcountries  = list (set(full_lockdown + topKcountries) - set(['Japan', 'United States', 'China', 'Germany']))


cases_df       = total_cases_df[total_cases_df["Country_Region"].isin(topKcountries)]


# Compute days since N = 50 confirmed cases; drop those with no N confirmed cases
N              = 30
days_since_N                   = cases_df[cases_df["Confirmed"]>N].groupby("Country_Region")["Date"].min().to_dict()
cases_df['Date N Confirmed']   = cases_df.apply  (lambda x: days_since_N.get(x['Country_Region']),axis=1)
cases_df                       = cases_df.dropna (subset=['Date N Confirmed'])
cases_df['Days from N']        = cases_df.apply  (lambda x: days_between(x["Date N Confirmed"], x["Date"]), axis = 1)
cases_df

Unnamed: 0,Country_Region,Province_State,Confirmed,Recovered,Active,Deaths,Date,Date N Confirmed,Days from N
6,Malaysia,,3.0,0.0,3.0,0.0,01-25-2020,03-04-2020,-39
3,France,,3.0,0.0,3.0,0.0,01-27-2020,02-27-2020,-31
3,France,,4.0,0.0,4.0,0.0,01-28-2020,02-27-2020,-30
4,France,,5.0,0.0,5.0,0.0,01-29-2020,02-27-2020,-29
10,Malaysia,,7.0,0.0,7.0,0.0,01-29-2020,03-04-2020,-35
...,...,...,...,...,...,...,...,...,...
73,India,,499.0,34.0,455.0,10.0,03-23-2020,03-06-2020,17
78,Israel,,1442.0,41.0,1400.0,1.0,03-23-2020,03-09-2020,14
123,Poland,,749.0,13.0,728.0,8.0,03-23-2020,03-16-2020,7
140,South Africa,,402.0,4.0,398.0,0.0,03-23-2020,03-17-2020,6


In [82]:
# Adding in information about lockdowns & possibly drop rows without lockdowns

quarantine_df   = pd.read_csv("quarantine-activity.csv",index_col=0).set_index('Country_Region')
full_lockdown   = quarantine_df[quarantine_df['Lockdown Type'] == 'Full'].groupby("Country_Region")['Date Enacted'].min().to_dict()

# Add date of full lockdown
cases_df['Date of Full Lockdown']     = cases_df.apply  (lambda x: full_lockdown.get(x['Country_Region']),axis=1)
cases_df['Day of Lockdown from N']    = cases_df.apply  (lambda x: days_between(x["Date N Confirmed"], x["Date of Full Lockdown"]), axis = 1)

# Drop rows for those without a full lockdown 
# cases_df                              = cases_df.dropna (subset=['Date of Full Lockdown'])

cases_df

Unnamed: 0,Country_Region,Province_State,Confirmed,Recovered,Active,Deaths,Date,Date N Confirmed,Days from N,Date of Full Lockdown,Day of Lockdown from N
6,Malaysia,,3.0,0.0,3.0,0.0,01-25-2020,03-04-2020,-39,3-16-2020,12.0
3,France,,3.0,0.0,3.0,0.0,01-27-2020,02-27-2020,-31,3-16-2020,18.0
3,France,,4.0,0.0,4.0,0.0,01-28-2020,02-27-2020,-30,3-16-2020,18.0
4,France,,5.0,0.0,5.0,0.0,01-29-2020,02-27-2020,-29,3-16-2020,18.0
10,Malaysia,,7.0,0.0,7.0,0.0,01-29-2020,03-04-2020,-35,3-16-2020,12.0
...,...,...,...,...,...,...,...,...,...,...,...
73,India,,499.0,34.0,455.0,10.0,03-23-2020,03-06-2020,17,3-24-2020,18.0
78,Israel,,1442.0,41.0,1400.0,1.0,03-23-2020,03-09-2020,14,3-19-2020,10.0
123,Poland,,749.0,13.0,728.0,8.0,03-23-2020,03-16-2020,7,3-13-2020,-3.0
140,South Africa,,402.0,4.0,398.0,0.0,03-23-2020,03-17-2020,6,3-26-2020,9.0


In [4]:
# Basic chart of top 10 countries 
top_10_df      = cases_df[cases_df["Country_Region"].isin(topKcountries[:10])]
chart          = alt.Chart(top_10_df).mark_line().encode(
                         x = alt.X("Date"),
                         y = alt.Y("Confirmed",aggregate="sum"),
                     color = alt.Color("Country_Region")
)
# chart.interactive()
chart

In [5]:
# Testing
x = cases_df.iloc[100,:]
print (x)
days_between(days_since_N[x["Country_Region"]],x["Date"])
# df = df[df["Days Since First 100 Confirmed"]>0]

Country_Region            United Kingdom
Province_State                       NaN
Confirmed                              9
Recovered                              8
Active                                 1
Deaths                                 0
Date                          02-17-2020
Date N Confirmed              03-03-2020
Days from N                          -15
Date of Full Lockdown          3-23-2020
Day of Lockdown from N                20
Name: 25, dtype: object


-15

In [6]:
chart = alt.Chart(cases_df).mark_line().encode(
            x = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
            y = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases"),
        color = alt.Color("Country_Region")
)
chart.interactive()

In [12]:
# Logarithmic Axis (Old Version)
chart = alt.Chart(cases_df).mark_line().encode(
        x = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(100,100000))),
    color = alt.Color("Country_Region")
)
chart.interactive()

In [107]:
# Logarithmic Axis (Stephen's Prettier Version)

base = alt.Chart(cases_df, width=800, height=500)
line = base.mark_line(point = True).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,20)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(50,100000))),
    color   = "Country_Region",
    shape   = "Country_Region",
#    tooltip =["Country_Region"]
)
rule = base.mark_rule().encode(
        x   = alt.X("Day of Lockdown from N",scale=alt.Scale(domain=(0,20)), title = "Days Since First 50 Confirmed"),
        color   = "Country_Region",
)
line + rule.interactive()

In [97]:
# Logarithmic Axis (Stephen's Prettier Version)

base = alt.Chart(cases_df, width=500, height=50)
line = base.mark_line(point = True).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,20)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Cases(log)", scale=alt.Scale(type='log',domain=(50,100000))),
        color   = "Country_Region",
        row = alt.Row("Country_Region")
)
# rule = base.mark_rule().encode(
#         x   = alt.X("Day of Lockdown from N",scale=alt.Scale(domain=(0,20)), title = "Days Since First 50 Confirmed"),
#         color   = "Country_Region",
#         row = alt.Row("Country_Region")
# )
line.interactive()

In [106]:
base = alt.Chart(width=500, height=50)
line = base.mark_line(point = True).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,20)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Cases(log)", scale=alt.Scale(type='log',domain=(50,100000))),
)
rule = base.mark_line().encode(
         alt.X("Day of Lockdown from N",scale=alt.Scale(domain=(0,20)), title = "Days Since First 50 Confirmed")
         
)

alt.layer(line, rule, data = cases_df).facet(column="Country_Region")

AttributeError: 'Chart' object has no attribute 'layer'

In [8]:
cases_usa      = total_cases_df[total_cases_df["Country_Region"]=="United States"]
# cases_usa = cases_usa.dropna()

# Tabulate the top L states with the highest total number of Confirmed cases
L              = 50  
topLstates     = list(cases_usa.groupby("Province_State").sum()["Confirmed"].nlargest(L).index)
cases_usa      = cases_usa[cases_usa["Province_State"].isin(topLstates)]

# Compute days since M = 20 confirmed cases; drop those with no M confirmed cases
M              = 20
days_since_M                   = cases_usa[cases_usa["Confirmed"]>M].groupby("Province_State")["Date"].min().to_dict()
cases_usa['Date M Confirmed']  = cases_usa.apply  (lambda x: days_since_M.get(x['Province_State']),axis=1)
cases_usa                      = cases_usa.dropna (subset=['Date M Confirmed'])
cases_usa['Days from M']       = cases_usa.apply  (lambda x: days_between(x["Date M Confirmed"], x["Date"]), axis = 1)
cases_usa

Unnamed: 0,Country_Region,Province_State,Confirmed,Recovered,Active,Deaths,Date,Date M Confirmed,Days from M
34,United States,Washington,1.0,0.0,1.0,0.0,01-22-2020,03-10-2020,-48
34,United States,Washington,1.0,0.0,1.0,0.0,01-23-2020,03-10-2020,-47
34,United States,Washington,1.0,0.0,1.0,0.0,01-24-2020,03-10-2020,-46
33,United States,Illinois,1.0,0.0,1.0,0.0,01-25-2020,03-15-2020,-50
34,United States,Washington,1.0,0.0,1.0,0.0,01-25-2020,03-10-2020,-45
...,...,...,...,...,...,...,...,...,...
111,United States,South Carolina,298.0,0.0,0.0,5.0,03-23-2020,03-15-2020,8
112,United States,South Dakota,28.0,0.0,0.0,1.0,03-23-2020,03-22-2020,1
113,United States,Tennessee,614.0,0.0,0.0,2.0,03-23-2020,03-13-2020,10
118,United States,Virginia,254.0,0.0,0.0,6.0,03-23-2020,03-14-2020,9


In [9]:
chart = alt.Chart(cases_usa).mark_line().encode(
        x = alt.X("Days from M",scale=alt.Scale(domain=(0,12)), title = "Days Since First 20 Confirmed"),
        y = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(50,20000))),
    color = alt.Color("Province_State")
)
chart.interactive()