In [1]:
import altair as alt
import pandas as pd
import numpy as np
import datetime as dt
from covid import dataimport

In [2]:
fhm_data, labels = dataimport.get_lag_data()
local_url = 'data/fhm.json'
fhm_data.to_json(local_url, orient='records')

In [3]:
url = "https://raw.githubusercontent.com/morberg/covid-notebook/master/data/fhm.json"
fhm_data[fhm_data.lag=='0']

Unnamed: 0,date,publication_date,N,days_since_publication,n_diff,n_diff_pct,delay,lag,age,prediction,publication_week
33,2020-04-02,2020-04-02,5,0.0,5.0,,Same day,0,0.0,,14
68,2020-04-03,2020-04-03,2,0.0,2.0,,Same day,0,0.0,,14
104,2020-04-04,2020-04-04,1,0.0,1.0,,Same day,0,0.0,,14
141,2020-04-05,2020-04-05,1,0.0,1.0,,Same day,0,0.0,,14
179,2020-04-06,2020-04-06,13,0.0,13.0,,Same day,0,0.0,,15
...,...,...,...,...,...,...,...,...,...,...,...
6484,2020-06-30,2020-06-30,0,0.0,0.0,0.0,Same day,0,0.0,43.400000,27
6608,2020-07-01,2020-07-01,0,0.0,0.0,0.0,Same day,0,0.0,42.300000,27
6733,2020-07-02,2020-07-02,0,0.0,0.0,0.0,Same day,0,0.0,36.300000,27
6859,2020-07-03,2020-07-03,0,0.0,0.0,0.0,Same day,0,0.0,36.000000,27


# Work in progress

Experimental stuff

In [4]:
def average_lag(df, start_date):
    df1 = pd.DataFrame(df.groupby("publication_date")["n_diff"].sum())
    df1["average_lag"] = (
        df.groupby("publication_date")["age"].sum()
        / df.groupby("publication_date")["n_diff"].sum()
    )
    df1 = df1.reset_index()
    df1 = df1[df1["publication_date"] >= start_date]

    lag_chart = (
        alt.Chart(df1, width=600, title="Average Reporting Lag")
        .mark_trail()
        .encode(
            x=alt.X("publication_date", title="Publication Date"),
            y=alt.Y("average_lag:Q", title="Daily Average Reporting Lag"),
            size=alt.Size("n_diff", title="Reported Deaths"),
        )
    )

    return lag_chart

average_lag(fhm_data, '2020-04-03')

In [5]:
df = fhm_data[fhm_data['date'] > '2020-03-10']
# Only show weekly publication dates based on latest publication date
df = df[df.publication_date.dt.dayofweek == fhm_data['publication_date'].max().weekday()]

alt.Chart(df, width=600).mark_line(interpolate='basis').encode(
    x=alt.X('date', title="Date"),
    y=alt.Y('N', title="Deceased"),
    color=alt.Color('monthdate(publication_date):N', title="Publication Date")
)

In [6]:
df = fhm_data[fhm_data['publication_date'] >= '2020-04-03']
df = df.replace(0, np.nan)

alt.Chart(df,height=200, width=600).mark_rect().encode(
    x='yearmonthdate(publication_date)',
#    y='yearmonthdate(date)',
    y=alt.Y('lag', sort=labels),
    size='n_diff',
    color=alt.Color('n_diff', title='Deaths', scale=alt.Scale(scheme='goldgreen')),
    tooltip=[alt.Tooltip('n_diff', title='Deaths')]
)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

## Daily reported deaths and lag

Number of deaths reported by day and the lag in reporting for each death. Each column is a weekday and each row a week.

In [11]:
week_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

hist = alt.Chart(url, height=100, width=100).mark_bar().encode(
    x=alt.X("lag:O", title="Reporting Lag", sort=labels),
    y=alt.Y("sum(n_diff):Q", title="Reported Deaths"),
    color=alt.Color(
        "day(publication_date):N", title="Publication Day", sort=week_order
    ),
)

text = (
    alt.Chart(url)
    .mark_text(align="right", x=95, y=28, fontSize=20)
    .encode(alt.Text("sum(n_diff):Q"),)
)

(hist + text).facet(
    facet=alt.Facet("publication_date:T", title="Reported Deaths per Day"),
    columns=7,
).transform_filter("datum.date >= datetime(2020,3,6)")