In [None]:
%load_ext autoreload
%autoreload 2
%aimport utils_1_0

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
from os.path import join
import datetime
import dateutil.parser

from constants_1_0 import COLUMNS, DATA_AGGREGATE_TYPES
from utils_1_0 import (
    read_combined_daily_counts_df, 
    read_combined_by_country_daily_counts_df,
    apply_theme, get_country_color_map,
    get_visualization_subtitle
)
from web import for_website

In [None]:
# Name of the category column and the category this notebook compares against JHU.
CATEGORY = "category"
CATEGORY_OF_INTEREST = "new_positive_cases"

country_color_map = get_country_color_map()

# Date window used for filtering and for the chart x-domains.
# Both bounds are placed at 01:00 rather than midnight.
min_date = datetime.datetime(2020, 1, 27, hour=1)
max_date = datetime.datetime(2020, 3, 30, hour=1)

# Countries have different ids in the JHU data than in the 4CE data
country_map = dict(US="USA")

In [None]:
def preprocess_daily_df(df_dc, category_of_interest=None):
    """Reshape the wide daily-counts table to long form and keep one category.

    Adapted from 02_daily_counts_altair.ipynb.

    Parameters
    ----------
    df_dc : pandas.DataFrame
        Wide table holding the ``COLUMNS.SITE_ID`` and ``COLUMNS.DATE``
        columns, one count column per category, and the matching
        ``masked_upper_bound_*``, ``unmasked_sites_*`` and ``masked_sites_*``
        columns. The input frame itself is not modified.
    category_of_interest : str, optional
        Category whose rows are returned. When omitted, the module-level
        ``CATEGORY_OF_INTEREST`` is used.

    Returns
    -------
    pandas.DataFrame
        Long-form rows of the selected category with:
        - ``under``: the reported count,
        - ``upper``: the reported count plus the masked upper bound,
        - the count column replaced by the midpoint of ``under`` and ``upper``,
        - the number of sites (unmasked + masked); rows where it is 0 are dropped,
        - ``siteid`` renamed to ``country`` and ``num_patients`` renamed to ``count``.
    """
    if category_of_interest is None:
        category_of_interest = CATEGORY_OF_INTEREST

    category_col = "category"
    categories = [COLUMNS.NEW_POSITIVE_CASES, COLUMNS.PATIENTS_IN_ICU, COLUMNS.NEW_DEATHS]

    # Per-category helper columns: carried through the melt as id columns and
    # dropped again once the derived values have been computed.
    helper_columns = [
        COLUMNS.MASKED_UPPER_BOUND_NEW_POSITIVE_CASES,
        COLUMNS.MASKED_UPPER_BOUND_PATIENTS_IN_ICU,
        COLUMNS.MASKED_UPPER_BOUND_NEW_DEATHS,
        COLUMNS.UNMASKED_SITES_NEW_POSITIVE_CASES,
        COLUMNS.UNMASKED_SITES_PATIENTS_IN_ICU,
        COLUMNS.UNMASKED_SITES_NEW_DEATHS,
        COLUMNS.MASKED_SITES_NEW_POSITIVE_CASES,
        COLUMNS.MASKED_SITES_PATIENTS_IN_ICU,
        COLUMNS.MASKED_SITES_NEW_DEATHS,
    ]

    # Wide to long
    long_df = pd.melt(df_dc, id_vars=[COLUMNS.SITE_ID, COLUMNS.DATE] + helper_columns)
    long_df = long_df.rename(columns={"variable": category_col, "value": COLUMNS.NUM_PATIENTS})

    # Each long-form row only uses the helper columns that belong to its own category.
    for c in categories:
        is_c = long_df[category_col] == c
        # Read the reported count before it is overwritten with the midpoint below.
        reported = long_df.loc[is_c, COLUMNS.NUM_PATIENTS]
        masked_bound = long_df.loc[is_c, "masked_upper_bound_" + c]

        long_df.loc[is_c, "upper"] = reported + masked_bound
        long_df.loc[is_c, "under"] = reported
        long_df.loc[is_c, COLUMNS.NUM_PATIENTS] = reported + masked_bound / 2.0

        # Add num of sites
        long_df.loc[is_c, COLUMNS.NUM_SITES] = (
            long_df.loc[is_c, "unmasked_sites_" + c] + long_df.loc[is_c, "masked_sites_" + c]
        )

    # Drop unused columns
    long_df = long_df.drop(columns=helper_columns)

    # Remove zero num_sites
    long_df = long_df[long_df[COLUMNS.NUM_SITES] != 0]

    long_df = long_df.loc[long_df[category_col] == category_of_interest]
    return long_df.rename(columns={"siteid": "country", "num_patients": "count"})


# DailyCounts-CombinedByCountry.csv
daily_counts_by_country_url = "https://ndownloader.figshare.com/files/22346625"

# Download the wide table and immediately reshape it to the long form used below.
df_dc = preprocess_daily_df(pd.read_csv(daily_counts_by_country_url))
df_dc.head()

In [None]:
# We only need the JHU data for the countries that exist in the 4CE data.
# drop_duplicates keeps the first occurrence of each country, in order of appearance.
unique_countries = df_dc["country"].drop_duplicates().tolist()
unique_countries

In [None]:
# Parse date strings into date objects.
def convert_date(date_str):
    """Parse a date string, returning NaN when the text cannot be parsed.

    Parameters
    ----------
    date_str : str
        Text handed to ``dateutil.parser.parse``.

    Returns
    -------
    datetime.datetime or float
        The parsed value, or ``np.nan`` when parsing raised ``ValueError``,
        ``OverflowError`` or ``TypeError``.
    """
    try:
        return dateutil.parser.parse(date_str)
    except (ValueError, OverflowError, TypeError):
        # Only parse failures are mapped to NaN. Anything else (for example a
        # KeyboardInterrupt while the cell is running) propagates instead of
        # being silently turned into missing data.
        return np.nan

In [None]:
# Transform the JHU data.
# The URL points at a fixed commit of the JHU CSSE repository, so every run reads the same snapshot.
jhu_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/dcd4181613f512a6f75249fc77b63286aebe7271/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
jhu_df = pd.read_csv(jhu_url)

jhu_df = jhu_df.rename(columns={"Country/Region": "country", "Province/State": "state"})
jhu_df = jhu_df.drop(columns=["Lat", "Long"])

# Translate JHU country ids to the 4CE ids; ids without a mapping pass through unchanged.
jhu_df["country"] = jhu_df["country"].apply(lambda c: country_map.get(c, c))
jhu_df = jhu_df.loc[jhu_df["country"].isin(unique_countries)]
# Keep only the rows that have no province/state value.
jhu_df = jhu_df.loc[jhu_df["state"].isna()]
jhu_df = jhu_df.drop(columns=["state"])

# Wide (one column per date) to long (one row per country and date).
jhu_df = jhu_df.melt(id_vars=["country"], var_name="date", value_name="cumulative_count")
jhu_df["date"] = jhu_df["date"].astype(str)
jhu_df["date"] = jhu_df["date"].apply(convert_date)
jhu_df = jhu_df.sort_values(by="date", ascending=True)
jhu_df = jhu_df.loc[(jhu_df["date"] >= min_date) & (jhu_df["date"] <= max_date)]

jhu_df_freeze = jhu_df.copy()

# Collect the per-country frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row-block by
# row-block re-copies everything accumulated so far on each iteration.
jhu_country_frames = []
for country, country_df in jhu_df.groupby("country"):
    country_df = country_df.copy()
    # Day-over-day difference of the cumulative count; the first row has no predecessor, so it is NaN.
    # This must be computed before zeros are replaced by NaN below.
    country_df["change"] = country_df["cumulative_count"].diff()
    country_df["cumulative_count"] = country_df["cumulative_count"].replace(0, np.nan)

    cumulative_count_max = country_df["cumulative_count"].max()

    # Normalize by the country's largest cumulative count within the window.
    country_df["change"] = country_df["change"] / cumulative_count_max

    jhu_country_frames.append(country_df)

if jhu_country_frames:
    jhu_roc_df = pd.concat(jhu_country_frames, ignore_index=True)
else:
    # No matching country: keep an empty frame with the expected key columns.
    jhu_roc_df = pd.DataFrame(columns=["country", "date", "cumulative_count"])
jhu_roc_df.head()

In [None]:
# Transform the 4CE data to obtain normalized change values.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the previous df_dc (SettingWithCopyWarning).
df_dc = df_dc.loc[df_dc["category"] == CATEGORY_OF_INTEREST].copy()
df_dc["date"] = df_dc["date"].astype(str)
df_dc["date"] = df_dc["date"].apply(convert_date)
df_dc = df_dc.sort_values(by="date", ascending=True)
df_dc = df_dc.loc[(df_dc["date"] >= min_date) & (df_dc["date"] <= max_date)]

df_dc_freeze = df_dc.copy()

# (daily column, cumulative column) pairs computed for every country.
cumulative_columns = [
    ("count", "cumulative_count"),
    ("upper", "cumulative_upper"),
    ("under", "cumulative_under"),
]

# Collect the per-country frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies everything accumulated so far on each iteration.
dc_country_frames = []
for country, country_df in df_dc.groupby("country"):
    country_df = country_df.copy()
    for daily_col, cumulative_col in cumulative_columns:
        # Running total over the date-sorted rows; zeros are replaced by NaN.
        running_total = np.cumsum(country_df[daily_col].values)
        country_df[cumulative_col] = running_total
        country_df[cumulative_col] = country_df[cumulative_col].replace(0, np.nan)

    cumulative_count_max = country_df["cumulative_count"].max()

    # All three change columns are normalized by the same value: the country's
    # largest cumulative (midpoint) count within the window.
    country_df["change"] = country_df["count"] / cumulative_count_max
    country_df["change_upper"] = country_df["upper"] / cumulative_count_max
    country_df["change_under"] = country_df["under"] / cumulative_count_max

    dc_country_frames.append(country_df)

dc_key_columns = ["country", "date", "count"]
if dc_country_frames:
    dc_roc_df = pd.concat(dc_country_frames, ignore_index=True)
    # Put the key columns first, followed by the remaining columns in their existing order.
    dc_roc_df = dc_roc_df[dc_key_columns + [c for c in dc_roc_df.columns if c not in dc_key_columns]]
else:
    # No rows in the window: keep an empty frame with the expected key columns.
    dc_roc_df = pd.DataFrame(columns=dc_key_columns)
dc_roc_df.head()

### Visualization of normalized change and country cumulative counts

In [None]:
# Keep only the rows inside the plotted date window (both bounds inclusive).
dc_roc_df = dc_roc_df.loc[dc_roc_df["date"].between(min_date, max_date)]
jhu_roc_df = jhu_roc_df.loc[jhu_roc_df["date"].between(min_date, max_date)]

color_scale = alt.Scale(
    domain=list(country_color_map.keys()),
    range=list(country_color_map.values()),
)

# Clicking a legend entry selects a country; unselected countries are drawn in light gray.
country_selection = alt.selection_multi(fields=["country"], bind="legend")
country = alt.condition(
    country_selection,
    alt.Color("country:N", scale=color_scale, legend=alt.Legend(title="Country")),
    alt.value("#EAEAEA"),
)

date_domain = [
    alt.DateTime(year=bound.year, month=bound.month, date=bound.day)
    for bound in (min_date, max_date)
]
date_scale = alt.X("date:T", scale=alt.Scale(domain=date_domain), title="Date")


pct_domain = [0.0, 0.25]
count_domain = [1, 1000000]


def _change_axis(field):
    """Y channel for a normalized-change field, on the fixed linear domain."""
    return alt.Y(field, scale=alt.Scale(domain=pct_domain), title="Normalized Change")


def _cumulative_axis(field):
    """Y channel for a cumulative-count field, on the fixed log domain."""
    return alt.Y(field, scale=alt.Scale(type="log", domain=count_domain), title="Cumulative Count")


# Top-left: 4CE normalized change with its under/upper band.
dc_change_line = (
    alt.Chart(dc_roc_df)
        .mark_line()
        .encode(x=date_scale, y=_change_axis("change:Q"), color=country)
        .properties(title="Rate of Change per Country (4CE)")
)
dc_change_band = (
    alt.Chart(dc_roc_df)
        .mark_errorband()
        .encode(x=date_scale, y=_change_axis("change_upper:Q"), y2="change_under:Q", color=country)
)
dc_change_panel = (
    alt.layer(dc_change_line, dc_change_band)
        .resolve_scale(y="shared")
        .interactive()
)

# Top-right: 4CE cumulative counts with their under/upper band.
dc_cumulative_line = (
    alt.Chart(dc_roc_df)
        .mark_line()
        .encode(x=date_scale, y=_cumulative_axis("cumulative_count:Q"), color=country)
        .properties(title="Country Cumulative Counts (4CE)")
)
dc_cumulative_band = (
    alt.Chart(dc_roc_df)
        .mark_errorband()
        .encode(x=date_scale, y=_cumulative_axis("cumulative_upper:Q"), y2="cumulative_under:Q", color=country)
)
dc_cumulative_panel = (
    alt.layer(dc_cumulative_line, dc_cumulative_band)
        .resolve_scale(color="shared", y="shared", x="shared")
)

# Bottom row: the same two views for the JHU data (lines only).
jhu_change_panel = (
    alt.Chart(jhu_roc_df)
        .mark_line()
        .encode(x=date_scale, y=_change_axis("change:Q"), color=country)
        .properties(title="Rate of Change per Country (JHU)")
)
jhu_cumulative_panel = (
    alt.Chart(jhu_roc_df)
        .mark_line()
        .encode(x=date_scale, y=_cumulative_axis("cumulative_count:Q"), color=country)
        .properties(title="Country Cumulative Counts (JHU)")
)

# 2x2 grid: 4CE on top, JHU below; the legend selection is attached to the whole grid.
plot = alt.vconcat(
    alt.hconcat(dc_change_panel, dc_cumulative_panel),
    alt.hconcat(jhu_change_panel, jhu_cumulative_panel),
).add_selection(country_selection)

plot

### Transform data for plots faceted by country

In [None]:
# Label every row with the dataset it came from; assign() returns a new frame,
# so the frames bound to these names before this cell are left untouched.
jhu_roc_df = jhu_roc_df.assign(source="JHU CSSE")
dc_roc_df = dc_roc_df.assign(source="4CE")

In [None]:
# Preview the JHU frame, which now carries the "source" column.
jhu_roc_df.head()

In [None]:
# Preview the 4CE frame, which now carries the "source" column.
dc_roc_df.head()

In [None]:
# Stack the JHU and 4CE rows into one frame (JHU rows first).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
join_df = pd.concat([jhu_roc_df, dc_roc_df], ignore_index=True)
# Combined "<country>_<source>" key, built with vectorized string concatenation
# instead of a row-by-row apply.
join_df["country_source"] = join_df["country"] + "_" + join_df["source"]
join_df.head()

### Visualization of normalized change faceted by country, with number of sites data

In [None]:
title = "New Positive Cases, Change by Country, Comparison to JHU CSSE Data"

# Legend-bound selection used to filter the lines by data source.
source_selection = alt.selection_multi(fields=["source"], bind="legend")

date_domain = [
    alt.DateTime(year=bound.year, month=bound.month, date=bound.day)
    for bound in (min_date, max_date)
]

# Fixed y-domains for the bar rows, one unit above the largest observed value.
sites_domain = [0, dc_roc_df["num_sites"].max() + 1]
patients_domain = [0, dc_roc_df["count"].max() + 1]

country_names = list(country_color_map.keys())
country_colors = list(country_color_map.values())
# All 4CE keys first, then all JHU keys, matching the order of the color range below.
country_source_names = [
    name + "_" + source_name
    for source_name in ("4CE", "JHU CSSE")
    for name in country_names
]
color_scale = alt.Scale(domain=country_names, range=country_colors)
# 4CE lines get the country color; every JHU line gets the same gray.
join_color_scale = alt.Scale(
    domain=country_source_names,
    range=country_colors + ["#707070"] * len(country_names),
)

country_width = 170

nearest = alt.selection_single(
    encodings=['x', 'y'],
    on="mouseover",
    nearest=True,
    empty="none",
    clear="mouseout",
)
y_selection = alt.selection_interval(encodings=["y"], bind="scales")
date_brush = alt.selection(type='interval', encodings=['x'])

# (field, Tooltip options) in display order.
tooltip_fields = [
    ("source", {"title": "Data source"}),
    ("country", {"title": "Country"}),
    ("change", {"title": "Normalized change", "format": ".3f"}),
    ("num_sites", {"title": "Number of sites"}),
    ("count", {"title": "Number of new cases"}),
    ("date", {"title": "Date"}),
]
tooltip = [alt.Tooltip(field, **options) for field, options in tooltip_fields]

# Thin red vertical rule at the date of the point nearest to the mouse.
# The chart has no data of its own; it is given data when it is layered.
rule = (
    alt.Chart()
        .mark_rule(color="red", size=0.5)
        .encode(x="date:T")
        .transform_filter(nearest)
)

line_x = alt.X(
    "date:T",
    title=None,
    axis=alt.Axis(labelBound=True),
    scale=alt.Scale(padding=5),
)
line_y = alt.Y(
    "change:Q",
    axis=alt.Axis(title="Normalized change"),
    scale=alt.Scale(zero=False, nice=False, padding=5),
)
# Solid stroke for 4CE, dashed stroke for JHU CSSE.
line_dash = alt.StrokeDash(
    "source:N",
    scale=alt.Scale(domain=["4CE", "JHU CSSE"], range=[[0,0], [3,3]]),
    legend=alt.Legend(title="Data Source"),
)
line_color = alt.Color("country_source:N", scale=join_color_scale, legend=None)

# Normalized-change lines, restricted to the selected sources and the brushed dates.
line = (
    alt.Chart(join_df)
        .transform_filter(source_selection)
        .transform_filter(date_brush)
        .mark_line(opacity=0.7)
        .encode(
            x=line_x,
            y=line_y,
            strokeDash=line_dash,
            color=line_color,
            tooltip=tooltip,
        )
        .properties(width=country_width, height=200)
)

# Same encoding as the lines, drawn as points; the point nearest to the mouse is enlarged.
point_size = alt.condition(~nearest, alt.value(5), alt.value(30))
circle = line.mark_circle().encode(size=point_size).add_selection(nearest)

def _bar_layers(y_field, y_title, y_domain):
    """Build the two bar layers for one 4CE measure.

    Returns ``(background, highlighted)``: the background draws every bar in
    gray, and the highlighted copy recolors the bars by country and keeps only
    the dates inside ``date_brush``.
    """
    background = (
        alt.Chart(dc_roc_df)
            .mark_bar(size=2)
            .encode(
                x=alt.X(
                    "date:T",
                    scale=alt.Scale(domain=date_domain, padding=5),
                    title=None,
                    axis=alt.Axis(labelBound=True),
                ),
                y=alt.Y(y_field, axis=alt.Axis(title=y_title), scale=alt.Scale(domain=y_domain)),
                color=alt.value("gray"),
                tooltip=tooltip,
            )
            .properties(width=country_width, height=60)
    )
    highlighted = (
        background
            .encode(color=alt.Color("country:N", scale=color_scale, legend=None))
            .transform_filter(date_brush)
    )
    return background, highlighted


# Number of contributing sites per day.
num_sites_bar_bg, num_sites_bar = _bar_layers("num_sites:Q", "# of sites", sites_domain)

# Number of new cases per day.
num_patients_bar_bg, num_patients_bar = _bar_layers("count:Q", "# of new cases", patients_domain)


def _bottom_row(background, highlighted):
    """Facet one pair of bar layers (plus the hover rule) into a column per country.

    Column header labels are hidden, and both the hover selection and the date
    brush are attached to the faceted row.
    """
    layered = alt.layer(background, highlighted, rule, data=dc_roc_df)
    faceted = layered.facet(
        column=alt.Column("country:N", header=alt.Header(labels=False)),
        bounds="flush",
    )
    return faceted.add_selection(nearest).add_selection(date_brush)


# Line row: one column per country, header labels left visible; the y axis is
# bound to the scales through y_selection.
top_layers = alt.layer(line, circle, rule, data=join_df)
top = (
    top_layers
        .facet(column=alt.Column("country:N"), bounds="flush")
        .add_selection(y_selection)
)

num_sites_bottom = _bottom_row(num_sites_bar_bg, num_sites_bar)

num_patients_bottom = _bottom_row(num_patients_bar_bg, num_patients_bar)

# Stack the rows: lines on top, then new cases, then number of sites.
stacked_rows = (
    alt.vconcat(top, num_patients_bottom, num_sites_bottom, spacing=5)
        .resolve_scale(color="shared", x="independent")
)
plot = stacked_rows.properties(
    title={
        "text": title,
        "subtitle": get_visualization_subtitle(),
        "subtitleColor": "gray",
        "dx": 60,
    }
).add_selection(source_selection)

theme_options = dict(
    axis_label_font_size=10,
    axis_title_font_size=12,
    axis_title_padding=8,
    legend_orient="bottom",
    legend_symbol_type="stroke",
    legend_title_orient="left",
    legend_title_font_size=14,
    label_font_size=12,
)
plot = apply_theme(plot, **theme_options)
plot = plot.configure_header(title=None, labelPadding=3, labelFontSize=13)

# Hand the finished chart to the project's website helper.
for_website(plot, "Daily Count", "Normalized change by country")

plot