In [None]:
%load_ext autoreload
%autoreload 2
%aimport utils_1_1

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
import datetime
import dateutil.parser
from os.path import join

from constants_1_1 import SITE_FILE_TYPES
from utils_1_1 import (
    read_loinc_df,
    get_site_file_paths,
    get_site_file_info,
    get_site_ids,
    read_full_daily_counts_df,
    get_visualization_subtitle,
    get_country_color_map,
    apply_theme,
    merge_single_site_country_name,
)
from web import for_website

alt.data_transformers.disable_max_rows(); # Lift Altair's default 5000-row limit so larger tables can be passed to charts

## The 4CE Health Systems Participating spreadsheet must be downloaded from Google Sheets as a tab-separated (TSV) file and moved to

```
data/Health_Systems_Participating.tsv
```

because we need the number of hospitals, beds, and inpatient discharges per year for each site in order to compute the country-level rates.

In [None]:
# Global analysis window applied to both the 4CE and the JHU series.
# Both bounds are placed at 01:00 instead of midnight (presumably so that
# midnight-stamped dates compare unambiguously against them -- TODO confirm).
min_date = datetime.datetime(year=2020, month=1, day=27, hour=1)
max_date = datetime.datetime(year=2020, month=8, day=16, hour=1)

In [None]:
# Data release identifier; passed to get_visualization_subtitle for the chart subtitle.
DATA_RELEASE = "2020-09-19"
# Cohort label; passed to get_visualization_subtitle for the chart subtitle.
COHORT = "Adult"
# When True, country names are run through merge_single_site_country_name before
# aggregation (presumably producing the "Germany + Singapore + UK" group whose
# population/discharge entries are defined below -- TODO confirm against utils_1_1).
MERGE_SINGLE_SITE_COUNTRIES = True

In [None]:
# NOTE(review): neither constant is referenced again in the visible part of this
# notebook -- confirm whether they are still needed.
CATEGORY = "category"
CATEGORY_OF_INTEREST = "new_positive_cases"

In [None]:
# Total population per country, keyed by the country names used in the 4CE data.
COUNTRY_POPULATION = {
    # From https://data.worldbank.org/indicator/SP.POP.TOTL
    "France": 67059887,
    "USA": 328239523,
    "Germany": 83132799,
    "Italy": 60297396,
    "Singapore": 5703569,
    "Spain": 47076781,
    "UK": 66834405,
}
# Yearly hospital discharges per 100,000 inhabitants for each country; these
# values are stored later in the `country_num_yearly_discharge_per_100000` column.
COUNTRY_HOSP_DISCHARGE = {
    # From https://data.oecd.org/healthcare/hospital-discharge-rates.htm
    "France": 18553.0,
    "USA": 10906.2, # https://hcup-us.ahrq.gov/faststats/NationalTrendsServlet
    "Germany": 25478.4,
    "Italy": 11414.6,
    "Singapore": 12700.4, # https://www.moh.gov.sg/resources-statistics/healthcare-institution-statistics/hospital-admission-rates-by-age-and-sex/hospital-admission-rates-by-age-and-sex-2017
    "Spain": 10470.5,
    "UK": 12869.4,
}

In [None]:
# Entries for the merged single-site-country group.
# Populations are absolute counts and can simply be added.
COUNTRY_POPULATION["Germany + Singapore + UK"] = COUNTRY_POPULATION["Germany"] + COUNTRY_POPULATION["Singapore"] + COUNTRY_POPULATION["UK"]
# Discharge figures are rates per 100,000 inhabitants, so adding them would
# overstate the group's rate. Use the population-weighted rate instead, so that
# rate * population / 100000 equals the sum of the three countries' discharges.
COUNTRY_HOSP_DISCHARGE["Germany + Singapore + UK"] = sum(
    COUNTRY_HOSP_DISCHARGE[c] * COUNTRY_POPULATION[c] for c in ("Germany", "Singapore", "UK")
) / COUNTRY_POPULATION["Germany + Singapore + UK"]

In [None]:
# Load the site-level daily counts table (presumably one row per site and
# calendar date -- TODO confirm against read_full_daily_counts_df).
df = read_full_daily_counts_df()
df.head()

## Remove pediatric sites

In [None]:
# Keep only rows whose pediatric flag is False; the flag column is redundant afterwards.
is_not_pediatric = df["pediatric"].eq(False)
df = df.loc[is_not_pediatric].drop(columns=["pediatric"])
df.head()

In [None]:
# RP401 is only listed as a pediatric site, so its remaining (non-pediatric)
# rows are discarded as well.
df = df.loc[df["siteid"] != "RP401"]

In [None]:
# Treat the sentinel codes -99 and -999 as missing values (presumably the 4CE
# markers for obfuscated / unavailable counts -- TODO confirm).
df = df.replace([-99, -999], np.nan)

In [None]:
# Every original row counts as one reporting site; rows synthesised later by the
# forward-fill step get num_sites = 0 instead.
df["num_sites"] = 1

In [None]:
# We only need the JHU data for the countries that exist in the 4CE data.
COUNTRIES = df["country"].unique().tolist()
COUNTRIES

In [None]:
# Expose calendar_date under the shorter name "date" (added as the last column).
df = df.assign(date=df["calendar_date"]).drop(columns=["calendar_date"])

## Load participating sites metadata

In [None]:
# Positional columns of interest in the spreadsheet export and the names given to them.
sites_column_map = {
    0: "site_name",
    1: "siteid",
    2: "city",
    3: "country",
    4: "patient_type",
    6: "adult_num_hosp",
    7: "adult_num_beds",
    8: "adult_num_yearly_discharge",
    10: "ped_num_hosp",
    11: "ped_num_beds",
    12: "ped_num_yearly_discharge",
}

# The export has two leading rows that are skipped and no usable header row.
sites_path = join("..", "data", "Health_Systems_Participating.tsv")
sites_df = pd.read_csv(sites_path, sep='\t', skiprows=2, header=None, thousands=',')
sites_df = sites_df.rename(columns=sites_column_map)
sites_df = sites_df[list(sites_column_map.values())]

# Flag pediatric sites, then drop rows that carry no site name.
sites_df["pediatric"] = sites_df["patient_type"] == "Pediatric"
sites_df = sites_df.dropna(subset=["site_name"])
sites_df.tail()

In [None]:
# Keep only the sites that are not flagged as pediatric.
sites_df = sites_df.loc[sites_df["pediatric"].eq(False)]

## Take intersection of sites that have provided valid num_yearly_discharge counts and sites that have provided daily counts data

In [None]:
# Strip any remaining thousands separators and cast the adult capacity columns to float.
for numeric_col in ("adult_num_hosp", "adult_num_beds", "adult_num_yearly_discharge"):
    without_commas = sites_df[numeric_col].map(lambda raw: str(raw).replace(",", ""))
    sites_df[numeric_col] = without_commas.astype(float)

In [None]:
# Display the non-pediatric site metadata after the numeric conversion, for a visual check.
sites_df

In [None]:
# Sites without a yearly-discharge figure cannot be normalised, so drop them first.
sites_df = sites_df.dropna(subset=["adult_num_yearly_discharge"])

sites_in_sites_df = sites_df["siteid"].unique().tolist()
sites_in_df = df["siteid"].unique().tolist()

# Keep only the sites present in both the metadata and the daily counts.
intersecting_sites = set(sites_in_sites_df) & set(sites_in_df)
df = df.loc[df["siteid"].isin(intersecting_sites)]
sites_df = sites_df.loc[sites_df["siteid"].isin(intersecting_sites)]

intersecting_sites

In [None]:
# Number of sites left after removing the pediatric sites
# and after taking the intersection with the site metadata.
NUM_SITES = len(df["siteid"].unique())

In [None]:
# Site ids that had a yearly-discharge figure in the metadata (before intersecting with the daily counts).
sites_in_sites_df

## If site is missing data for a particular date, use the most recent previous data point for that date

In [None]:
def convert_date(date_str):
    """Parse a date string into a ``datetime.datetime``.

    Args:
        date_str: Text to parse with ``dateutil.parser.parse``.

    Returns:
        The parsed ``datetime.datetime``, or ``np.nan`` when the value cannot be
        parsed (unparseable text, out-of-range components, or a non-string such
        as a missing value).
    """
    try:
        return dateutil.parser.parse(date_str)
    except (ValueError, OverflowError, TypeError):
        # ValueError covers dateutil's ParserError; TypeError covers non-string
        # input such as NaN. Anything else is a real bug and should surface.
        return np.nan

In [None]:
# Forward-fill each site's series: for every calendar day from the site's first
# report up to (but not including) max_date that has no row, repeat the most
# recent earlier row with num_sites set to 0, so the filled day is not counted
# as a reporting site.
max_date_str = str(max_date).split(" ")[0]
fill_end = dateutil.parser.parse(max_date_str)

site_frames = []
for _, site_df in df.groupby("siteid"):
    # Deliberately NOT named `min_date`: that name is the global start of the
    # analysis window and is read again by later cells.
    site_start = dateutil.parser.parse(str(site_df["date"].min()).split(" ")[0])
    num_days = (fill_end - site_start).days

    site_df = site_df.copy()
    site_df["date"] = site_df["date"].astype(str)

    # First existing row for every date string, so each day is a dictionary lookup.
    first_row_by_date = {}
    for record in site_df.to_dict("records"):
        first_row_by_date.setdefault(record["date"], record)

    filled_rows = []
    prev_date_row = None
    for day_offset in range(num_days):
        curr_date_str = str(site_start + datetime.timedelta(days=day_offset)).split(" ")[0]
        if curr_date_str in first_row_by_date:
            prev_date_row = first_row_by_date[curr_date_str]
        else:
            # The first day is the site's own minimum date, so an earlier row always exists here.
            filled_rows.append({**prev_date_row, "date": curr_date_str, "num_sites": 0})

    if filled_rows:
        site_df = pd.concat([site_df, pd.DataFrame(filled_rows)], ignore_index=True)
    site_frames.append(site_df)

all_date_country_df = pd.concat(site_frames, ignore_index=True) if site_frames else pd.DataFrame()

all_date_country_df["date"] = all_date_country_df["date"].apply(convert_date)
df = all_date_country_df

## Subtract severe patients from all patients to get the "never severe"-like count

In [None]:
# Lookup from country name to colour (it is indexed by country name in the
# plotting cell), built with the same single-site merge setting as the data.
country_color_map = get_country_color_map(merge_single_site_countries=MERGE_SINGLE_SITE_COUNTRIES)

In [None]:
# Optionally fold single-site countries into their merged group name.
if MERGE_SINGLE_SITE_COUNTRIES:
    df["country"] = df["country"].apply(merge_single_site_country_name)

# Sum the site-level rows into one row per (country, date).
country_sum_df = df.groupby(["country", "date"]).sum().reset_index()
country_sum_df.head()
# Recompute the country list so that it reflects the (possibly merged) names.
COUNTRIES = country_sum_df["country"].unique().tolist()

In [None]:
# "Never severe"-like counts at country level: all patients minus severe patients.
# Both operands must come from the country-level frame; subtracting columns of the
# site-level `df` would be aligned by row index and would write unrelated
# site rows into the country totals.
country_sum_df["num_patients_in_hospital_on_this_date_minus_severe"] = (
    country_sum_df["num_patients_in_hospital_on_this_date"]
    - country_sum_df["num_patients_in_hospital_and_severe_on_this_date"]
)
country_sum_df["cumulative_patients_all_minus_severe"] = (
    country_sum_df["cumulative_patients_all"]
    - country_sum_df["cumulative_patients_severe"]
)

## Obtain country-level daily counts from JHU CSSE

In [None]:
# JHU CSSE global confirmed-cases time series, pinned to a fixed commit so that
# every run reads the same snapshot.
jhu_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/2335ac3d04721e2f7285e320badc2c4c6c416894/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
jhu_df = pd.read_csv(jhu_url)

jhu_df = jhu_df.rename(columns={"Country/Region": "country", "Province/State": "state"})
jhu_df = jhu_df.drop(columns=["Lat", "Long"])

# Countries have different ids in the JHU data than in the 4CE data
country_map = {
    "US": "USA",
    "United Kingdom": "UK"
}

jhu_df["country"] = jhu_df["country"].apply(lambda c: country_map.get(c, c))
# Keep only the rows that carry no province/state value.
jhu_df = jhu_df.loc[jhu_df["state"].isna()]
jhu_df = jhu_df.drop(columns=["state"])
if MERGE_SINGLE_SITE_COUNTRIES:
    jhu_df["country"] = jhu_df["country"].apply(merge_single_site_country_name)
jhu_df = jhu_df.loc[jhu_df["country"].isin(COUNTRIES)]

# Wide (one column per date) -> long, then sum rows that now share a country name.
jhu_df = jhu_df.melt(id_vars=["country"], var_name="date", value_name="cumulative_count")
jhu_df["date"] = jhu_df["date"].astype(str)
jhu_df = jhu_df.groupby(["country", "date"]).sum().reset_index()
jhu_df["date"] = jhu_df["date"].apply(convert_date)
jhu_df = jhu_df.sort_values(by="date", ascending=True)
jhu_df = jhu_df.loc[(jhu_df["date"] >= min_date) & (jhu_df["date"] <= max_date)].copy()
jhu_df["date_str"] = jhu_df["date"].astype(str)

jhu_df_freeze = jhu_df.copy()

# Per-country rate of change. Frames are collected in a list and concatenated
# once: DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
jhu_country_frames = []
for country, country_df in jhu_df.groupby("country"):
    country_df = country_df.copy()
    # Daily new cases; the first day has no predecessor and stays NaN.
    country_df["count"] = np.concatenate((np.array([np.nan]), np.diff(country_df["cumulative_count"].values)))
    country_df["cumulative_count"] = country_df["cumulative_count"].replace(0, np.nan)

    country_df["N0"] = country_df["cumulative_count"].shift(1) # N0 is the total case up to the day before
    country_df["n1"] = country_df["count"] # n1 is the case number this day
    country_df["n2"] = country_df["n1"].shift(1) # n2 is the case number yesterday

    country_df["percent_increase"] = (country_df["n1"] / country_df["N0"]) * 100

    country_df['R'] = country_df["percent_increase"] # TODO: is this correct?
    # TODO: update CI formula
    country_df['C'] = country_df['R'] - 1
    country_df['standard_error'] = (country_df['R'] + np.power(country_df['R'], 2)) / country_df['n2']
    ci_half_width = 1.96 * np.sqrt(country_df['standard_error'])
    country_df['95_CI_below'] = country_df['C'] - ci_half_width
    country_df['95_CI_above'] = country_df['C'] + ci_half_width
    # Divisions by zero above produce +/-inf; treat them as missing.
    country_df = country_df.replace([np.inf, -np.inf], np.nan)

    jhu_country_frames.append(country_df)

if jhu_country_frames:
    jhu_roc_df = pd.concat(jhu_country_frames, ignore_index=True)
else:
    jhu_roc_df = pd.DataFrame(index=[], data=[], columns=["country", "date", "cumulative_count"])
jhu_roc_df.head()

In [None]:
def get_jhu_cumulative_count(date_str, country):
    """Look up the JHU cumulative case count for one country on one date.

    Reads the module-level ``jhu_roc_df``.

    Args:
        date_str: Date in the same string form as ``jhu_roc_df["date_str"]``.
        country: Country name as used in ``jhu_roc_df["country"]``.

    Returns:
        The ``cumulative_count`` of the first matching row (may be NaN), or 0
        when no row matches.
    """
    is_match = (jhu_roc_df["date_str"] == date_str) & (jhu_roc_df["country"] == country)
    matches = jhu_roc_df.loc[is_match, "cumulative_count"]
    if matches.empty:
        # No JHU row for this country/date: report zero cases so the caller's
        # threshold comparison evaluates to False.
        return 0
    return matches.iloc[0]
# Plotting starts once a country has reached this many cumulative JHU cases.
count_threshold = 100
jhu_roc_df["jhu_past_100"] = jhu_roc_df["cumulative_count"].ge(count_threshold)

In [None]:
# Transform the 4CE data to obtain normalized change values.
def compute_change_4ce(df_dc, cumulative_count_colname, daily_count_colname, category):
    """Compute per-country daily rate-of-change statistics for one 4CE category.

    Relies on the module-level ``min_date``, ``max_date``, ``count_threshold``,
    ``convert_date`` and ``get_jhu_cumulative_count``.

    Args:
        df_dc: Country-level daily counts with at least the columns ``country``,
            ``date``, ``num_sites`` and the two columns named below. The frame
            is not modified.
        cumulative_count_colname: Column holding the cumulative count.
        daily_count_colname: Column holding the daily count.
        category: Label written to the ``category`` column of the result.

    Returns:
        DataFrame restricted to ``min_date <= date <= max_date`` with the columns
        country, date, num_sites, cumulative_count, count, date_str, N0, n1, n2,
        percent_increase, R, C, standard_error, 95_CI_below, 95_CI_above,
        category and jhu_past_100.
    """
    output_columns = ["country", "date", "num_sites", "cumulative_count", "count", "date_str", "N0", "n1", "n2", "percent_increase", "R", "C", "standard_error", "95_CI_below", "95_CI_above"]

    # Work on a copy: the same frame is passed in once per category, and the
    # helper columns must not leak into it between calls.
    df_dc = df_dc.copy()
    df_dc["cumulative_count"] = df_dc[cumulative_count_colname]
    df_dc["count"] = df_dc[daily_count_colname]

    # Sort dates and restrict to the global analysis window
    df_dc["date"] = df_dc["date"].astype(str).apply(convert_date)
    df_dc = df_dc.sort_values(by="date", ascending=True)
    df_dc = df_dc.loc[(df_dc["date"] >= min_date) & (df_dc["date"] <= max_date)].copy()
    df_dc["date_str"] = df_dc["date"].astype(str)

    country_frames = []
    for country, country_df in df_dc.groupby("country"):
        country_df = country_df.copy()

        country_df["N0"] = country_df["cumulative_count"].shift(1) # N0 is the total case up to the day before
        country_df["n1"] = country_df["count"] # n1 is the case number this day
        country_df["n2"] = country_df["n1"].shift(1) # n2 is the case number yesterday

        country_df["percent_increase"] = (country_df["n1"] / country_df["N0"]) * 100

        country_df['R'] = country_df["percent_increase"] # TODO: is this correct?
        # TODO: update CI formula
        country_df['C'] = country_df['R'] - 1
        country_df['standard_error'] = (country_df['R'] + np.power(country_df['R'], 2)) / country_df['n2']
        ci_half_width = 1.96 * np.sqrt(country_df['standard_error'])
        country_df['95_CI_below'] = country_df['C'] - ci_half_width
        country_df['95_CI_above'] = country_df['C'] + ci_half_width
        # Divisions by zero above produce +/-inf; treat them as missing.
        country_df = country_df.replace([np.inf, -np.inf], np.nan)

        country_frames.append(country_df)

    # Concatenate once: DataFrame.append was removed in pandas 2.0.
    if country_frames:
        dc_roc_df = pd.concat(country_frames, ignore_index=True)[output_columns].copy()
    else:
        dc_roc_df = pd.DataFrame(columns=output_columns)

    dc_roc_df["category"] = category
    # A row is plottable once JHU reports at least `count_threshold` cumulative
    # cases for that country on that date.
    dc_roc_df["jhu_past_100"] = [
        get_jhu_cumulative_count(date_str, country) >= count_threshold
        for date_str, country in zip(dc_roc_df["date_str"], dc_roc_df["country"])
    ]
    return dc_roc_df

In [None]:
# One rate-of-change table per severity category, built from the same country-level frame.
dc_roc_df_all = compute_change_4ce(country_sum_df, "cumulative_patients_all", "num_patients_in_hospital_on_this_date", "All")
dc_roc_df_severe = compute_change_4ce(country_sum_df, "cumulative_patients_severe", "num_patients_in_hospital_and_severe_on_this_date", "Severe")
dc_roc_df_all_minus_severe = compute_change_4ce(country_sum_df, "cumulative_patients_all_minus_severe", "num_patients_in_hospital_on_this_date_minus_severe", "All minus Severe")

# Stack the three categories (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
dc_roc_df = pd.concat(
    [dc_roc_df_all, dc_roc_df_severe, dc_roc_df_all_minus_severe],
    ignore_index=True,
)
dc_roc_df.to_csv("dc_roc_df.csv")

### Transform data for plots faceted by country

In [None]:
# Tag each table with its data source; assign() returns a fresh copy each time.
jhu_roc_df = jhu_roc_df.assign(source="JHU CSSE")
dc_roc_df = dc_roc_df.assign(source="4CE")

In [None]:
# Stack the JHU and 4CE tables (pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0) and build a combined "<country>_<source>" key.
join_df = pd.concat([jhu_roc_df, dc_roc_df], ignore_index=True)
join_df["country_source"] = join_df["country"] + "_" + join_df["source"]
join_df.head()

## Normalized New Daily Cases

First, obtain total hospital discharges for each country.

$\texttt{country_total} = \text{country total in-patient-discharge}$

$\texttt{country_4CE_total} = \text{total in-patient-discharge in our sites within that country}$

$F0 = \frac{\texttt{country_total}}{\texttt{country_4CE_total}}$

$F0$ is used to normalize.

- For the new figure that shows the daily number of cases per 100K, we instead show
    - $\texttt{RATE} = \texttt{N_case} \times F1$
    - where $F1 = F0 \times \frac{100K}{\texttt{country population}}$
    - then the standard error for $\texttt{RATE}$ is $\sqrt{\texttt{RATE} \times F1}$ and the confidence interval is $\texttt{RATE} \pm 1.96 \times \sqrt{\texttt{RATE} \times F1}$

In [None]:
# Get daily new cases from cumulative_count
norm_jhu = jhu_roc_df.copy()

norm_jhu["count"] = norm_jhu["cumulative_count"].diff()

norm_jhu.loc[norm_jhu["date"] == "2020-01-28", "count"] = np.nan # Make sure the start count is NaN

norm_jhu.head()

In [None]:
# Working copy of the 4CE rate-of-change table; the normalisation columns are added to this copy.
norm_4ce = dc_roc_df.copy()

## Compute F0 value for each country using COUNTRY_POPULATION numbers

In [None]:
# Per-country totals of the participating 4CE sites.
site_total_cols = ["adult_num_hosp", "adult_num_beds", "adult_num_yearly_discharge"]
country_sites_df = sites_df.groupby("country")[site_total_cols].sum().reset_index()
country_sites_df = country_sites_df.rename(columns={
    "adult_num_hosp": "4ce_num_hosp",
    "adult_num_beds": "4ce_num_beds",
    "adult_num_yearly_discharge": "4ce_num_yearly_discharge",
})
country_sites_df["country_num_yearly_discharge_per_100000"] = country_sites_df["country"].apply(lambda c: COUNTRY_HOSP_DISCHARGE[c])
country_sites_df["country_population"] = country_sites_df["country"].apply(lambda c: COUNTRY_POPULATION[c])
# Absolute yearly discharges per country. Unlike the per-100,000 rate, this
# quantity may be summed when several countries are merged into one group.
country_sites_df["country_num_yearly_discharge"] = (
    country_sites_df["country_num_yearly_discharge_per_100000"]
    * country_sites_df["country_population"] / 100000
)

if MERGE_SINGLE_SITE_COUNTRIES:
    country_sites_df["country"] = country_sites_df["country"].apply(merge_single_site_country_name)

country_sites_df = country_sites_df.groupby(["country"]).sum().reset_index()

# The groupby above also added up the per-100,000 rates of merged countries,
# which is not a valid rate; rebuild it from the summed discharges and population.
country_sites_df["country_num_yearly_discharge_per_100000"] = (
    country_sites_df["country_num_yearly_discharge"]
    / country_sites_df["country_population"] * 100000
)

# F0 = country-wide yearly discharges / yearly discharges of the 4CE sites in that country.
country_sites_df["F1"] = country_sites_df["country_num_yearly_discharge_per_100000"] / country_sites_df["4ce_num_yearly_discharge"]
country_sites_df["F0"] = country_sites_df["F1"] * (country_sites_df["country_population"] / 100000)
country_sites_df

In [None]:
# Lookup table: country name -> F0 scaling factor.
F0 = {
    country_name: factor
    for country_name, factor in zip(
        country_sites_df["country"].values.tolist(),
        country_sites_df["F0"].values.tolist(),
    )
}
F0

In [None]:
# Attach the country population and the F0 factor to both tables.
# Direct key lookups are used so that an unknown country raises KeyError.
for frame in (norm_jhu, norm_4ce):
    frame["population"] = frame["country"].apply(COUNTRY_POPULATION.__getitem__)
    frame["F0"] = frame["country"].apply(F0.__getitem__)



In [None]:
# Compute adjusted counts
# JHU counts are already country-wide, so they are only scaled to a rate per 100,000.
norm_jhu["adjusted_count"] = norm_jhu["count"]
norm_jhu["F1"] = 100000 / norm_jhu["population"]
norm_jhu["RATE"] = norm_jhu["count"] * norm_jhu["F1"]
# Centred 7-day mean, computed within each country so that a window never
# spans the boundary between two countries in the stacked table.
norm_jhu["RATE_7_day_avg"] = norm_jhu.groupby("country")["RATE"].transform(
    lambda rate: rate.rolling(7).mean().shift(-3)
)
norm_jhu["std_error"] = np.sqrt(norm_jhu["F1"] * norm_jhu["RATE"])
norm_jhu["ci_above"] = norm_jhu["RATE_7_day_avg"] + 1.96 * norm_jhu["std_error"]
norm_jhu["ci_below"] = norm_jhu["RATE_7_day_avg"] - 1.96 * norm_jhu["std_error"]

# 4CE counts are first scaled up to the whole country with F0, then to a rate per 100,000.
norm_4ce["adjusted_count"] = norm_4ce["F0"] * norm_4ce["count"]
norm_4ce["F1"] = norm_4ce["F0"] * 100000 / norm_4ce["population"]
norm_4ce["RATE"] = norm_4ce["count"] * norm_4ce["F1"]
# The 4CE table stacks several categories per country, so the window must stay
# inside one (country, category) series; grouping by country alone would mix
# the end of one category with the start of the next.
norm_4ce["RATE_7_day_avg"] = norm_4ce.groupby(["country", "category"])["RATE"].transform(
    lambda rate: rate.rolling(7).mean().shift(-3)
)
norm_4ce["std_error"] = np.sqrt(norm_4ce["F1"] * norm_4ce["RATE"])
norm_4ce["ci_above"] = norm_4ce["RATE_7_day_avg"] + 1.96 * norm_4ce["std_error"]
norm_4ce["ci_below"] = norm_4ce["RATE_7_day_avg"] - 1.96 * norm_4ce["std_error"]

In [None]:
# Columns shared by both sources; the 4CE table additionally keeps its site count and category.
shared_columns = ['country', 'date', 'count', 'adjusted_count', 'RATE_7_day_avg', 'source', 'population', 'F0', 'F1', 'RATE', 'std_error', 'ci_above', 'ci_below', 'jhu_past_100']
norm_jhu_min_col = norm_jhu[shared_columns]
norm_4ce_min_col = norm_4ce[shared_columns + ['num_sites', 'category']]

norm_jhu_min_col.head(10)
norm_4ce_min_col.head(10)

## Temporary: select the All category of 4CE only (ignoring severity)

In [None]:
# NOTE(review): the heading above this cell says the "All" category is selected,
# but this filter keeps the "Severe" category -- confirm which one is intended.
norm_4ce_min_col = norm_4ce_min_col.loc[norm_4ce_min_col["category"] == "Severe"]

In [None]:
# Stack the JHU and 4CE tables (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
norm_df = pd.concat([norm_jhu_min_col, norm_4ce_min_col], ignore_index=True)

# Combined "<country>_<source>" key used for the line colours.
norm_df["country_source"] = norm_df["country"] + "_" + norm_df["source"]

norm_df.to_csv("norm_df.csv")

# 4CE-only rows, used for the bar charts below the main plot.
norm_fce_df = norm_df.loc[norm_df['source'] == '4CE'].copy()

norm_df

## Temporary: drop Spain, because its sites report so few discharges per year that the resulting rates are way off

In [None]:
# Exclude Spain from both the combined table and the 4CE-only table.
norm_df = norm_df.loc[norm_df["country"].ne("Spain")]
norm_fce_df = norm_fce_df.loc[norm_fce_df["country"].ne("Spain")]

In [None]:
# Trim three days from each end of the window (presumably because the centred
# 7-day average has no complete window there -- TODO confirm) and keep only
# rows past the JHU case threshold.
three_days = datetime.timedelta(days=3)
min_date_3 = min_date + three_days
max_date_3 = max_date - three_days
within_trimmed_window = (norm_df["date"] >= min_date_3) & (norm_df["date"] <= max_date_3)
norm_df = norm_df.loc[norm_df["jhu_past_100"] & within_trimmed_window]

In [None]:
title = "Country-Level Positive Case Rate, Comparison to JHU CSSE Data"

# Selection: toggles a data source on or off through the legend
source_selection = alt.selection_multi(fields=["source"], bind="legend")

# NOTE: this rebinds the global min_date / max_date to the date range that is
# actually left in norm_df, so re-running earlier cells after this one will
# see the narrowed window.
min_date = norm_df["date"].min()
max_date = norm_df["date"].max()
norm_fce_df = norm_fce_df.loc[(norm_fce_df["date"] >= min_date) & (norm_fce_df["date"] <= max_date)]

# Domains: axis ranges shared by all country facets
date_domain = [alt.DateTime(year=min_date.year, month=min_date.month, date=min_date.day), alt.DateTime(year=max_date.year, month=max_date.month, date=max_date.day)]
sites_domain = [0, norm_fce_df["num_sites"].max() + 1]
patients_domain = [0, norm_fce_df["count"].max() + 1]
rate_domain = [0, norm_fce_df["RATE_7_day_avg"].max() + 1]

# Colour scales: country colours for the 4CE series, one grey (#707070) for every JHU series
country_names = COUNTRIES
COUNTRY_COLORS = [country_color_map[c] for c in country_names]
country_source_names = [c + "_" + "4CE" for c in country_names] + [c + "_" + "JHU CSSE" for c in country_names]
color_scale = alt.Scale(domain=country_names, range=COUNTRY_COLORS)
join_color_scale = alt.Scale(domain=country_source_names, range=COUNTRY_COLORS + ["#707070"] * len(country_names))

# Width in pixels of a single country facet
country_width = 170

# Interactions: nearest-point hover, vertical zoom bound to the y scale, and a date brush on the bar charts
nearest = alt.selection_single(encodings=['x', 'y'], on="mouseover", nearest=True, empty="none", clear="mouseout")
y_selection = alt.selection_interval(encodings=["y"], bind="scales", init={"y": rate_domain})
date_brush = alt.selection(type='interval', encodings=['x'])

# Additional Visual Elements
# Tooltip fields shared by every layer. `ci_below` is the lower and `ci_above`
# the upper end of the 95% interval, so the titles are labelled accordingly.
tooltip = [
    alt.Tooltip("source", title="Data source"),
    alt.Tooltip("country", title="Country"),
    alt.Tooltip("count", title="Daily Cases"),
    alt.Tooltip("adjusted_count", title="Adjusted Daily Cases"),
    alt.Tooltip("RATE_7_day_avg", title="Daily Case Rate, 7-day Average", format=".2f"),
    alt.Tooltip("date", title="Date"),
    alt.Tooltip("ci_below", title="95% CI lower bound", format=".2f"),
    alt.Tooltip("ci_above", title="95% CI upper bound", format=".2f")
]

# Thin red vertical rule at the date of the hovered point.
rule = (
    alt.Chart()
        .mark_rule(color="red", size=0.5)
        .encode(x="date:T")
        .transform_filter(nearest)
)

# 7-day average rate per country and source; solid strokes for 4CE, dashed for JHU CSSE.
line = (
    alt.Chart(norm_df)
        .transform_filter(source_selection)
        .mark_line(opacity=0.7)
        .encode(
            x=alt.X("date:T", title=None, axis=alt.Axis(labelBound=True), scale=alt.Scale(padding=5)),
            y=alt.Y(
                "RATE_7_day_avg:Q",
                axis=alt.Axis(title="Adjusted daily case rate, 7 day average"),
                scale=alt.Scale(zero=False, nice=False, domain=rate_domain, padding=5),
            ),
            strokeDash=alt.StrokeDash(
                "source:N",
                scale=alt.Scale(domain=["4CE", "JHU CSSE"], range=[[0,0], [3,3]]),
                legend=alt.Legend(title="Data Source"),
            ),
            color=alt.Color("country_source:N", scale=join_color_scale, legend=None),
            tooltip=tooltip
        )
        .properties(width=country_width, height=200)
)

# Confidence band, drawn for the 4CE series only.
errorband = (
    line.transform_filter(alt.datum["source"] == "4CE")
        .mark_errorband()
        .encode(
            x=alt.X("date:T", title=None, axis=alt.Axis(labelBound=True)),
            y=alt.Y("sum(ci_below):Q", title=""),
            y2=alt.Y2("sum(ci_above):Q", title=""),
            color=alt.Color("country:N", scale=color_scale, legend=alt.Legend(title=None)),
            tooltip=tooltip
        )
)

# Point markers on the lines; the hovered point is drawn larger.
circle = (
    line.mark_circle()
        .encode(size=alt.condition(~nearest, alt.value(5), alt.value(30)))
        .add_selection(nearest)
)

def _bar_layers(y_field, y_title, y_domain):
    """Build the two bar layers for one 4CE column.

    Returns a tuple of (grey background bars over the whole date range,
    country-coloured bars restricted to the brushed date range).
    """
    background = (
        alt.Chart(norm_fce_df)
            .mark_bar(size=2)
            .encode(
                x=alt.X("date:T", scale=alt.Scale(domain=date_domain, padding=5), title=None, axis=alt.Axis(labelBound=True)),
                y=alt.Y(y_field, axis=alt.Axis(title=y_title), scale=alt.Scale(domain=y_domain)),
                color=alt.value("gray"),
                tooltip=tooltip
            )
            .properties(width=country_width, height=60)
    )
    highlighted = (
        background
            .encode(
                color=alt.Color("country:N", scale=color_scale, legend=None),
            )
            .transform_filter(date_brush)
    )
    return background, highlighted


# Number of reporting sites per day.
num_sites_bar_bg, num_sites_bar = _bar_layers("num_sites:Q", "# of sites", sites_domain)

# Number of new cases per day.
num_patients_bar_bg, num_patients_bar = _bar_layers("count:Q", "# of new cases", patients_domain)

# Top row: rate lines, confidence band, hover markers and rule, one facet per
# country, restricted to the date range brushed on the bar charts.
top = (
    alt.layer(line, errorband, circle, rule, data=norm_df)
        .facet(
            column=alt.Column("country:N"), bounds="flush" #header=alt.Header(labels=False)
        )
        .add_selection(y_selection)
        .transform_filter(date_brush)
)

# Bottom row: number of sites per day, faceted by country (facet labels hidden); hosts the date brush.
num_sites_bottom = (
    alt.layer(num_sites_bar_bg, num_sites_bar, rule, data=norm_fce_df)
        .facet(
            column=alt.Column("country:N", header=alt.Header(labels=False)), bounds="flush"
        )
        .add_selection(nearest)
        .add_selection(date_brush)
)

# Middle row: number of new cases per day, faceted by country (facet labels hidden); hosts the date brush.
num_patients_bottom = (
    alt.layer(num_patients_bar_bg, num_patients_bar, rule, data=norm_fce_df)
        .facet(
            column=alt.Column("country:N", header=alt.Header(labels=False)), bounds="flush"
        )
        .add_selection(nearest)
        .add_selection(date_brush)
)

# Stack the three rows vertically; the colour scale is shared, the x scales are independent per row.
plot = (
    alt.vconcat(top, num_patients_bottom, num_sites_bottom, spacing=5)
        .resolve_scale(color="shared", x="independent")
        .properties(title={
                "text": title, 
                "subtitle": get_visualization_subtitle(data_release=DATA_RELEASE, cohort=COHORT, num_sites=NUM_SITES),
                "subtitleColor": "gray",
                "dx": 60
        })
        .add_selection(source_selection)
)


# Apply the shared project theme, then adjust the facet headers.
plot = apply_theme(
    plot, 
    axis_label_font_size=10, 
    axis_title_font_size=12, 
    axis_title_padding=8, 
    legend_orient="bottom", 
    legend_symbol_type="stroke",
    legend_title_orient="left",
    legend_title_font_size=14,
    label_font_size=12
).configure_header(title=None, labelPadding=3, labelFontSize=13)

# Hand the chart to the website helper (presumably exports it -- TODO confirm against web.for_website).
for_website(plot, "Daily Count", "country-level rate of positive cases")

plot

In [None]:
# Save the 4CE-only rows that feed the bar charts.
norm_fce_df.to_csv("norm_fce_df.csv")