In [2]:
import sys
sys.path.append("../")

In [15]:
import load
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd

%matplotlib inline

from bokeh.io import output_notebook, export_svgs
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, ColumnDataSource
output_notebook()


# New Cases Data

In [4]:
cases_df = load.load_confirmed_by_region("../data/usa/data_cases.csv")

In [220]:
def compute_rolling_mean_minus_var(cases_df, days=7):
    """Returns rolling mean - rolling var per county.
        Aligned by days since first case
    """
    cases = []

    for region in cases_df.columns:
        cases.append(cases_df[region].replace(0, np.nan).dropna().values)
    
    region_to_cases = {
        region: pd.Series(cases[i]) for i, region in enumerate(cases_df.columns)
    }
    cases_since_start_df = pd.DataFrame(region_to_cases)
    new_cases_since_start_df = cases_since_start_df.diff().iloc[1:,:]
    rolling_df = new_cases_since_start_df.rolling(days).mean() - new_cases_since_start_df.rolling(days).var()
    # drop first 7 days for which there is no rolling mean or var
    rolling_df = rolling_df.iloc[days:, :]
    rolling_df = rolling_df.reset_index(drop=True)
    return rolling_df
    
def average_by_state(rolling_df):
    """average rolling mean - rolling var by state
    Return new dataframe with states and day columns
    """
    states = [region.split(",")[1].strip() for region in rolling_df.columns]
    states_df = rolling_df.copy().T
    states_df["region"] = states
    states_df = states_df.groupby("region").mean()
    states_df = states_df.stack().reset_index()
    states_df = states_df.rename(columns={
        states_df.columns[1]: "Days (since first case)", 
        states_df.columns[2]: "7-day Rolling(Mean) - Rolling(Var)", 
    })
    return states_df

In [221]:
rolling_df = compute_rolling_mean_minus_var(cases_df)

# State Average: Rolling(Mean) - Rolling(Var) 

In [222]:
states_df = average_by_state(rolling_df)

In [251]:
#fig = px.line(states_df, 
#             x="Days (since first case)",
#             y="7-day Rolling(Mean) - Rolling(Var)",
#             color="state")

#fig.update_layout(title_text="Average State Dispersion")
#
#fig.show()
def plot_overdispersion(df, title, width=300, height=550):
    p = figure(
        plot_height=height, 
        plot_width=width, 
        title=title, 
        tools="save,hover",
        y_range=df['region'].unique(),
        #x_axis_label='Days since first confirmed case', 
        x_axis_label='7-day Var(x) - Mean(x)',
        x_axis_type='log',
    )
    p.circle(y='region', x='y', size=5, alpha=0.4, source=ColumnDataSource(df))
    #p.xaxis.major_label_orientation = np.pi/3
    p.output_backend = 'svg'
    return p
states_df = states_df.loc[states_df.iloc[:, 2] < -100]
states_df['y'] = np.abs(states_df.iloc[:, 2].values)
p = plot_overdispersion(states_df, 'Overdispersion - US States')
show(p)

# Top 20 Counties (by cumulative cases)

In [226]:
def stack_counties(rolling_df):
    counties_df = rolling_df.stack().reset_index()
    counties_df = counties_df.rename(columns={
        counties_df.columns[0]: "Days (since first case)", 
        counties_df.columns[1]: "region", 
        counties_df.columns[2]: "7-day Rolling(Mean) - Rolling(Var)", 
    })
    return counties_df

In [249]:
top_20_counties = list(cases_df.iloc[-1, :].sort_values().index[-len(states_df['region'].unique()):])
top_20_rolling_df = compute_rolling_mean_minus_var(cases_df[top_20_counties])
top_20_counties_df = stack_counties(top_20_rolling_df)

In [252]:
#fig = px.line(top_20_counties_df, 
#             x="Days (since first case)",
#             y="7-day Rolling(Mean) - Rolling(Var)",
#             color="county")
#
#fig.update_layout(title_text="Dispersion: Top 20 Counties (by cumulative cases)")
#
#fig.show()
counties_df = top_20_counties_df.loc[top_20_counties_df.iloc[:, 2] < -100]
counties_df['y'] = np.abs(counties_df.iloc[:, 2].values)
p = plot_overdispersion(counties_df, 'Overdispersion - US Counties', width=400)
show(p)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Median 20 Counties (by cumulative cases)

In [13]:
median = len(cases_df.columns) // 2
median_20_counties = list(cases_df.iloc[-1, :].sort_values().index[median - 10: median + 10])
median_20_rolling_df = compute_rolling_mean_minus_var(cases_df[median_20_counties])
median_20_counties_df = stack_counties(median_20_rolling_df)

In [14]:
fig = px.line(median_20_counties_df, 
             x="Days (since first case)",
             y="7-day Rolling(Mean) - Rolling(Var)",
             color="county")

fig.update_layout(title_text="Dispersion: Median 20 Counties (by cumulative cases)")

fig.show()