In [125]:
from pathlib import Path
import pandas as pd
import os

from delphi_nhsn.constants import SIGNALS_MAP, PRELIM_SIGNALS_MAP
from delphi_epidata import Epidata
from epiweeks import Week
CSV_PATH = Path(os.path.abspath('')).parent.joinpath('receiving')
SIGNALS = list(SIGNALS_MAP.keys()) + list(PRELIM_SIGNALS_MAP.keys())

In [None]:
def get_nhsn_df(geo, pathogen, prelim=False):
    lst = []
    geo_filter = f'*{geo}'
    pathogen_filter = f'confirmed_admissions_{pathogen}.csv'
    prelim_filter = f'prelim_' if prelim else ""
    filter = f"{geo_filter}_{prelim_filter}{pathogen_filter}"
    for files in Path(CSV_PATH).glob(filter):
        filename_parts = files.name.split('_')
        signal = "_".join(filename_parts[3:]).replace('.csv', '')
        time_value = filename_parts[1]
        df = pd.read_csv(files)
        df['signal'] = signal
        df['time_value'] = time_value
        if geo == "nation":
            df["geo_id"] = "US"
        else:
            df["geo_id"] = df["geo_id"].str.upper()
        df.time_value = df.time_value.astype(str)
        lst.append(df)
    df = pd.concat(lst)
    df = df.drop(columns=['se','sample_size'], axis=1)
    return df

In [None]:
def get_hhs_df(geo, pathogen):
    response = Epidata.covidcast("hhs", f"confirmed_admissions_{pathogen}_1d",
                      geo_type=geo, time_values=Epidata.range("20200808", "20240426"),
                              geo_value="*", as_of=None, time_type="day")
    df = pd.DataFrame.from_dict(response["epidata"])
    df.rename(columns={"geo_value": "geo_id", "value": "val", "stderr": "se"}, inplace=True)
    df = df[["geo_id", "time_value", "val", "signal"]]
    df["time_value"] = pd.to_datetime(df["time_value"], format="%Y%m%d")
    # aggregating over 7 days matching epiweek for nhsn
    df["week_time"] = df["time_value"].apply(lambda x: Week.fromdate(x))
    df = df.groupby(["geo_id", "signal", "week_time"]).val.sum().reset_index()
    df["geo_id"] = df["geo_id"].str.upper()
    df.rename({"week_time": "time_value"}, axis=1, inplace=True)
    df.time_value = df.time_value.astype(str)
    return df


In [None]:
def generate_spearman(nhsn_df, hhs_df, slice_col):
    
    joined = pd.merge(nhsn_df, hhs_df, on=["geo_id", "time_value"], how="left", suffixes=["_nhsn", "_hhs"])
    
    spearmanr_df = joined.groupby(slice_col)[["val_nhsn", "val_hhs"]].corr("spearman").iloc[0::2,-1].reset_index()
    spearmanr_df.rename({"val_hhs": "correlation"}, axis=1, inplace=True)
    spearmanr_df = spearmanr_df[[slice_col, "correlation"]]
    spearmanr_df[slice_col] = spearmanr_df[slice_col].str.upper()
    return spearmanr_df
    

In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)

def plot_spearman_choropleth(nhsn_df, hhs_df, pathogen, download=False):
    spearmanr = generate_spearman(nhsn_df, hhs_df, "geo_id")

    fig = go.Figure(data=go.Choropleth(
        locations=spearmanr['geo_id'], # Spatial coordinates
        z = spearmanr['correlation'].astype(float), # Data to be color-coded
        locationmode = 'USA-states', # set of locations match entries in `locations`
        colorscale = 'Viridis',
        colorbar_title = "spearman rank",
    ))
    
    fig.update_layout(
        title_text = f'Spearman Correlations between HHS {pathogen} admission and NHSN {pathogen} admission',
        geo_scope='usa', # limite map scope to USA
    )
    
    if download:
        pio.write_image(fig, f'{pathogen}_spearman_choropleth.pdf', format='pdf')
    
    else:
        fig.show()


In [None]:
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
import plotly.io as pio

init_notebook_mode(connected=True)

def plot_timeplot(df, pathogen, y_col, state="", prelim_flag=False, download=False):
    if y_col == "correlation":
        title = f'Spearman Correlations between {pathogen} HHS and NHSN over time'
        filename = f'{pathogen}_spearman_timeplot.pdf'
    else:
        state_space = state + " " if state else ""
        title = f'{state_space}NHSN value over time'
        state = state + "_" if state else ""
        filename = f'{state}{pathogen}_nhsn_timeplot.pdf'
    
    if prelim_flag:
        filename += " (prelim data)"
        title += " (prelim data)"
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['time_value'], y=df[y_col], mode='lines', name='Time Series'))
    fig.update_layout(title=title)

    
    if download:
        pio.write_image(fig, filename, format='pdf')
    else:
        fig.show()

In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)

geo_type = "state"
pathogen = "covid"
nhsn_df = get_nhsn_df(geo_type, pathogen)

def plot_choropleth_overtime(df, pathogen, z_col):
    color_bar_title = ""
    layout_title = "" 
    if z_col == "val" :
        color_bar_title = f'Total Confirmed {pathogen} Cases'
        layout_title = f'NHSN total confirmed {pathogen} cases' 

    elif z_col == "diff_percentage":
        color_bar_title = "difference in percentage"
        layout_title = f'Proportion between NHSN and HHS for confirmed {pathogen} over time'
        
    df["time_value"] = pd.to_datetime(df.time_value.map(lambda x: str(x)+'-0'), format="%Y%W-%w")
    
    data_slider = []
    
    for time_value in df['time_value'].unique():
        df_segmented =  df[(df['time_value']== time_value)]
    
        data_each_yr = dict(
                            type='choropleth',
                            locations = df_segmented['geo_id'],
                            z=df_segmented[z_col].astype(float),
                            locationmode='USA-states',
                            colorscale = 'Viridis',
                            colorbar= {'title': color_bar_title},)
    
        data_slider.append(data_each_yr)
    
    steps = []
    for idx, time_value in enumerate(sorted(df['time_value'].unique())):
        step = {
                    "method":'restyle',
                    "args":[
                        {'visible': [False] * len(data_slider)}
                    ],
                    "label": f"Epiweek {Week.fromdate(pd.to_datetime(time_value))}"
                }
        step['args'][0]["visible"][idx] = True
        steps.append(step)
    
    sliders = [dict(active=0, pad={"t": 1}, steps=steps)]
    
    layout = dict(title =layout_title, geo=dict(scope='usa',
                           projection={'type': 'albers usa'}),
                  sliders=sliders)
    
    fig = go.Figure(data=data_slider, layout=layout)
    
    fig.show()


In [130]:
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
import plotly.io as pio

init_notebook_mode(connected=True)

def plot_compare_timeplot(nhsn_df, hhs_df, pathogen, geo_val="", download=False, val_cutoff=100, percent_cutoff=20):
    
    title = f'{geo_val} NHSN vs HHS value over time with (diff > {percent_cutoff}% and val > {val_cutoff}) markers'
    filename = f'{geo_val}_{pathogen}_nhsn_vs_hhs_timeplot.pdf'

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=nhsn_df['time_value'], y=nhsn_df["val"], mode='lines', name='NHSN Time Series'))
    fig.add_trace(go.Scatter(x=hhs_df['time_value'], y=hhs_df["val"], mode='lines', name='HHS Time Series'))
    
    joined = pd.merge(nhsn_df, hhs_df, on=["geo_id", "time_value"], how="left", suffixes=["_nhsn", "_hhs"])
    joined = joined[["geo_id", "time_value", "val_nhsn", "val_hhs"]]
    joined["diff_percentage"] = ((joined["val_nhsn"] - joined["val_hhs"]) / joined["val_hhs"]) * 100
    percent_filter = (abs(joined["diff_percentage"]) > percent_cutoff)
    num_filter = (joined['val_nhsn'] > val_cutoff)
    sus_data = joined[(percent_filter & num_filter)]
    sus_data.sort_values(by="time_value", inplace=True)
    
    fig.add_trace(go.Scatter(x=sus_data['time_value'], y=sus_data["val_nhsn"] , mode='markers', name=f'NHSN points', marker={"color":"green"}))
    fig.add_trace(go.Scatter(x=sus_data['time_value'], y=sus_data["val_hhs"] , mode='markers', name=f'HHS points', marker={"color":"green"}))

    fig.update_layout(title=title)
    
    if download:
        pio.write_image(fig, filename, format='pdf')
    else:
        fig.show()


# all analysis download

In [None]:
from itertools import product
geos = ["nation", "state", "hhs"]
hhs_pathogen_mapping = {"flu": "influenza", "covid": "covid"}
geo_pathogen_flag_combo = list(product(geos, hhs_pathogen_mapping.keys(), [True, False]))

for geo, pathogen, prelim_flag in geo_pathogen_flag_combo:
    nhsn_df = get_nhsn_df(geo, pathogen, prelim_flag)
    hhs_df = get_hhs_df(geo, hhs_pathogen_mapping[pathogen])
    spearmanr = generate_spearman(nhsn_df, hhs_df, "time_value")
    joined = pd.merge(nhsn_df, hhs_df, on=["geo_id", "time_value"], how="left", suffixes=["_nhsn", "_hhs"])
    joined = joined[["geo_id", "time_value", "val_nhsn", "val_hhs"]]
    joined["diff_percentage"] = ((joined["val_nhsn"] - joined["val_hhs"]) / joined["val_hhs"]) * 100
    
    plot_choropleth_overtime(joined, pathogen, z_col="diff_percentage")
    plot_timeplot(spearmanr, pathogen, y_col="correlation", download=True)
    plot_timeplot(nhsn_df, pathogen,y_col="val", download=True)
    plot_timeplot(joined, pathogen,y_col="diff_percentage", download=True)
    plot_spearman_choropleth(nhsn_df, hhs_df, "time_value", download=True)
    if geo != "nation":
        geos = list(nhsn_df.geo_id.unique())
        for geo_val in geos[:2]:
            part_nhsn_df = nhsn_df[nhsn_df["geo_id"] == geo_val].sort_values(by="time_value")
            part_hhs_df = hhs_df[hhs_df["geo_id"] == geo_val].sort_values(by="time_value")
            plot_compare_timeplot(part_nhsn_df, part_hhs_df, pathogen, geo_val=geo_val, download=True)
    else:
        plot_compare_timeplot(nhsn_df, hhs_df, pathogen, geo_val="us", download=True)

# Spearman over time example

In [None]:
from itertools import product

pathogen_mapping = {"flu": "influenza", "covid": "covid"}
pathogen = "covid"
prelim_flag = False
geo = "state"

nhsn_df = get_nhsn_df(geo, pathogen, prelim_flag)
hhs_df = get_hhs_df(geo, pathogen_mapping[pathogen])
spearmanr = generate_spearman(nhsn_df, hhs_df, "time_value")
plot_timeplot(spearmanr, pathogen, y_col="correlation", download=False)

# Spearman over geo

In [None]:
pathogen_mapping = {"flu": "influenza", "covid": "covid"}
pathogen = "covid"
prelim_flag = False
geo = "state"

nhsn_df = get_nhsn_df(geo, pathogen, prelim_flag)
hhs_df = get_hhs_df(geo, pathogen_mapping[pathogen])
plot_spearman_choropleth(nhsn_df, hhs_df, pathogen, download=False)

# comparison over time across geo

In [None]:
pathogen_mapping = {"flu": "influenza", "covid": "covid"}
pathogen = "covid"
prelim_flag = False
geo = "state"

nhsn_df = get_nhsn_df(geo, pathogen, prelim_flag)
hhs_df = get_hhs_df(geo, pathogen_mapping[pathogen])
joined = pd.merge(nhsn_df, hhs_df, on=["geo_id", "time_value"], how="left", suffixes=["_nhsn", "_hhs"])
joined = joined[["geo_id", "time_value", "val_nhsn", "val_hhs"]]
joined["diff_percentage"] = ((joined["val_nhsn"] - joined["val_hhs"]) / joined["val_hhs"]) * 100
plot_choropleth_overtime(joined, pathogen, z_col="diff_percentage")

# compare val over time

In [131]:
pathogen_mapping = {"flu": "influenza", "covid": "covid"}
pathogen = "covid"
prelim_flag = False
geo = "state"

nhsn_df = get_nhsn_df(geo, pathogen).sort_values(by="time_value")
hhs_df = get_hhs_df(geo, pathogen_mapping[pathogen]).sort_values(by="time_value")
geos = list(nhsn_df.geo_id.unique())
for geo_val in geos[:2]:
    state_nhsn_df = nhsn_df[nhsn_df["geo_id"] == geo_val].sort_values(by="time_value")
    state_hhs_df = hhs_df[hhs_df["geo_id"] == geo_val].sort_values(by="time_value")
    plot_compare_timeplot(state_nhsn_df, state_hhs_df, pathogen, geo_val=geo_val, download=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

