# COVID 19 Jupyter Dashboard
This notebook reads in COVID-19 data maintained by Johns Hopkins University and creates a dashboard.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import folium
import os,sys
import seaborn as sns
from folium.plugins import TimestampedGeoJson
import math
import matplotlib.pyplot as plt
from scipy import optimize
from scipy.optimize import curve_fit
pd.set_option('display.max_rows', 1000)

In [None]:
#### PATHS ####
# Prefix (including the trailing slash) prepended to the name of every file this notebook saves.
output='output/'

#### HELPER FUNCTIONS ####
def describe(df):
    """Return ``df.describe()`` extended with a final ``sum`` row of column totals."""
    stats_by_column = df.describe().T
    totals = df.sum().rename('sum')
    combined = pd.concat([stats_by_column, totals], axis=1)
    return combined.T
def indloc(df,field,values):
    """Keep the rows of ``df`` whose index level ``field`` holds one of ``values``."""
    level_values = df.index.get_level_values(field)
    keep = level_values.isin(values)
    return df[keep]



#### LOADING AND PRE-PROCESSING ####
def load_data(explore_data=False,ts_path=None):
    """Read the global deaths, recovered and confirmed time-series CSV files.

    Parameters
    ----------
    explore_data : bool, default False
        When True, show summary statistics and the first rows of the
        confirmed table (debugging aid).
    ts_path : str, optional
        Directory holding the three CSV files. Defaults to
        ``csse_covid_19_data/csse_covid_19_time_series`` under ``sys.path[0]``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_deaths, df_recovered, df_confirmed)`` -- in that order.
    """
    if ts_path is None:
        ts_path = os.path.join(sys.path[0],'csse_covid_19_data','csse_covid_19_time_series')

    df_deaths = pd.read_csv(os.path.join(ts_path,'time_series_covid19_deaths_global.csv'))
    df_recovered = pd.read_csv(os.path.join(ts_path,'time_series_covid19_recovered_global.csv'))
    df_confirmed = pd.read_csv(os.path.join(ts_path,'time_series_covid19_confirmed_global.csv'))

    # Explore data
    if explore_data:
        # NOTE: `display` is not imported in this file; it is expected to be
        # supplied by the notebook environment.
        print('Describe Confirmed')
        display(describe(df_confirmed))
        print('Head Confirmed')
        display(df_confirmed.head())
    return df_deaths,df_recovered,df_confirmed

def uncolonize(df):
    """Turn sub-regions listed under a "parent" region into regions of their own.

    A region counts as a parent when it has more than one row and at least one
    of those rows has no ``subregion``. Every ``subregion`` name that appears
    under such a parent is then promoted wherever it occurs in ``df``: its name
    is copied into ``region`` and its ``subregion`` is set to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``region`` and ``subregion`` columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    parents = df.groupby('region').filter(
        lambda grp: len(grp)>1 and bool(grp['subregion'].isna().any()))

    # Drop NaN explicitly: Series.isin would otherwise match the NaN rows too
    # and blank out the region name of every region-level row.
    colonized = parents['subregion'].dropna().unique()

    promote = df['subregion'].isin(colonized)
    df.loc[promote,'region'] = df.loc[promote,'subregion']
    df.loc[promote,'subregion'] = np.nan

    return df


def format_input(df,name):
    """Reshape one wide time-series table into long format.

    The identifier columns are renamed to ``subregion``, ``region``, ``lat``
    and ``long``; sub-regions are promoted via ``uncolonize``; rows without a
    sub-region get ``<region>_country`` as placeholder; every remaining column
    is stacked into a ``dates`` column (parsed as ``%m/%d/%y``) with its value
    stored under ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with ``Province/State``, ``Country/Region``, ``Lat``,
        ``Long`` and one column per date. The rename, ``uncolonize`` and the
        placeholder assignment all act on this object, so the caller's frame
        is modified.
    name : str
        Column name for the stacked values.

    Returns
    -------
    pandas.DataFrame
        Columns ``subregion``, ``region``, ``lat``, ``long``, ``dates``, ``name``.
    """
    df.rename(columns={'Province/State':'subregion','Country/Region':'region','Lat':'lat','Long':'long'},inplace=True)
    df = uncolonize(df)
    # Assign the result instead of calling fillna(inplace=True) on the column
    # selection: the chained in-place form is deprecated and does not update
    # the frame when pandas Copy-on-Write is active.
    df['subregion'] = df['subregion'].fillna(df['region']+'_country')

    id_cols = ['subregion','region','lat','long']
    long_form = df.set_index(id_cols).stack()
    long_form.index.names = id_cols+['dates']
    long_form.name = name
    long_form = long_form.reset_index()
    long_form['dates'] = pd.to_datetime(long_form['dates'],format='%m/%d/%y')

    return long_form


def combine_data(df_deaths,df_confirmed,df_recovered):
    """Merge the three raw tables into one long table.

    Each input is passed through ``format_input`` and outer-joined on
    subregion/region/lat/long/dates; the result is then summed per join key,
    giving ``death``, ``confirmed`` and ``recovered`` columns.
    """
    join_cols = ['subregion','region','lat','long','dates']

    merged = format_input(df_deaths,'death')
    for raw_frame,label in ((df_confirmed,'confirmed'),(df_recovered,'recovered')):
        merged = merged.merge(format_input(raw_frame,label),on=join_cols,how='outer')

    return merged.groupby(join_cols).sum().reset_index()

def augment_dataset(df, level):
    """Add derived columns to a long-format case table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``confirmed``, ``death``, ``recovered``, ``dates`` and the
        ``level`` column. ``active``, ``days`` and ``days_since_200`` are
        written onto this object; the later merges return a new frame.
    level : str
        Column identifying one time series (``'subregion'`` or ``'region'``
        in this notebook).

    Returns
    -------
    pandas.DataFrame
        Input columns plus ``active``, ``days``, ``days_since_200``,
        ``confirmed_delta``, ``death_delta``, ``recovered_delta``,
        ``death_pc`` and the columns produced by ``doubling_rate``.
    """
    # Add # active cases
    df['active'] = df['confirmed'] - df['death'] - df['recovered']

    # Add day count column: days elapsed since each group's first date.
    # transform() keeps df's own index, so the assignment lines up whatever
    # index df carries.
    first_date = df.groupby(level)['dates'].transform('min')
    df['days'] = (df['dates']-first_date).dt.days.fillna(0).astype(int)

    # Add days since 200th case: days relative to the first date on which the
    # group exceeded 200 confirmed cases (negative before that date). Groups
    # that never exceed 200 have no such date and get 0 on every row.
    first_over_200 = df['dates'].where(df['confirmed']>200).groupby(df[level]).transform('min')
    df['days_since_200'] = (df['dates']-first_over_200).dt.days.fillna(0).astype(int)

    # Add deltas: day-over-day change of the per-group, per-date totals
    columns = ['confirmed','death','recovered']
    df_deltas = df.groupby([level,'dates'])[columns].sum()
    df_deltas = df_deltas.groupby(level)[columns].diff()
    df_deltas.rename(columns={'confirmed':'confirmed_delta',
                              'death':'death_delta',
                              'recovered':'recovered_delta'}
                     ,inplace=True)
    df = df.merge(df_deltas, how='left',on=[level,'dates'])

    # Add % Death
    df['death_pc'] = round(df['death'] /df['confirmed']*100,2)

    # Add doubling rate
    df = df.merge(doubling_rate(df,level),how='left',on=[level,'days'])

    return df

def material_countries(df,additional,n=6):
    """Restrict ``df`` to the regions with the largest confirmed counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``region`` and ``confirmed`` columns.
    additional : iterable of str
        Region names to keep regardless of their rank.
    n : int, default 6
        Number of top regions (ranked by their maximum ``confirmed``) to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose region is in the top ``n`` or in ``additional``.
    """
    # Only the 'confirmed' column is needed for the ranking.
    peak_confirmed = df.groupby('region')['confirmed'].max()
    selected = list(peak_confirmed.nlargest(n).index) + list(additional)

    return df[df['region'].isin(selected)]

def latest_data(df):
    """Return only the rows of ``df`` that carry the most recent value of ``dates``."""
    most_recent = df['dates'].max()
    is_latest = df['dates']==most_recent
    return df[is_latest]


#### EXPONENTIAL FITTING ####
def line(x,a,b):
    """Straight line with slope ``a`` and intercept ``b``, evaluated at ``x``."""
    slope, intercept = a, b
    return slope*x+intercept

def get_doubling_rate(x):
    """Convert a fitted slope of log(cases) versus days into a doubling time.

    ``x[0]`` is the slope ``a`` of the fit ``log(y) = a*x + b``. Matching the
    coefficient of x in ``log[d * 2^(x/c - f/c)] = a*x + b`` gives
    ``c = log(2)/a``. A slope within 1e-9 of zero yields NaN.
    """
    tolerance = 0.000000001
    slope = x[0]

    if abs(slope) <= tolerance:
        return np.nan

    return np.log(2)/slope

def doubling_rate(df,agg_level,lookback=1000,sample_size=15):
    """Estimate a rolling doubling time of ``confirmed`` for every group.

    For each group and each day (walking backwards from the group's last day)
    a straight line is fitted to ``log(confirmed)`` over the days in
    ``[day - sample_size, day]``; the slope is converted to a doubling time
    with ``get_doubling_rate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``agg_level``, ``days`` and ``confirmed`` columns.
    agg_level : str
        Column to group by.
    lookback : int, default 1000
        Maximum number of days to walk back from each group's last day.
    sample_size : int, default 15
        Width of the fitting window in days; also the minimum number of
        observations a window must contain.

    Returns
    -------
    pandas.DataFrame
        Columns ``agg_level``, ``days``, ``doubling_rate`` (rounded to two
        decimals) and ``best_fit`` (the fitted ``[slope, intercept]``). The
        columns are present even when no window could be fitted.
    """
    result_columns = [agg_level,'days','doubling_rate','best_fit']
    double_rate=[]

    for index,group in df.groupby(agg_level)[['days','confirmed']]:
        # Loop invariants: pull the arrays out once per group.
        days = group['days'].values
        confirmed = group['confirmed'].values
        maxdays = group['days'].max()

        for day in range(maxdays,maxdays-lookback,-1):

            if(day<sample_size):
                break

            window = (days<=day) & (days>=day-sample_size)

            if(window.sum()<sample_size):
                break

            y_data = confirmed[window]

            # log() is undefined for non-positive counts; stop walking back
            # once the window reaches them.
            if(np.any(y_data<=0)):
                break

            popt, pcov = curve_fit(line, days[window], np.log(y_data),p0=[1.5,0])
            rate = round(get_doubling_rate(popt),2)
            double_rate.append({agg_level:index,'days':day,'doubling_rate':rate,'best_fit':popt})

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return(pd.DataFrame(double_rate,columns=result_columns))



#### INTERACTIVE MAPS ####

def color_producer(rate,scale=1):
    """Map a doubling rate onto an 11-step colour scale.

    Parameters
    ----------
    rate : float
        Doubling rate; NaN is allowed.
    scale : float, default 1
        Number of colour steps per unit of ``rate``.

    Returns
    -------
    str
        ``'grey'`` for NaN; otherwise ``color_scale[k]`` where ``k`` is
        ``rate*scale`` truncated to an integer and clamped to 0..10.
        Rates strictly between 0 and ``1/scale`` fall into bucket 0; no
        branch covered them before, so they produced an empty string.
    """
    color_scale = ('#67001f','#b2182b','#d6604d','#f4a582','#fddbc7','#f7f7f7',
                   '#d1e5f0','#92c5de','#4393c3','#2166ac','#053061')

    if np.isnan(rate):
        return 'grey'

    scaled = rate*scale
    last = len(color_scale)-1

    # Clamp before truncating so that +/-inf never reaches int().
    if scaled <= 0:
        return color_scale[0]
    if scaled >= last:
        return color_scale[last]
    return color_scale[int(scaled)]


def create_geojson_features(df,level):
    """Build one GeoJSON point feature per row of ``df``.

    Each feature is placed at (``long``, ``lat``), time-stamped with ``dates``,
    coloured by ``color_producer(doubling_rate)`` and sized by
    ``log(confirmed + 1)``. ``level`` names the column shown as the popup title.
    """
    print('> Creating GeoJSON features...')
    popup_template = '{} <br> cases: {} <br> death: {} ({}%)  <br> recovery: {} <br> days2double: {}'

    def as_feature(row):
        marker_color = color_producer(row['doubling_rate'])
        geometry = {
            'type':'Point',
            'coordinates':[row['long'],row['lat']]
        }
        popup_text = popup_template.format(row[level],
                                           row['confirmed'],
                                           row['death'],
                                           row['death_pc'],
                                           row['recovered'],
                                           row['doubling_rate'])
        iconstyle = {
            'fillColor': marker_color,
            'fillOpacity': 0.8,
            'stroke': 'false',
            'radius': float(np.log(row['confirmed']+1))
        }
        return {
            'type': 'Feature',
            'geometry': geometry,
            'properties': {
                'time': str(row['dates']),
                'style': {'color' : marker_color},
                'icon': 'circle',
                'popup': popup_text,
                'iconstyle': iconstyle
            }
        }

    return [as_feature(row) for _, row in df.iterrows()]

def make_map(features):
    """Create a world map and attach ``features`` as a time-stamped GeoJSON layer.

    ``features`` is the list of GeoJSON feature dicts to animate. Returns the
    ``folium.Map`` carrying the layer.
    """
    print('> Making map...')
    my_map = folium.Map(location=[20,0],height='100%', control_scale=True,  zoom_start=1.6)

    collection = {'type': 'FeatureCollection',
                  'features': features}
    # transition_time is deliberately not passed.
    playback_options = dict(
        period='P1D',
        add_last_point=True,
        auto_play=False,
        loop=False,
        duration='P1D',
        max_speed=50,
        loop_button=True,
        date_options='YYYY/MM/DD',
        time_slider_drag_update=True,
    )
    layer = TimestampedGeoJson(collection, **playback_options)
    layer.add_to(my_map)

    print('> Done.')
    return my_map


#### PLOTS ####
def format_for_plot(df,level):
    """Pivot ``df`` for plotting: rows with ``days_since_200 >= 0`` only,
    indexed by ``days_since_200`` with one column per value of ``level``."""
    not_before_pivot = df['days_since_200']>=0
    indexed = df[not_before_pivot].set_index(['days_since_200',level])
    return indexed.unstack(level)
    

In [None]:
# Load Data
# NOTE: load_data returns (deaths, recovered, confirmed) while combine_data
# takes (deaths, confirmed, recovered) -- the argument order differs on purpose.
df_deaths,df_recovered,df_confirmed = load_data()
df_all = combine_data(df_deaths,df_confirmed,df_recovered)
df_all = augment_dataset(df_all,'subregion')

# Region level view: counts are summed over sub-regions, coordinates averaged.
# Aggregations are given by name; passing the builtin sum / np.mean callables
# to agg() is deprecated in recent pandas.
region_aggregations = {'death':'sum','recovered':'sum','confirmed':'sum','lat':'mean','long':'mean'}
df_region = df_all.groupby(['region','dates']).agg(region_aggregations).reset_index()
df_region = augment_dataset(df_region,'region')

# Material countries only
df_all_material = material_countries(df_all,['Canada','Argentina'])
df_region_material = material_countries(df_region,['Canada','Argentina'])

# Today's data only
df_all_today = latest_data(df_all)
df_region_today = latest_data(df_region)


In [None]:
# Quick sanity check on best fits
fig,ax = plt.subplots(figsize=(15,10));

df = df_region_material.set_index(['days','region']).unstack('region')
colors = ['b','g','r','c','m','y','k','orange']
np.log(df[df['confirmed']>0]['confirmed']).plot(ax=ax,style=colors)

# On every 20th day, evaluate each region's fitted line over the 16 days
# ending on that day so it can be overlaid on the observed log(confirmed).
dat=[]
for index,row in df.iterrows():
    if(index%20 != 0):
        continue
    # Series.iteritems() was removed in pandas 2.0; items() is the replacement.
    for name,v in row['best_fit'].items():
        # A real fit is a 1-d [slope, intercept] array; a missing fit is a
        # scalar placeholder and is skipped.
        if(np.ndim(v) == 0):
            continue
        dat.extend([name,i,v[0]*i+v[1]] for i in range(index-15,index+1))

df_calibrate = pd.DataFrame(dat,columns=['region','days','value'])
df_calibrate = df_calibrate.set_index(['days','region']).unstack('region')['value']
df_calibrate.plot(ax=ax,linestyle='dashed',style=colors);

ax.set_yscale('log')
ax.set_xlim(30,64)
ax.set_ylim(2,15)


plt.grid(which='both')
# savefig does not create missing directories.
os.makedirs(output,exist_ok=True)
plt.savefig(output+'DoublingRate.Calibration.pdf')
plt.close()

In [None]:
# Pre-process
df_unstacked = format_for_plot(df_region_material,'region')
# The x-axis stops 20 days before the largest days_since_200 value.
xmax = int(df_unstacked.index.max()-20)

# Reference curve starting at 200 cases; 1.26 is approximately 2**(1/3),
# i.e. growth that doubles every 3 days.
df_ref = pd.DataFrame({'days_since_200':list(range(xmax+1))})
df_ref['Doubles every 3 days'] = 200*1.26**df_ref['days_since_200']
df_ref.set_index('days_since_200',inplace=True)

df_ref.index.name = ''
df_unstacked.index.name=''

# Global properties
fig, ax = plt.subplots(4, 1, sharex=True,figsize=(15,20))
# Invisible full-figure axes used only to carry the shared x label.
fig.add_subplot(111, frameon=False)
# Booleans, not 'off': a non-empty string is truthy and would leave the ticks on.
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
plt.grid(False)
# Raw string so the backslash in \mathrm reaches mathtext without an
# invalid-escape warning.
plt.xlabel(r'# days since 200$^{\mathrm{th}}$ case')

# Total confirmed cases
df_unstacked['confirmed'].plot(ax=ax[0])
df_ref.plot(ax=ax[0])

ax[0].set_xlim(0,xmax)
ax[0].set_ylim(200,1000000)
ax[0].set_ylabel('# cases')
ax[0].set_yscale('log')
ax[0].grid(which='major')

# Daily confirmed deltas
df_unstacked['confirmed_delta'].plot(ax=ax[1])
ax[1].set_xlim(0,xmax)
ax[1].set_ylim(10,50000)
ax[1].set_ylabel('# new cases')
ax[1].set_yscale('log')
ax[1].grid(which='major')


# Active Cases
df_unstacked['active'].plot(ax=ax[2])
ax[2].set_xlim(0,xmax)
ax[2].set_ylim(100,200000)
ax[2].set_ylabel('# active cases')
ax[2].set_yscale('log')
ax[2].grid(which='major')


# Doubling rate (linear y-axis, unlike the panels above)
df_unstacked['doubling_rate'].plot(ax=ax[3])
ax[3].set_xlim(0,xmax)
ax[3].set_ylim(0,10)
ax[3].set_ylabel('# days until cases double')
ax[3].grid(which='major')


# savefig does not create missing directories.
os.makedirs(output,exist_ok=True)
plt.savefig(output+'Covid19.figs.pdf')
#plt.close()
plt.show()

In [None]:
#Movies

# Sub-region map: skip the rows whose subregion carries the '_country' placeholder
is_placeholder = df_all['subregion'].str.contains('_country')
df = df_all[~is_placeholder]
features = create_geojson_features(df,'subregion')
m = make_map(features)
m.save(output+'covid19_cities.html')

# Region map
df = df_region
features = create_geojson_features(df,'region')
m = make_map(features)
m.save(output+'covid19_countries.html')

