In [363]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

%matplotlib inline
import plotly.graph_objects as go

# Linear Regression

from sklearn import linear_model
from scipy import signal


In [364]:
# Input locations: the raw confirmed-cases time series CSV and the folder
# holding the processed tables.
dataPath_Raw = "../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
dataPath_Processed = "../data/processed/"

# Display defaults: show up to 500 rows of a DataFrame (set once), large
# default matplotlib figures.
pd.set_option('display.max_rows', 500)
mpl.rcParams['figure.figsize'] = (20, 16)

### Helper Functions

In [365]:
def quick_plot(x_in, df_input, yscale='log', slider=False):
    '''Draw every column of a DataFrame as a line trace in one Plotly figure.

    Parameters
    ----------
    x_in : array-like
        Values used on the x axis of every trace.
    df_input : pandas.DataFrame
        One trace is added per column; the column label becomes the
        legend name.
    yscale : str, default 'log'
        Passed to the y axis as its ``type`` (the notebook uses 'log'
        and 'linear').
    slider : bool, default False
        When truthy, a range slider is shown on the x axis.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    '''
    fig = go.Figure()

    for column in df_input.columns:
        fig.add_trace(go.Scatter(x=x_in,
                                 y=df_input[column],
                                 mode='lines',
                                 marker_size=3,
                                 name=column))

    fig.update_layout(autosize=True,
                      width=1024,
                      height=780,
                      # Font families are a comma-separated preference list;
                      # the original used a period, which named a single
                      # non-existent font.
                      font=dict(family='PT Sans, monospace',
                                size=18,
                                color='#7f7f7f'))
    fig.update_yaxes(type=yscale)

    fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14, color='#7f7f7f'))

    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)

    fig.show()

In [366]:
# Read the processed flat table; column 0 is parsed as dates.
df_analyse = pd.read_csv(
    dataPath_Processed + "COVID_Flat_Table.csv",
    sep=",",
    parse_dates=[0],
)

# Show the last rows in chronological order as a quick sanity check.
df_analyse.sort_values('date', ascending=True).tail()



Unnamed: 0,date,Italy,US,Spain,Germany,India,"Korea, South",China,Brazil
136,2020-06-06,234801,1925765,241310,185450,246622,11776,84186,672846
137,2020-06-07,234998,1943882,241550,185750,257486,11814,84191,691758
138,2020-06-08,235278,1961428,241717,186109,265928,11852,84195,707412
139,2020-06-09,235561,1979489,241966,186506,276146,11902,84198,739503
140,2020-06-10,235763,2000464,242280,186522,276583,11947,84209,772416


In [367]:
# Disabled cell: the list is wrapped in a string literal, so running it only
# echoes the text and does NOT define `country_list`.
'''country_list = [
    'Italy',
    'US',
    'Spain',
    'Germany',
    'India',
    'Korea, South',
    'China',
    'Brazil'
]'''

"country_list = [\n    'Italy',\n    'US',\n    'Spain',\n    'Germany',\n    'India',\n    'Korea, South',\n    'China',\n    'Brazil'\n]"

## Understanding Linear Regression

In [368]:
# Linear regression model with an intercept term. This single `reg` object is
# re-fitted by several later cells of this notebook.
reg = linear_model.LinearRegression(fit_intercept=True)

In [369]:
# Regressor: plain row index 0..l_vec-1 as a column vector.
# Target: Germany's case counts.
l_vec = df_analyse['Germany'].shape[0]
x = np.arange(l_vec)[:, np.newaxis]
y = df_analyse['Germany'].to_numpy(copy=True)

In [370]:
# Fit the shared `reg` model on the row index x and the Germany counts y.
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [371]:
# Evaluate the fitted line on the same row index that was used for fitting.
x_hat = np.arange(l_vec)[:, np.newaxis]
y_hat = reg.predict(x_hat)

In [372]:
# Copy the two columns of interest so that columns added to LR_inspect
# afterwards do not end up in df_analyse.
LR_inspect = df_analyse[['date', 'Germany']].copy()

In [373]:
# Attach the fitted values returned by reg.predict as a new column.
LR_inspect['prediction'] = y_hat

In [374]:
# Plot every column after 'date' (data and fitted line) on a log axis.
quick_plot(
    LR_inspect['date'],
    LR_inspect.iloc[:, 1:],
    yscale='log',
    slider=True,
)

In [376]:
# Log-linear fit: regress log(cases) on the row index, so a straight line in
# log space corresponds to exponential growth on the original scale.

# First row used for fitting. The first rows of the Germany column are 0 in
# the table displayed in this notebook; presumably they are skipped because
# log(0) is -inf.
fit_start = 5

l_vec = len(df_analyse['Germany'])

# Use the real row positions fit_start..l_vec-1 as regressor. The original
# used 0..l_vec-fit_start-1 here, which shifted the fitted curve by
# `fit_start` rows once it was predicted on 0..l_vec-1 below.
x = np.arange(fit_start, l_vec).reshape(-1, 1)
y = np.log(np.array(df_analyse['Germany'][fit_start:]))

reg.fit(x, y)

# Predict for every row, so element i of y_hat belongs to row i of the table.
x_hat = np.arange(l_vec).reshape(-1, 1)
y_hat = reg.predict(x_hat)

LR_inspect = df_analyse[['date', 'Germany']].copy()

# Back-transform from log space to case counts.
LR_inspect['prediction'] = np.exp(y_hat)

quick_plot(LR_inspect.date, LR_inspect.iloc[:, 1:], yscale='log', slider=True)

### Doubling Rate - Piecewise Linear Regression

In [377]:
# Start again from a fresh copy of the processed flat table.
df_analyse = pd.read_csv(
    dataPath_Processed + "COVID_Flat_Table.csv",
    sep=",",
    parse_dates=[0],
)

# Every column after the first one is treated as a country.
country_list = df_analyse.columns[1:]

In [378]:

## filter data
# Smooth each country's series with a Savitzky-Golay filter and store the
# result next to the raw data as '<country>_filter'.
for country in country_list:
    df_analyse[f'{country}_filter'] = signal.savgol_filter(
        df_analyse[country],
        window_length=5,  # window size used for filtering
        polyorder=1,      # order of fitted polynomial
    )

In [379]:
# Smoothed columns of the four countries used in the plots below.
filter_cols = [f'{name}_filter' for name in ('Italy', 'US', 'Spain', 'Germany')]

In [381]:
# Plot the smoothed series from row `start_pos` onwards on a log axis.
start_pos = 5
quick_plot(
    df_analyse['date'].iloc[start_pos:],
    df_analyse[filter_cols].iloc[start_pos:, :],
    yscale='log',
    slider=True,
)

In [382]:

# First rows of the table, now including the '<country>_filter' columns.
df_analyse.head()

Unnamed: 0,date,Italy,US,Spain,Germany,India,"Korea, South",China,Brazil,Italy_filter,US_filter,Spain_filter,Germany_filter,India_filter,"Korea, South_filter",China_filter,Brazil_filter
0,2020-01-22,0,1,0,0,0,1,548,0,0.0,0.4,0.0,0.0,0.0,0.8,355.0,0.0
1,2020-01-23,0,1,0,0,0,1,643,0,0.0,1.3,0.0,0.0,0.0,1.3,736.7,0.0
2,2020-01-24,0,2,0,0,0,2,920,0,0.0,2.2,0.0,0.0,0.0,1.8,1118.4,0.0
3,2020-01-25,0,2,0,0,0,2,1406,0,0.0,3.0,0.0,0.2,0.0,2.4,1584.2,0.0
4,2020-01-26,0,5,0,0,0,3,2075,0,0.0,3.8,0.0,1.0,0.0,3.0,2557.4,0.0


In [383]:
def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate.

    A straight line is fitted by least squares through three consecutive
    values placed at x = -1, 0, 1. The result is intercept / slope: the
    fitted value at the centre of the window divided by the fitted change
    per step.

    Parameters
    ----------
    in_array : array-like of length 3
        Three consecutive observations (list, numpy array or pandas Series).

    Returns
    -------
    float
        intercept / slope as a scalar. A flat window (slope 0) yields
        inf, or nan when the values are all zero, as floating-point
        division does.

    Raises
    ------
    AssertionError
        If the window does not contain exactly three values.
    '''
    # Validate before touching the data.
    assert len(in_array)==3

    y = np.asarray(in_array, dtype=float)
    x = np.arange(-1, 2)

    # Closed-form least-squares solution. Because x has mean 0, the intercept
    # is the mean of y and the slope is sum(x*(y-mean))/sum(x*x). Computing it
    # here avoids re-fitting the notebook-wide `reg` model as a side effect,
    # and returns a scalar instead of the 1-element array that `reg.coef_`
    # produced.
    intercept = y.mean()
    slope = np.dot(x, y - intercept) / np.dot(x, x)

    return intercept/slope

In [384]:
def doubling_time(in_array):
    ''' Use the classical doubling time formula,
     see https://en.wikipedia.org/wiki/Doubling_time

    T_d = elapsed * ln(2) / ln(y_last / y_first)

    Parameters
    ----------
    in_array : array-like
        Consecutive, equally spaced observations; only the first and the
        last value enter the formula.

    Returns
    -------
    float
        Doubling time in units of the sample spacing. The value is inf or
        nan where the ratio y_last / y_first makes the formula undefined
        (e.g. no growth, or a zero first value).
    '''
    y = np.array(in_array)
    # n samples span n-1 steps; using len(y) here overstated the elapsed time
    # (and therefore the doubling time) by one step.
    elapsed = len(y) - 1
    return elapsed*np.log(2)/np.log(y[-1]/y[0])

In [386]:
# Rolling window length. It must stay 3 here because
# get_doubling_time_via_regression asserts a window of exactly three values.
days_back = 3 # this gives a smoothing effect

# One '<country>_DR' column per country from the rolling regression estimate.
for country in country_list:
    df_analyse[country+'_DR'] = (
        df_analyse[country]
        .rolling(window=days_back, min_periods=days_back)
        .apply(get_doubling_time_via_regression, raw=False)
    )

In [387]:
# Rolling window length. It must stay 3 here because
# get_doubling_time_via_regression asserts a window of exactly three values.
days_back = 3 # this gives a smoothing effect

# Same estimate on the smoothed series: one '<country>_filter_DR' column per
# entry of filter_cols.
for filtered_col in filter_cols:
    df_analyse[filtered_col+'_DR'] = (
        df_analyse[filtered_col]
        .rolling(window=days_back, min_periods=days_back)
        .apply(get_doubling_time_via_regression, raw=False)
    )

In [388]:
# Doubling time for Germany from the closed-form formula, computed over the
# same rolling window as the regression-based columns.
df_analyse['Germany_DR_math'] = (
    df_analyse['Germany']
    .rolling(window=days_back, min_periods=days_back)
    .apply(doubling_time, raw=False)
)

In [389]:
# NOTE(review): this cell repeats the computation of the earlier
# `filter_cols` doubling-rate cell and overwrites the same '<col>_DR' columns
# with the same rolling regression. It looks redundant; confirm and remove.
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [390]:
# Inspect the column labels after the filter and doubling-rate steps.
df_analyse.columns

Index(['date', 'Italy', 'US', 'Spain', 'Germany', 'India', 'Korea, South',
       'China', 'Brazil', 'Italy_filter', 'US_filter', 'Spain_filter',
       'Germany_filter', 'India_filter', 'Korea, South_filter', 'China_filter',
       'Brazil_filter', 'Italy_DR', 'US_DR', 'Spain_DR', 'Germany_DR',
       'India_DR', 'Korea, South_DR', 'China_DR', 'Brazil_DR',
       'Italy_filter_DR', 'US_filter_DR', 'Spain_filter_DR',
       'Germany_filter_DR', 'Germany_DR_math'],
      dtype='object')

In [397]:
start_pos = 40

# Select the doubling-rate columns of the raw series by name rather than by
# hard-coded position, so the plot survives added or reordered columns.
dr_cols = [f'{country}_DR' for country in country_list]

quick_plot(df_analyse.date[start_pos:],
           df_analyse[dr_cols].iloc[start_pos:, :],
           yscale='linear',
           slider=True)

In [400]:

start_pos = 40

# Select the doubling-rate columns of the smoothed series by name rather than
# by hard-coded position, so the plot survives added or reordered columns.
filter_dr_cols = [f'{filtered_col}_DR' for filtered_col in filter_cols]

quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_dr_cols].iloc[start_pos:, :],
           yscale='linear',
           slider=True)