In [133]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

%matplotlib inline
import plotly.graph_objects as go

# Linear Regression

from sklearn import linear_model
from scipy import signal


In [134]:
# --- Paths (relative to the notebook's working directory) -----------------
# Raw time series of confirmed global cases.
dataPath_Raw = "../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
# Directory with the processed tables. Keep the trailing slash: file names
# are appended to it by plain string concatenation.
dataPath_Processed = "../data/processed/"

# --- Display / plotting defaults -------------------------------------------
# Render up to 500 rows when a DataFrame is displayed (set once).
pd.set_option("display.max_rows", 500)
# Default matplotlib figure size as (width, height).
mpl.rcParams['figure.figsize'] = (20, 16)

In [135]:
# Read the processed flat table; the first CSV column is parsed as datetimes.
flat_table_path = dataPath_Processed + "COVID_Flat_Table_small.csv"
df_analyse = pd.read_csv(flat_table_path, sep=",", parse_dates=[0])

# Show the five most recent rows (sorted chronologically).
df_analyse.sort_values(by='date', ascending=True).tail()

Unnamed: 0,date,Italy,US,Spain,Germany,India,"Korea, South",China,Brazil,United Kingdom
145,2020-06-15,237290,2114026,244109,187682,343091,12155,84378,888271,298315
146,2020-06-16,237500,2137731,244328,188252,354065,12198,84422,923189,299600
147,2020-06-17,237828,2163290,244683,188604,366946,12257,84458,955377,300717
148,2020-06-18,238159,2191052,245268,189817,380532,12306,84494,978142,301935
149,2020-06-19,238011,2220961,245575,190299,395048,12373,84494,1032913,303285


### Helper Functions

In [136]:
def quick_plot(x_in, df_input, yscale='log', slider=False):
    """Draw every column of ``df_input`` as a line over ``x_in`` and show it.

    Parameters
    ----------
    x_in : array-like
        Values used for the x axis of every trace. It should have the same
        length as ``df_input`` so that each value is drawn at its own x.
    df_input : pandas.DataFrame
        One trace is added per column; the column label is the trace name.
    yscale : str, default 'log'
        Passed to plotly as the y-axis ``type`` (e.g. 'log' or 'linear').
    slider : bool, default False
        When truthy, a range slider is shown for the x axis.

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``.
    """
    fig = go.Figure()

    for column in df_input.columns:
        fig.add_trace(
            go.Scatter(
                x=x_in,
                y=df_input[column],
                mode='lines',
                marker_size=3,
                name=column,
            )
        )

    fig.update_layout(
        autosize=True,
        width=800,
        height=600,
        # The font family is a comma-separated fallback list: try 'PT Sans'
        # first, then fall back to a generic monospace face.
        font=dict(family='PT Sans, monospace', size=18, color='#7f7f7f'),
    )
    fig.update_yaxes(type=yscale)
    fig.update_xaxes(
        tickangle=-45,
        nticks=20,
        tickfont=dict(size=14, color='#7f7f7f'),
    )

    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)

    fig.show()

In [137]:
# Countries analysed in this notebook; the labels are used to select columns
# of the flat table.
country_list = [
    'Italy', 'US', 'Spain',
    'Germany', 'India', 'Korea, South',
    'China', 'Brazil', 'United Kingdom',
]

## Understanding Linear Regression

In [138]:
# Least-squares line forced through the origin (no intercept term is fitted).
reg = linear_model.LinearRegression(
    fit_intercept=False,
)

In [139]:
# Regress the German counts on the row position 0, 1, 2, ...
l_vec = df_analyse['Germany'].shape[0]
x = np.arange(l_vec)[:, np.newaxis]   # column vector, one feature per sample
y = np.array(df_analyse['Germany'])

In [140]:
# Fit the origin-constrained line to the raw counts.
reg.fit(X=x, y=y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [141]:
# Evaluate the fitted line on the same row positions used for fitting.
x_hat = np.arange(l_vec)[:, np.newaxis]
y_hat = reg.predict(X=x_hat)

In [142]:
# Independent copy of the date and German counts for inspecting the fit.
LR_inspect = df_analyse.loc[:, ['date', 'Germany']].copy()

In [143]:
# Attach the model output next to the observed counts.
LR_inspect['prediction'] = y_hat

In [144]:
# Observed counts vs. linear prediction on a linear y axis.
quick_plot(
    LR_inspect.date,
    LR_inspect.iloc[:, 1:],   # every column except the date
    yscale='linear',
    slider=True,
)

### Linear regression on the logarithm of the case counts

In [145]:
# Fit in log space. The first five rows are skipped, so x = 0 corresponds to
# data row 5.
l_vec = df_analyse['Germany'].shape[0]
xs = np.arange(l_vec - 5)[:, np.newaxis]
germany_from_row_5 = np.array(df_analyse['Germany'][5:])
ys = np.log(germany_from_row_5)

In [146]:
# Fit the line to the log-transformed counts.
reg.fit(X=xs, y=ys)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [147]:
# The log model was fitted with x = 0 on data row 5 (the first five rows are
# skipped when building xs/ys). Shift the row positions by the same offset so
# that the prediction written back row by row lines up with the dates;
# without the shift the fitted curve appears five days too early.
x_hat = (np.arange(l_vec) - 5).reshape(-1, 1)

y_hat = reg.predict(x_hat)

In [148]:
# Fresh inspection frame; the prediction is mapped back from log space.
LR_inspect = df_analyse.loc[:, ['date', 'Germany']].copy()
prediction_counts = np.exp(y_hat)
LR_inspect['prediction'] = prediction_counts

In [149]:
# Observed counts vs. back-transformed log-linear prediction.
quick_plot(
    LR_inspect.date,
    LR_inspect.iloc[:, 1:],   # every column except the date
    yscale='linear',
    slider=True,
)

### Doubling Rate - Piecewise Linear Regression

In [150]:
# From here on the line may have a non-zero intercept.
reg = linear_model.LinearRegression(
    fit_intercept=True,
)
# Only rows from position 50 onwards are used; x restarts at 0 for that slice.
l_vec = df_analyse['Germany'].shape[0]
xs = np.arange(l_vec - 50)[:, np.newaxis]
ys = np.array(df_analyse['Germany'][50:])

In [151]:
# Fit the line (with intercept) to the late part of the series.
reg.fit(X=xs, y=ys)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [152]:
# Intercept of the most recent fit, i.e. the fitted value at xs = 0.
reg.intercept_

42346.86594059404

In [153]:
# Slope of the most recent fit: fitted change of the target per unit step of xs.
reg.coef_

array([1870.83563756])

In [154]:
def get_doubling_time_via_regression(in_array):
    """Approximate the doubling time of a short window by a straight-line fit.

    A least-squares line ``y = intercept + slope * x`` is fitted to the
    window, with ``x`` centred on the middle of the window (for three samples
    ``x = -1, 0, 1``). The returned value ``intercept / slope`` is the number
    of steps the fitted line needs to grow by the window's central level.

    The closed-form ordinary-least-squares solution is used, so the function
    neither depends on nor refits an estimator object shared with other
    cells, and it returns a plain scalar.

    Parameters
    ----------
    in_array : array-like of float
        Samples of one window, treated as equally spaced and in order;
        at least two values.

    Returns
    -------
    float
        ``intercept / slope``. ``inf`` when the fitted slope is zero and the
        level is not, ``nan`` when both are zero.

    Raises
    ------
    ValueError
        If fewer than two samples are supplied.
    """
    y = np.asarray(in_array, dtype=float)
    n_points = y.size
    if n_points < 2:
        raise ValueError("need at least two samples to fit a line")

    # Centred abscissa: its mean is zero, hence intercept == mean(y).
    x = np.arange(n_points, dtype=float) - (n_points - 1) / 2.0
    intercept = y.mean()
    slope = np.dot(x, y - intercept) / np.dot(x, x)

    # Flat windows give a zero slope; return inf/nan quietly instead of
    # emitting a RuntimeWarning for every such window.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(intercept / slope)

In [155]:
# Start again from a fresh copy of the processed table.
flat_table_path = dataPath_Processed + "COVID_Flat_Table_small.csv"
df_analyse = pd.read_csv(flat_table_path, sep=",", parse_dates=[0])

# Every column after the first one is treated as a country series.
country_list = df_analyse.columns[1:]
df_analyse.head()

Unnamed: 0,date,Italy,US,Spain,Germany,India,"Korea, South",China,Brazil,United Kingdom
0,2020-01-22,0,1,0,0,0,1,548,0,0
1,2020-01-23,0,1,0,0,0,1,643,0,0
2,2020-01-24,0,2,0,0,0,2,920,0,0
3,2020-01-25,0,2,0,0,0,2,1406,0,0
4,2020-01-26,0,5,0,0,0,3,2075,0,0


In [156]:
# Rolling three-sample regression estimate for Germany; windows with fewer
# than three samples produce NaN.
df_analyse["Germany_DR"] = (
    df_analyse['Germany']
    .rolling(window=3, min_periods=3)
    .apply(get_doubling_time_via_regression)
)





In [157]:
# Slice the dates with the same offset as the data so x and y have equal
# length and every value is drawn at its own date.
start_pos = 80
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:, [4]],
           yscale='log',
           slider=True)

In [158]:
def doubling_time(in_array):
    """Doubling time computed from the first and last value of a window.

    Uses the classical formula
    ``T_d = (t_2 - t_1) * ln(2) / ln(q_2 / q_1)``,
    see https://en.wikipedia.org/wiki/Doubling_time. The samples are treated
    as equally spaced one step apart, so a window of ``n`` samples spans
    ``n - 1`` steps between its first and last value.

    Parameters
    ----------
    in_array : array-like of float
        Samples of one window, in order.

    Returns
    -------
    float
        Doubling time in steps. ``inf`` when first and last value are equal
        (no growth); ``nan`` when the window has fewer than two samples or
        when the first or last value is not positive (the growth ratio is
        undefined). A shrinking series yields a negative value.
    """
    y = np.asarray(in_array, dtype=float)
    if y.size < 2:
        return float('nan')

    first, last = y[0], y[-1]
    if first <= 0 or last <= 0:
        return float('nan')

    log_growth = np.log(last / first)
    if log_growth == 0:
        return float('inf')

    elapsed_steps = y.size - 1   # n samples -> n - 1 intervals
    return float(elapsed_steps * np.log(2) / log_growth)

In [159]:
# Rolling three-sample estimate for Germany using the ratio-based formula.
df_analyse["Germany_DT_wiki"] = (
    df_analyse['Germany']
    .rolling(window=3, min_periods=3)
    .apply(doubling_time)
)





In [160]:
# Use one offset for both axes so x and y have equal length and each value
# is plotted at its own date.
start_pos = 5
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:, [4, 11]],
           yscale='linear',
           slider=True)

In [161]:

## filter data
# Smooth each country's series with a Savitzky-Golay filter and store the
# result in a new '<country>_filter' column.
for country in country_list:
    smoothed = signal.savgol_filter(
        df_analyse[country],
        window_length=5,   # window size used for filtering
        polyorder=1,       # order of fitted polynomial
    )
    df_analyse[country + '_filter'] = smoothed

In [162]:
# Smoothed series selected for the comparison plots and doubling estimates.
filter_cols = [
    'Italy_filter',
    'US_filter',
    'Spain_filter',
    'Germany_filter',
]

In [174]:
# Plot the smoothed series, skipping the first rows; dates and data are cut
# at the same position.
start_pos = 5
dates_shown = df_analyse.date[start_pos:]
filtered_shown = df_analyse[filter_cols].iloc[start_pos:, :]
quick_plot(dates_shown, filtered_shown, yscale='linear', slider=True)

In [164]:

# First five rows, now including the derived columns.
df_analyse.head(n=5)

Unnamed: 0,date,Italy,US,Spain,Germany,India,"Korea, South",China,Brazil,United Kingdom,...,Germany_DT_wiki,Italy_filter,US_filter,Spain_filter,Germany_filter,India_filter,"Korea, South_filter",China_filter,Brazil_filter,United Kingdom_filter
0,2020-01-22,0,1,0,0,0,1,548,0,0,...,,0.0,0.4,0.0,0.0,0.0,0.8,355.0,0.0,0.0
1,2020-01-23,0,1,0,0,0,1,643,0,0,...,,0.0,1.3,0.0,0.0,0.0,1.3,736.7,0.0,0.0
2,2020-01-24,0,2,0,0,0,2,920,0,0,...,,0.0,2.2,0.0,0.0,0.0,1.8,1118.4,0.0,0.0
3,2020-01-25,0,2,0,0,0,2,1406,0,0,...,,0.0,3.0,0.0,0.2,0.0,2.4,1584.2,0.0,0.0
4,2020-01-26,0,5,0,0,0,3,2075,0,0,...,,0.0,3.8,0.0,1.0,0.0,3.0,2557.4,0.0,0.0


In [165]:
def get_doubling_time_via_regression(in_array):
    """Approximate the doubling time of a short window by a straight-line fit.

    A least-squares line ``y = intercept + slope * x`` is fitted to the
    window, with ``x`` centred on the middle of the window (for three samples
    ``x = -1, 0, 1``). The returned value ``intercept / slope`` is the number
    of steps the fitted line needs to grow by the window's central level.

    The closed-form ordinary-least-squares solution is used, so the function
    neither depends on nor refits an estimator object shared with other
    cells, and it returns a plain scalar.

    Parameters
    ----------
    in_array : array-like of float
        Samples of one window, treated as equally spaced and in order;
        at least two values.

    Returns
    -------
    float
        ``intercept / slope``. ``inf`` when the fitted slope is zero and the
        level is not, ``nan`` when both are zero.

    Raises
    ------
    ValueError
        If fewer than two samples are supplied.
    """
    y = np.asarray(in_array, dtype=float)
    n_points = y.size
    if n_points < 2:
        raise ValueError("need at least two samples to fit a line")

    # Centred abscissa: its mean is zero, hence intercept == mean(y).
    x = np.arange(n_points, dtype=float) - (n_points - 1) / 2.0
    intercept = y.mean()
    slope = np.dot(x, y - intercept) / np.dot(x, x)

    # Flat windows give a zero slope; return inf/nan quietly instead of
    # emitting a RuntimeWarning for every such window.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(intercept / slope)

In [166]:
def doubling_time(in_array):
    """Doubling time computed from the first and last value of a window.

    Uses the classical formula
    ``T_d = (t_2 - t_1) * ln(2) / ln(q_2 / q_1)``,
    see https://en.wikipedia.org/wiki/Doubling_time. The samples are treated
    as equally spaced one step apart, so a window of ``n`` samples spans
    ``n - 1`` steps between its first and last value.

    Parameters
    ----------
    in_array : array-like of float
        Samples of one window, in order.

    Returns
    -------
    float
        Doubling time in steps. ``inf`` when first and last value are equal
        (no growth); ``nan`` when the window has fewer than two samples or
        when the first or last value is not positive (the growth ratio is
        undefined). A shrinking series yields a negative value.
    """
    y = np.asarray(in_array, dtype=float)
    if y.size < 2:
        return float('nan')

    first, last = y[0], y[-1]
    if first <= 0 or last <= 0:
        return float('nan')

    log_growth = np.log(last / first)
    if log_growth == 0:
        return float('inf')

    elapsed_steps = y.size - 1   # n samples -> n - 1 intervals
    return float(elapsed_steps * np.log(2) / log_growth)

In [167]:
# Window length of the rolling estimate; this gives a smoothing effect.
days_back = 3
for country in country_list:
    rolling_window = df_analyse[country].rolling(window=days_back,
                                                 min_periods=days_back)
    df_analyse[country + '_DR'] = rolling_window.apply(
        get_doubling_time_via_regression, raw=False)

In [168]:
# Same rolling estimate, now on the smoothed series ('<name>_filter_DR').
days_back = 3  # this gives a smoothing effect
for country in filter_cols:
    rolling_window = df_analyse[country].rolling(window=days_back,
                                                 min_periods=days_back)
    df_analyse[country + '_DR'] = rolling_window.apply(
        get_doubling_time_via_regression, raw=False)

In [169]:
# Ratio-based estimate for Germany over the same rolling window length.
germany_rolling = df_analyse['Germany'].rolling(window=days_back,
                                                min_periods=days_back)
df_analyse['Germany_DR_math'] = germany_rolling.apply(doubling_time, raw=False)

In [170]:
# NOTE(review): this cell recomputes the same '<name>_filter_DR' columns as
# an earlier cell with identical settings -- confirm whether it is needed.
days_back = 3  # this gives a smoothing effect
for country in filter_cols:
    rolling_window = df_analyse[country].rolling(window=days_back,
                                                 min_periods=days_back)
    df_analyse[country + '_DR'] = rolling_window.apply(
        get_doubling_time_via_regression, raw=False)

In [171]:
# List all column labels accumulated so far; the plots below select columns
# by their position in this listing.
df_analyse.columns

Index(['date', 'Italy', 'US', 'Spain', 'Germany', 'India', 'Korea, South',
       'China', 'Brazil', 'United Kingdom', 'Germany_DR', 'Germany_DT_wiki',
       'Italy_filter', 'US_filter', 'Spain_filter', 'Germany_filter',
       'India_filter', 'Korea, South_filter', 'China_filter', 'Brazil_filter',
       'United Kingdom_filter', 'Italy_DR', 'US_DR', 'Spain_DR', 'India_DR',
       'Korea, South_DR', 'China_DR', 'Brazil_DR', 'United Kingdom_DR',
       'Italy_filter_DR', 'US_filter_DR', 'Spain_filter_DR',
       'Germany_filter_DR', 'Germany_DR_math'],
      dtype='object')

In [172]:
# Plot the columns at positions 17..26 from row 100 onwards.
# NOTE(review): going by the column listing displayed above, these positions
# mix '*_filter' series with '*_DR' series -- confirm the intended selection;
# positional indices shift whenever a column is added.
start_pos = 100
selected_positions = list(range(17, 27))
quick_plot(
    df_analyse.date[start_pos:],
    df_analyse.iloc[start_pos:, selected_positions],
    yscale='linear',
    slider=True,
)

In [173]:

# Plot the columns at positions 25..28 from row 100 onwards.
# NOTE(review): positional selection depends on the current column order --
# verify against the displayed column listing before interpreting the plot.
start_pos = 100
selected_positions = list(range(25, 29))
quick_plot(
    df_analyse.date[start_pos:],
    df_analyse.iloc[start_pos:, selected_positions],
    yscale='linear',
    slider=True,
)