In [1]:
import subprocess
import os
import numpy as np
import pandas as pd

import matplotlib

from datetime import datetime

from sklearn import linear_model

from scipy import signal


# Allow up to 500 rows when pandas displays a frame or series.
pd.set_option('display.max_rows', 500)

# Johns Hopkins CSV data


In [2]:
# Update the local git clone under ../data/raw/COVID-19/.
# The command is given as an argument list, so no shell is involved.
git_pull = subprocess.Popen(
    ["git", "pull"],
    cwd=os.path.dirname('../data/raw/COVID-19/'),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
(out, error) = git_pull.communicate()

# git also writes to stderr on a successful pull, so the exit status -- not the
# presence of stderr text -- decides whether the update worked. Only a notice is
# printed so that the notebook can still continue with the data already on disk.
if git_pull.returncode != 0:
    print("git pull failed with exit code " + str(git_pull.returncode))

print("Error : " + str(error))
print("out : " + str(out))

Error : b'From https://github.com/CSSEGISandData/COVID-19\n   e677c665..a10636a9  master                 -> origin/master\n * [new branch]        Fix-Dailies-Aug-15     -> origin/Fix-Dailies-Aug-15\n * [new branch]        Pull-MA-Updates-Aug-16 -> origin/Pull-MA-Updates-Aug-16\n * [new branch]        Update-Brazil-Aug-12   -> origin/Update-Brazil-Aug-12\n   1e5bd023..6854f5ab  web-data               -> origin/web-data\n'
out : b'Updating e677c665..a10636a9\nFast-forward\n README.md                                          |   34 +-\n csse_covid_19_data/README.md                       |   13 +-\n csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv   |    8 +-\n .../csse_covid_19_daily_reports/08-10-2020.csv     | 3941 ++++++++++++\n .../csse_covid_19_daily_reports/08-11-2020.csv     | 3944 ++++++++++++\n .../csse_covid_19_daily_reports/08-12-2020.csv     | 3944 ++++++++++++\n .../csse_covid_19_daily_reports/08-13-2020.csv     | 3946 ++++++++++++\n .../csse_covid_19_daily_reports/08-14-2020

# Building relational data model

In [3]:
# Read the global confirmed-cases time series from the local repository clone.
data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path)

# Shorter key names, coordinates removed, every missing value replaced by 'no'.
pd_db = (
    pd_raw
    .rename(columns={'Province/State': 'state', 'Country/Region': 'country'})
    .drop(columns=['Lat', 'Long'])
    .fillna('no')
)


In [5]:
# Reshape to a long layout: the former column labels become the 'date' column
# and each (date, state, country) combination gets one 'confirmed' value.
pd_JH_data = (
    pd_db
    .set_index(['state', 'country'])
    .T
    .stack(level=[0, 1])
    .reset_index()
    .rename(columns={'level_0': 'date', 0: 'confirmed'})
)

pd_JH_data['date'] = pd.to_datetime(pd_JH_data['date'], format="%m/%d/%y")

# The export happens before sorting, so the file keeps the stacked row order.
pd_JH_data.to_csv('../data/processed/COVID_relational_confirmed.csv', sep=';', index=False)

# Sort by date and renumber the rows from zero.
pd_JH_data = pd_JH_data.sort_values(by='date').reset_index(drop=True).copy()

# Groupby apply

## Calculating doubling rate with groupby apply

In [6]:
# Module-level linear regression estimator that also fits an intercept term.
reg = linear_model.LinearRegression(fit_intercept=True)


def get_doubling_time_via_regression(in_array):
    ''' Use a linear least-squares fit to approximate the doubling rate

        A straight line is fitted to the window values over step offsets
        centred on zero (-1, 0, 1 for three values). The result is
        intercept / slope: the fitted level at the window centre divided
        by the fitted change per step.

        The fit is computed in closed form: because the offsets sum to
        zero, the intercept is the mean of the values and the slope is
        sum(x * y) / sum(x * x).

        Parameters:
        ----------
        in_array : pandas.Series or array-like
            window values, at least two of them

        Returns:
        ----------
        Doubling rate: float
            +inf / -inf when the fitted slope is zero and the intercept is
            not, NaN when both are zero

        Raises:
        ----------
        ValueError
            if fewer than two values are passed
    '''

    y = np.asarray(in_array, dtype=float).ravel()
    if y.size < 2:
        raise ValueError('at least two values are needed to fit a line')

    # Centred offsets: their sum is zero, which decouples intercept and slope.
    x = np.arange(y.size) - (y.size - 1) / 2.0

    intercept = np.float64(y.mean())
    slope = np.float64(np.dot(x, y) / np.dot(x, x))

    # A flat window has slope zero; keep the inf/NaN result but silence the
    # floating-point warning that the division would otherwise emit.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(intercept / slope)

In [7]:
def rolling_reg(df_input: pd.DataFrame, col='confirmed', days_back=3):
    ''' Rolling doubling-rate estimate for one column of a data frame

        Parameters:
        ----------
        df_input : pd.DataFrame
        col : str
            name of the column the rolling window runs over
        days_back : int
            window length; positions with fewer values available give NaN

        Returns:
        ----------
        result: pandas.Series
            get_doubling_time_via_regression applied to every window,
            carrying the index of df_input
    '''
    windows = df_input[col].rolling(window=days_back, min_periods=days_back)
    return windows.apply(get_doubling_time_via_regression, raw=False)

In [8]:
# Rolling doubling rate for every (state, country) series. After reset_index()
# the third index level ('level_2') is renamed to 'index' so it can serve as
# the merge key.
pd_DR_result = (
    pd_JH_data[['state', 'country', 'confirmed']]
    .groupby(['state', 'country'])
    .apply(rolling_reg, 'confirmed')
    .reset_index()
    .rename(columns={'confirmed': 'doubling_rate', 'level_2': 'index'})
)


In [9]:
# Expose the row labels as an 'index' column. The guard keeps the cell
# idempotent: an unguarded second run would add a 'level_0' column instead.
if 'index' not in pd_JH_data.columns:
    pd_JH_data = pd_JH_data.reset_index()

In [10]:
# Attach the doubling rate to the long table via the shared 'index' column.
pd_results_large = pd_JH_data.merge(
    pd_DR_result[['index', 'doubling_rate']],
    on='index',
    how='left',
)

## Filtering data with groupby apply

In [11]:
def savgol_filter(df_input, column='confirmed', window=5, degree=1):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        The filtered values are written to a new column named
        '<column>_filtered'. The column is added to df_input itself and
        that same object is returned, i.e. the input frame is modified
        in place.

        parameters:
        ----------
        df_input : pd.DataFrame
        column : str
            name of the column to filter
        window : int
            used data points to calculate the filter result
        degree : int
            order of the polynomial fitted inside each window

        Returns:
        ----------
        df_result: pd.DataFrame
            df_input with the additional '<column>_filtered' column;
            the index of the df_input is preserved in result
    '''

    # Missing values enter the filter as 0 -- attention with the neutral element here
    filter_in = df_input[column].fillna(0)

    result = signal.savgol_filter(np.array(filter_in),
                                  window,  # window size used for filtering
                                  degree)  # polynomial order

    df_result = df_input
    df_result[str(column + '_filtered')] = result
    return df_result

In [12]:
# Filter the confirmed counts separately for every (state, country) series.
pd_filtered_result = (
    pd_results_large[['state', 'country', 'confirmed']]
    .groupby(['state', 'country'])
    .apply(savgol_filter)
    .reset_index()
)

In [13]:
# Attach the filtered counts via the shared 'index' column and preview the result.
pd_results_large = pd_results_large.merge(
    pd_filtered_result[['index', 'confirmed_filtered']],
    on='index',
    how='left',
)
pd_results_large.head()

Unnamed: 0,index,date,state,country,confirmed,doubling_rate,confirmed_filtered
0,0,2020-01-22,Alberta,Canada,0.0,,0.0
1,1,2020-01-22,no,"Korea, South",1.0,,0.8
2,2,2020-01-22,no,Kosovo,0.0,,0.0
3,3,2020-01-22,no,Kuwait,0.0,,0.0
4,4,2020-01-22,no,Kyrgyzstan,0.0,,0.0


## Calculating filtered doubling rate with groupby apply

In [14]:
# Rolling doubling rate of the filtered counts for every (state, country)
# series. After reset_index() the third index level ('level_2') is renamed to
# 'index' so it can serve as the merge key.
pd_filtered_dr = (
    pd_results_large[['state', 'country', 'confirmed_filtered']]
    .groupby(['state', 'country'])
    .apply(rolling_reg, 'confirmed_filtered')
    .reset_index()
    .rename(columns={'confirmed_filtered': 'doubling_rate_filtered', 'level_2': 'index'})
)


In [15]:
# Attach the filtered doubling rate via the shared 'index' column and preview the result.
pd_results_large = pd_results_large.merge(
    pd_filtered_dr[['index', 'doubling_rate_filtered']],
    on='index',
    how='left',
)
pd_results_large.head()

Unnamed: 0,index,date,state,country,confirmed,doubling_rate,confirmed_filtered,doubling_rate_filtered
0,0,2020-01-22,Alberta,Canada,0.0,,0.0,
1,1,2020-01-22,no,"Korea, South",1.0,,0.8,
2,2,2020-01-22,no,Kosovo,0.0,,0.0,
3,3,2020-01-22,no,Kuwait,0.0,,0.0,
4,4,2020-01-22,no,Kyrgyzstan,0.0,,0.0,


In [16]:
# Keep the filtered doubling rate only where more than 100 cases are confirmed;
# every other row is set to NaN. np.nan is used because the np.NaN alias no
# longer exists in NumPy 2.0.
mask = pd_results_large['confirmed'] > 100
pd_results_large['doubling_rate_filtered'] = pd_results_large['doubling_rate_filtered'].where(mask, other=np.nan)

In [46]:
# Export with the final column names. The rename is applied to the exported
# frame only: the plotting cells further down still select 'doubling_rate' and
# 'doubling_rate_filtered' from pd_results_large, so renaming the frame itself
# here would make a top-to-bottom run fail with a KeyError.
pd_results_large.rename(columns={
    'doubling_rate_filtered': 'confirmed_filtered_DR',
    'doubling_rate': 'confirmed_DR'
}).to_csv('../data/processed/COVID_final_set.csv', sep=';', index=False)

In [20]:
# Countries to draw, one trace each.
country_list = [
    'US',
    'Germany',
    'Italy',
]

In [44]:
import plotly.graph_objects as go
import math

# One trace per country showing its doubling rate over time.
fig = go.Figure()

for each in country_list:

    df_plot = pd_results_large[pd_results_large['country'] == each]

    # Average per (country, date). The text column 'state' is left out of the
    # selection because it cannot be averaged.
    df_plot = (
        df_plot[['country', 'date', 'confirmed', 'confirmed_filtered',
                 'doubling_rate', 'doubling_rate_filtered']]
        .groupby(['country', 'date'])
        .mean()
        .reset_index()
    )

    fig.add_trace(go.Scatter(
        x=df_plot.date,
        y=df_plot['doubling_rate'],
        mode='markers+lines',
        name=each,
        marker_size=4,
        opacity=0.9,
        line_width=2
        )
    )

# The y axis carries the doubling rate, so the title says so.
fig.update_layout(
    width=900,
    height=600,
    xaxis_title="Time",
    yaxis_title="Doubling rate of confirmed cases (SOURCE: Johns Hopkins CSSE)",
    xaxis_rangeslider_visible=True
)

# fig.update_yaxes(type='log', range=[1.1, math.log10((df_plot.max()[1:].max()//1000 + 1) * 1000)])

# Last expression of the cell: renders the figure.
fig

In [38]:
# Rows of a single country for closer inspection.
is_germany = pd_results_large['country'] == 'Germany'
df_plot = pd_results_large.loc[is_germany]

In [39]:
# Average per (country, date). The text column 'state' is left out of the
# selection because it cannot be averaged.
df_plot = (
    df_plot[['country', 'date', 'confirmed', 'confirmed_filtered',
             'doubling_rate', 'doubling_rate_filtered']]
    .groupby(['country', 'date'])
    .mean()
    .reset_index()
)

In [31]:
# Show which columns df_plot currently has.
df_plot.columns

Index(['index', 'date', 'state', 'country', 'confirmed', 'doubling_rate',
       'confirmed_filtered', 'doubling_rate_filtered'],
      dtype='object')

In [42]:
# Date stored in the row labelled 103.
df_plot['date'][103]

Timestamp('2020-05-04 00:00:00')

In [43]:
# Filtered doubling rate stored in the row labelled 103.
filtered_rate = df_plot['doubling_rate_filtered']
filtered_rate[103]

204.8486947915371

In [1]:
# A slice whose end lies past the last element simply stops at the end of the list.
[1, 2, 4][:5]

[1, 2, 4]

In [None]:
{"_id":276,"iso2":"DE","iso3":"DEU","lat":51,"long":9,"flag":"https://disease.sh/assets/img/flags/de.png"},"cases":260149,"todayCases":424,"deaths":9424,"todayDeaths":1,"recovered":234850,"todayRecovered":1550,"active":15875,"critical":233,"casesPerOneMillion":3103,"deathsPerOneMillion":112,"tests":13436301,"testsPerOneMillion":160266,"population":83837542,"continent":"Europe","oneCasePerPeople":322,"oneDeathPerPeople":8896,"oneTestPerPeople":6,"activePerOneMillion":189.35,"recoveredPerOneMillion":2801.25,"criticalPerOneMillion":2.78},