# 1. Updating the all data

In [1]:
# %load ../src/data/get_data.py
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_john_hopkins_data():
    git_pull = subprocess.Popen("/usr/bin/git pull",
                                cwd = os.path.dirname('../data/raw/COVID-19/'),
                                shell = True,
                                stdout = subprocess.PIPE,
                                stderr = subprocess.PIPE)
    
    (out, error) = git_pull.communicate()
    
    print("Error : " + str(error))
    print("out : "  + str(out))
    
def get_current_data_germany():
    # Total states = 16
    # data = requests.get("https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json")
    
    # 400 regions
    data = requests.get("https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json")
    
    json_object = json.loads(data.content)
    full_list = []
    for pos, each_dict in enumerate (json_object['features'] [:]):
        full_list.append(each_dict['attributes'])
        
    pd_full_list = pd.DataFrame(full_list)
    pd_full_list.to_csv('../data/raw/NPGEO/Ger_state_data.csv', sep = ';')
    print('Number of regions rows : ' +str(pd_full_list.shape[0]))
    
if __name__ == '__main__':
    get_john_hopkins_data()
    get_current_data_germany()

Error : b'From https://github.com/CSSEGISandData/COVID-19\n   1326d8d8..a041275a  master              -> origin/master\n * [new branch]        2962-Update-Mexico-July-25 -> origin/2962-Update-Mexico-July-25\n * [new branch]        2963-Update-Hidalgo -> origin/2963-Update-Hidalgo\n   0f1fb49c..dcaf17e5  web-data            -> origin/web-data\n'
out : b'Updating 1326d8d8..a041275a\nFast-forward\n README.md                                          |    1 +\n csse_covid_19_data/README.md                       |    6 +-\n .../csse_covid_19_daily_reports/07-25-2020.csv     |   66 +-\n .../csse_covid_19_daily_reports/07-30-2020.csv     | 3936 ++++++++++++\n .../csse_covid_19_daily_reports/07-31-2020.csv     | 3936 ++++++++++++\n .../csse_covid_19_daily_reports/08-01-2020.csv     | 3936 ++++++++++++\n .../csse_covid_19_daily_reports_us/07-30-2020.csv  |   59 +\n .../csse_covid_19_daily_reports_us/07-31-2020.csv  |   59 +\n .../csse_covid_19_daily_reports_us/08-01-2020.csv  |   59 +\n .../csse

# 2. Process pipeline

In [2]:
# %load ../src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime

def store_relational_JH_data():
    
    data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw = pd.read_csv(data_path)
    
    pd_data_base = pd_raw.rename(columns = {'Country/Region': 'country',
                                           'Province/State': 'state'})
    
    pd_data_base['state'] = pd_data_base['state'].fillna('no')
    
    pd_data_base = pd_data_base.drop(['Lat', 'Long'], axis = 1)
    
    pd_relational_model = pd_data_base.set_index(['state', 'country'])\
                                    .T                                \
                                        .stack(level = [0,1])         \
                                            .reset_index()           \
                                                .rename(columns = {'level_0' : 'date',
                                                                   0: 'confirmed'},
                                                        )
    pd_relational_model['date'] = pd_relational_model.date.astype('datetime64[ns]')
    
    pd_relational_model.to_csv('../data/processed/COVID_relational_confirmed.csv', sep = ';', index = False)
    print('Number of rows stored :' +str(pd_relational_model.shape[0]))
    
if __name__ == '__main__':
    
    store_relational_JH_data()

Number of rows stored :51338


# 3. Filter and Doubling Rate calculation

In [12]:
# %load ../src/features/build_features.py
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept = True)
import pandas as pd

from scipy import signal

def get_doubling_time_via_regression(in_array):
    '''Using a linear regression to approximate the doubling rate'''
    
    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1,1)
    
    assert len(in_array) == 3
    reg.fit(X,y)
    intercept = reg.intercept_
    slope = reg.coef_
    
    return intercept/slope

def savgol_filter(df_input, column = 'confirmed', window = 5):
    ''' Savgol filter which can be used in groupby apply functions
    it ensures that the data structure is kept'''
    
    window = 5,
    degree = 1
    df_result = df_input
    
    filter_in = df_input[column].fillna(0)
    
    result = signal.savgol_filter(np.array(filter_in),
                                  5,
                                  1)
    df_result[column+'_filtered'] = result
    return df_result

def rolling_reg(df_input, col = 'confirmed'):
    '''input has to be a data frame'''
    
    days_back = 3
    result = df_input[col].rolling(
                    window = days_back,
                    min_periods = days_back).apply(get_doubling_time_via_regression, raw = False)
    return result

def calc_filtered_data(df_input,filter_on = 'confirmed'):
    '''
        calculate the savgol filter and return the merged data frame

    '''
    must_contain = set(['state','country', filter_on])
    assert must_contain.issubset(set(df_input.columns)), 'error in calc_filtered_data not all columns in data frame'
    
    pd_filtered_result = df_input[['state','country', filter_on]].groupby(['state','country']).apply(savgol_filter).reset_index()
    df_output = pd.merge(df_input, pd_filtered_result[['index', filter_on+'_filtered']], on = ['index'], how = 'left')
    
    return df_output

def calc_doubling_rate(df_input,filter_on = 'confirmed'):
    '''
        calculate approximated doubling rate and return merged data frame
    '''
    
    must_contain = set(['state','country', filter_on])
    assert must_contain.issubset(set(df_input.columns)), 'Error in calc_filtered_data not all columns in data frame'
    
    pd_DR_result = df_input.groupby(['state', 'country']).apply(rolling_reg, filter_on).reset_index()
    pd_DR_result = pd_DR_result.rename(columns = {filter_on: filter_on+'_DR',
                                                  'level_2': 'index'})
    
    df_output = pd.merge(df_input, pd_DR_result[['index', filter_on + '_DR']], on = ['index'], how = 'left')
    return df_output

def pd_result_large_final():
    pd_JH_data = pd.read_csv('../data/processed/COVID_relational_confirmed.csv', sep = ';', parse_dates = [0])
    pd_JH_data = pd_JH_data.sort_values('date', ascending = True).reset_index().copy()
    
    pd_result_large = calc_filtered_data(pd_JH_data)
    pd_result_large = calc_doubling_rate(pd_result_large)
    pd_result_large = calc_doubling_rate(pd_result_large, 'confirmed_filtered')
    pd_result_large.to_csv('../data/processed/COVID_final_set.csv',sep=';',index=False)

    


if __name__ == '__main__':
    test_data_reg = np.array([2,4,6])
    result = get_doubling_time_via_regression(test_data_reg)
    print('The test slope is:' + str(result))
    
    pd_result_large_final()
    print(pd_result_large_final.head())
    
    
    

    


The test slope is:[2.]
   index       date    state       country  confirmed  confirmed_filtered  \
0      0 2020-01-22  Alberta        Canada        0.0                 0.0   
1    169 2020-01-22       no  Korea, South        1.0                -4.8   
2    170 2020-01-22       no        Kosovo        0.0                 0.0   
3    171 2020-01-22       no        Kuwait        0.0                 0.0   
4    172 2020-01-22       no    Kyrgyzstan        0.0                10.8   

   confirmed_DR  confirmed_filtered_DR  
0           NaN                    NaN  
1           NaN                    NaN  
2           NaN                    NaN  
3           NaN                    NaN  
4           NaN                    NaN  
