In [3]:
import pandas as pd

### 1. update all data

In [7]:
data_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
raw_data = pd.read_csv(data_url, sep=",")

In [9]:
raw_data.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [8]:

raw_data.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [14]:
processed_data = raw_data.loc[:, ["date", "location", "total_cases", "people_vaccinated"]]

In [15]:
processed_data.head()

Unnamed: 0,date,location,total_cases,people_vaccinated
0,2020-02-24,Afghanistan,5.0,
1,2020-02-25,Afghanistan,5.0,
2,2020-02-26,Afghanistan,5.0,
3,2020-02-27,Afghanistan,5.0,
4,2020-02-28,Afghanistan,5.0,


In [16]:
processed_data.dtypes

date                  object
location              object
total_cases          float64
people_vaccinated    float64
dtype: object

### 2. Process pipeline


In [12]:
import numpy as np

from datetime import datetime

In [29]:
def store_relational_JH_data():
    ''' Transformes the COVID data in a relational data set

    '''
    
    data_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
    pd_raw = pd.read_csv(data_url, sep=",")
    
    pd_relational_model = pd_raw.loc[:, ["date", "location", "total_cases", "people_vaccinated"]]
    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')
    pd_relational_model=pd_relational_model.rename(columns={'total_cases':'confirmed',
                      'location':'country'})

    pd_relational_model.to_csv('/Users/pawnesh/ws/covid_data_science/covid_19_data_stats/data/raw/COVID-19/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
    print(' Latest date is: '+str(max(pd_relational_model.date)))
    print(pd_relational_model.head())
    print(pd_relational_model.dtypes)

In [30]:
store_relational_JH_data()

 Number of rows stored: 202186
 Latest date is: 2022-07-18 00:00:00
        date      country  confirmed  people_vaccinated
0 2020-02-24  Afghanistan        5.0                NaN
1 2020-02-25  Afghanistan        5.0                NaN
2 2020-02-26  Afghanistan        5.0                NaN
3 2020-02-27  Afghanistan        5.0                NaN
4 2020-02-28  Afghanistan        5.0                NaN
date                 datetime64[ns]
country                      object
confirmed                   float64
people_vaccinated           float64
dtype: object


In [27]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.1.1-cp38-cp38-macosx_10_13_x86_64.whl (8.5 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting scipy>=1.3.2
  Using cached scipy-1.8.1-cp38-cp38-macosx_12_0_universal2.macosx_10_9_x86_64.whl (55.3 MB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.1.0 scikit-learn-1.1.1 scipy-1.8.1 threadpoolctl-3.1.0
You should consider upgrading via the '/Users/pawnesh/ws/covid_data_science/covid_19_data_stats/venv/bin/python -m pip install --upgrade pip' command.[0m


### 3 Filter and Doubling Rate Calculation

In [23]:
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        Parameters:
        ----------
        in_array : pandas.series

        Returns:
        ----------
        Doubling rate: double
    '''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        parameters:
        ----------
        df_input : pandas.series
        column : str
        window : int
            used data points to calculate the filter result

        Returns:
        ----------
        df_result: pd.DataFrame
            the index of the df_input has to be preserved in result
    '''

    degree=1
    df_result=df_input

    filter_in=df_input[column].fillna(0) # attention with the neutral element here
    try:
        result=signal.savgol_filter(np.array(filter_in),
                               window, # window size used for filtering
                               1)
    except (UnboundLocalError, ValueError) as e:
        print(e)
        result = filter_in
    
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Rolling Regression to approximate the doubling time'

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.DataFrame
    '''
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)



    return result




def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten
    
    pd_filtered_result=df_output[['country',filter_on]].groupby(['country']).apply(savgol_filter)#.reset_index()

    #print('--+++ after group by apply')
    #print(pd_filtered_result[pd_filtered_result['country']=='Germany'].tail())
    
    #df_output=pd.merge(df_output,pd_filtered_result[['index',str(filter_on+'_filtered')]],on=['index'],how='left')
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    #print(df_output[df_output['country']=='Germany'].tail())
    return df_output.copy()





def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'
    

    pd_DR_result= df_input.groupby(['country']).apply(rolling_reg,filter_on).reset_index()
    #import pdb; pdb.set_trace()
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_1':'index'})
    #import pdb; pdb.set_trace()
    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])


    return df_output

In [24]:
test_data_reg=np.array([2,4,6])
result=get_doubling_time_via_regression(test_data_reg)
print('the test slope is: '+str(result))

pd_JH_data=pd.read_csv('/Users/pawnesh/ws/covid_data_science/covid_19_data_stats/data/raw/COVID-19/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
pd_JH_data = pd_JH_data.reset_index()
pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

#test_structure=pd_JH_data[((pd_JH_data['country']=='US')|
#                  (pd_JH_data['country']=='Germany'))]

pd_result_larg=calc_filtered_data(pd_JH_data)
pd_result_larg=calc_doubling_rate(pd_result_larg)
pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')


mask=pd_result_larg['confirmed']>100
pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.NaN)


the test slope is: [2.]
If mode is 'interp', window_length must be less than or equal to the size of x.


In [25]:
pd_result_larg.to_csv('/Users/pawnesh/ws/covid_data_science/covid_19_data_stats/data/raw/COVID-19/processed/COVID_final_set.csv',sep=';',index=False)
print(pd_result_larg[pd_result_larg['country']=='Germany'].tail())

       index_x       date  country   confirmed  people_vaccinated  \
69073    69073 2022-07-14  Germany  29569943.0         64717617.0   
69074    69074 2022-07-15  Germany  29692989.0         64718873.0   
69075    69075 2022-07-16  Germany  29692989.0         64719502.0   
69076    69076 2022-07-17  Germany  29692989.0         64719652.0   
69077    69077 2022-07-18  Germany  29853680.0         64720193.0   

       confirmed_filtered  index_y  confirmed_DR  confirmed_filtered_DR  
69073          29544854.0    69073    224.914146             249.481353  
69074          29621831.8    69074    254.141047             329.136490  
69075          29700518.0    69075    481.965666             380.594116  
69076          29757265.4    69076           inf             438.490966  
69077          29814012.8    69077    370.232965             524.381124  


### 4 Visual Board


In [None]:
# %load src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
print(os.getcwd())
df_input_large=pd.read_csv('/Users/pawnesh/ws/covid_data_science/covid_19_data_stats/data/raw/COVID-19/processed/COVID_final_set.csv',sep=';')


fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['United States', 'Germany','Italy'], # which are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):


    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.sum).reset_index()
       #print(show_doubling)


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)

/Users/pawnesh/ws/covid_data_science/covid_19_data_stats/notebooks
Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on
