# Update data

In [5]:
import pandas as pd
import numpy as np
import missingno as mno
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime


def get_data():
    ''' Download the OWID COVID-19 data set and store a local raw copy

        The CSV is read from the OWID GitHub repository, the first two
        columns are dropped and the remaining columns are written (without
        the index) to '../data/raw/COVID_raw'.
        The number of stored rows is printed.
    '''
    source_url='https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
    owid_frame=pd.read_csv(source_url,sep=',')

    # keep everything from the third column onwards
    trimmed=owid_frame.iloc[:,2:]
    trimmed.to_csv('../data/raw/COVID_raw',index=False)

    print(f' Number of regions rows: {trimmed.shape[0]}')
    

# trigger the download only when this code is executed directly
if __name__ == '__main__':
    get_data()


 Number of regions rows: 204254


# delete

In [6]:

# Quick look at the first three columns of the stored raw file.
# The frame is deliberately NOT called `list`: that would shadow the builtin
# for every later cell running in the same kernel.
path='../data/raw/COVID_raw'
raw_preview=pd.read_csv(path,sep=',')
raw_preview=raw_preview.iloc[:,0:3]
raw_preview.tail()

Unnamed: 0,location,date,total_cases
204249,Zimbabwe,2022-07-23,256246.0
204250,Zimbabwe,2022-07-24,256254.0
204251,Zimbabwe,2022-07-25,256270.0
204252,Zimbabwe,2022-07-26,256284.0
204253,Zimbabwe,2022-07-27,256315.0


# Process Pipeline

In [7]:


def store_relational_GH_data():
    ''' Transforms the raw COVID data from GitHub into a relational data set

        Reads '../data/raw/COVID_raw', keeps the columns location, date and
        total_cases (the latter renamed to 'confirmed') and writes the result
        to '../data/processed/COVID_relational_confirmed.csv'.
        The number of stored rows and the latest date are printed.
    '''

    data_path='../data/raw/COVID_raw'
    processed_path='../data/processed/COVID_relational_confirmed.csv'

    pd_raw=pd.read_csv(data_path)

    # select by name instead of by position, so a changed column order in the
    # upstream file cannot silently pick the wrong columns
    pd_relational_model=pd_raw[['location','date','total_cases']]
    pd_relational_model=pd_relational_model.rename(columns={'total_cases':'confirmed'})

    pd_relational_model.to_csv(processed_path,index=False)

    # report directly from the frame that was written, no need to read it back
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
    print(' Latest date is: '+str(pd_relational_model['date'].max()))
    
# build the relational file only when this code is executed directly
if __name__ == '__main__':

    store_relational_GH_data()

 Number of rows stored: 204254
 Latest date is: 2022-07-27


# delete

In [8]:
#new_path='../data/processed/COVID_relational_confirmed.csv'
#pd_relational_model=pd.read_csv(new_path)
#pd_relational_model.columns

#df_result=pd_relational_model
#filter_in=pd_relational_model['confirmed'].fillna(0) # attention with the neutral element here

#result=signal.savgol_filter(np.array(filter_in),
   #                        5, # window size used for filtering
  #                         1,
 #                           mode='nearest')
#df_result[str('confirmed_filtered')]=result
#df_result.tail()

# delete

In [10]:
#df_result_test=df_result.rename(index={'':'index'})
#df_result_test.head()

# delete

# Filtering and Doubling rate calculation

In [11]:
from sklearn import linear_model
# module-level regression model, (re-)fitted by get_doubling_time_via_regression on every call
reg = linear_model.LinearRegression(fit_intercept=True)

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        A straight line is fitted through three consecutive observations
        placed at x = -1, 0, 1 with the module-level model `reg`;
        the returned value is intercept / slope.

        Parameters:
        ----------
        in_array : pandas.series
            (or any array-like) holding exactly three values

        Returns:
        ----------
        Doubling rate: double
            a plain float; inf or nan (numpy division semantics) when the
            fitted slope is zero
    '''

    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_[0] # coef_ holds one entry per feature, we only have one

    # hand back a scalar instead of a 1-element array: rolling(...).apply()
    # needs one float per window and newer NumPy versions deprecate the
    # implicit array-to-scalar conversion
    return float(np.divide(intercept,slope))

def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        Parameters:
        ----------
        df_input : pd.DataFrame
            has to contain `column`
        column : str
            name of the column to be filtered
        window : int
            used data points to calculate the filter result
            (handed to scipy as window_length)

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of df_input with the additional column '<column>_filtered',
            the index of the df_input is preserved in result
    '''

    degree=1 # polynomial order of the local fit
    df_result=df_input.copy() # never write the new column into the frame of the caller

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree,
                           mode='nearest')
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Apply the doubling time regression on a rolling window of three rows

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            column the regression is calculated on

        Returns:
        ----------
        result: pd.Series
            one value per row of df_input[col]; rows without a complete
            window of three observations stay NaN (min_periods)
    '''
    window_size=3
    rolling_view=df_input[col].rolling(window=window_size,min_periods=window_size)
    return rolling_view.apply(get_doubling_time_via_regression,raw=False)

def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            has to contain the columns 'location' and `filter_on`
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column '<filter_on>_filtered'
            on (a copy of) the input data frame
    '''

    must_contain=set(['location',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    # filter_on is forwarded to savgol_filter, otherwise always 'confirmed' would be filtered.
    # group_keys=False keeps the plain row index on the result (newer pandas
    # versions would otherwise prepend the group key), which the index based
    # merge below relies on.
    pd_filtered_result=df_output.groupby(['location'],group_keys=False).apply(savgol_filter,filter_on)

    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    return df_output

def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Append the approximated doubling rate of one column to the data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            has to contain the columns 'location' and `filter_on`
        filter_on: str
            column the rolling regression is calculated on
        Returns:
        ----------
        df_output: pd.DataFrame
            df_input left-joined with the new column '<filter_on>_DR'
    '''

    required_columns={'location',filter_on}
    assert required_columns.issubset(set(df_input.columns)), ' Erro in calc_doubling_rate not all columns in data frame'

    dr_column=filter_on+'_DR'

    # rolling regression per location, then turn the index levels into columns;
    # 'level_1' is renamed to 'index' and serves as the join key below
    per_location=df_input.groupby(['location']).apply(rolling_reg,filter_on)
    dr_frame=per_location.reset_index()
    dr_frame=dr_frame.rename(columns={filter_on:dr_column,'level_1':'index'})

    # join on the index of the big table and on the 'index' column of the regression result
    merged=pd.merge(df_input,dr_frame[['index',dr_column]],left_index=True,right_on=['index'],how='left')
    return merged.drop(columns=['index'])

if __name__ == '__main__':
    # quick check of the regression helper on a small linear series
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_GH_data=pd.read_csv('../data/processed/COVID_relational_confirmed.csv')

    # filter first, then calculate the doubling rate on the raw and on the filtered column
    pd_result_larg=calc_filtered_data(pd_GH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')

    # blank out the filtered doubling rate wherever not more than 100 cases are confirmed;
    # np.nan is used because the alias np.NaN does not exist anymore in NumPy 2.0
    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)
    pd_result_larg.to_csv('../data/processed/COVID_final_set.csv',index=False)
    print(pd_result_larg[pd_result_larg['location']=='Germany'].tail())


the test slope is: [2.]
      location        date   confirmed  confirmed_filtered  confirmed_DR  \
69793  Germany  2022-07-23  30331133.0          30341824.8  6.586269e+02   
69794  Germany  2022-07-24  30331133.0          30413677.4  3.033113e+07   
69795  Germany  2022-07-25  30476605.0          30487953.4  4.176697e+02   
69796  Germany  2022-07-26  30598385.0          30562229.0  2.280148e+02   
69797  Germany  2022-07-27  30702511.0          30636504.6  2.708427e+02   

       confirmed_filtered_DR  
69793             444.049693  
69794             430.659422  
69795             416.270124  
69796             410.469537  
69797             411.470644  


# Dynamic Dashboard

In [21]:
import os

import dash
# NOTE: `from dash import dcc` / `from dash import html` need dash >= 2.0
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

import plotly.graph_objects as go
print(os.getcwd())
# the final set is written by pandas with the default ',' separator,
# so it has to be read with the same separator
df_input_large=pd.read_csv('../data/processed/COVID_final_set.csv',sep=',')


fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),


    # the processed data set identifies a country by its 'location' column
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['location'].unique()],
        # which are pre-selected; NOTE(review): assumes the data set spells the
        # US as 'United States' - confirm against the location column
        value=['United States', 'Germany','Italy'],
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    # the values are the column names of the final data set
    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    ''' Rebuild the main figure for the selected locations and the selected column

        Parameters:
        ----------
        country_list: list of str
            selected values of the 'country_drop_down' dropdown
        show_doubling: str
            selected value of the 'doubling_time' dropdown, i.e. the name of
            the column of df_input_large which is plotted

        Returns:
        ----------
        figure: dict
            'data' holds one trace per selected location, 'layout' the axis setup
    '''

    # the dropdown delivers column names, the doubling rate columns end with '_DR'
    is_doubling_rate=show_doubling.endswith('_DR')

    if is_doubling_rate:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source Our World in Data, log-scale)'
              }

    # a rate has to be averaged, case counts are summed up
    agg_func='mean' if is_doubling_rate else 'sum'

    traces = []
    for each in (country_list or []): # an emptied dropdown must not break the callback

        df_plot=df_input_large[df_input_large['location']==each]

        # only the plotted column is aggregated per location and date
        df_plot=df_plot[['location','date',show_doubling]].groupby(['location','date']).agg(agg_func).reset_index()

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }

if __name__ == '__main__':

    # start the Dash server in debug mode with the reloader switched off
    app.run_server(debug=True, use_reloader=False)




ImportError: cannot import name 'dcc' from 'dash' (/Users/gulzar/opt/anaconda3/lib/python3.9/site-packages/dash/__init__.py)