# Model

## Imports

In [64]:
import os
from datetime import timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.graphics.regressionplots import plot_leverage_resid2
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

In [65]:
import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
plt.rcParams['axes.unicode_minus'] = False

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Constants & Utils

In [66]:
def int2date(date):
    """Format a ``YYYYMMDD`` value as a ``YYYY-MM-DD`` string.

    The argument is converted with ``str`` and split purely by position:
    characters 0-3 are taken as the year, 4-5 as the month and everything
    from position 6 onwards as the day. No validation is performed.

    Args:
        date: Value whose ``str`` form is a date written as ``YYYYMMDD``
            (for example the integer ``20200122``).

    Returns:
        The three positional pieces joined with ``'-'``.
    """
    digits = str(date)
    pieces = (digits[:4], digits[4:6], digits[6:])
    return '-'.join(pieces)

In [67]:
# Reduced indicator set, kept for reference only (disabled):
# indicator_columns = [
#     'C4_Restrictions on gatherings',
#     'H6_Facial Coverings',
#     'H7_Vaccination policy'
# ]
# max_values = [4, 4, 5]

# Policy indicator column -> number of `_Level_n` dummy columns derived from it
# (presumably the highest ordinal value the indicator can take in the OxCGRT
# codebook - verify against the codebook when adding indicators).
# Keeping name and level count side by side prevents the two lists below from
# drifting out of sync; insertion order defines the order of both lists.
_indicator_max_levels = {
    'C1_School closing': 3,
    'C2_Workplace closing': 3,
    'C3_Cancel public events': 2,
    'C4_Restrictions on gatherings': 4,
    'C5_Close public transport': 2,
    'C6_Stay at home requirements': 3,
    'C7_Restrictions on internal movement': 2,
    'C8_International travel controls': 4,
    'E1_Income support': 2,
    'E2_Debt/contract relief': 2,
    'H1_Public information campaigns': 2,
    'H2_Testing policy': 3,
    'H3_Contact tracing': 2,
    'H6_Facial Coverings': 4,
    'H7_Vaccination policy': 5,
    'H8_Protection of elderly people': 3,
}

indicator_columns = list(_indicator_max_levels.keys())
max_values = list(_indicator_max_levels.values())

## Data Preprocessing

### Import Data

In [68]:
# Download the latest OxCGRT policy-tracker export (network access required).
df = pd.read_csv('https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv')

# Keep US rows and only the columns used below. dropna() removes every row with
# a missing value in ANY kept column, including rows without a RegionName
# (presumably the national-level rows - verify against the source file).
df = df[df['CountryName'] == 'United States'][['RegionName', 'Date', 'ConfirmedCases'] + indicator_columns].dropna()

# Date is stored as a YYYYMMDD number in the CSV; turn it into a real datetime.
df['Date'] = pd.to_datetime(df['Date'].apply(int2date))

# Cumulative 0/1 dummies: '<indicator>_Level_n' is 1 when the indicator value is
# at least n. col[3:] strips the three-character code prefix (e.g. 'C1_').
# The cast to int64 is explicit instead of relying on replace({True: 1, False: 0}),
# whose implicit downcasting is deprecated in recent pandas, and all dummies are
# built first and attached in one concat to avoid fragmenting the frame with
# dozens of single-column inserts.
level_columns = {
    f'{col[3:]}_Level_{n}': (df[col] >= n).astype('int64')
    for col, val in zip(indicator_columns, max_values)
    for n in range(1, val + 1)
}

# Column order is unchanged: RegionName, Date, ConfirmedCases, then the dummies.
# The feature-engineering step relies on this layout.
df = pd.concat(
    [df.drop(columns=indicator_columns), pd.DataFrame(level_columns, index=df.index)],
    axis=1,
)
df

Unnamed: 0,RegionName,Date,ConfirmedCases,School closing_Level_1,School closing_Level_2,School closing_Level_3,Workplace closing_Level_1,Workplace closing_Level_2,Workplace closing_Level_3,Cancel public events_Level_1,...,Facial Coverings_Level_3,Facial Coverings_Level_4,Vaccination policy_Level_1,Vaccination policy_Level_2,Vaccination policy_Level_3,Vaccination policy_Level_4,Vaccination policy_Level_5,Protection of elderly people_Level_1,Protection of elderly people_Level_2,Protection of elderly people_Level_3
181209,Alaska,2020-01-22,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181210,Alaska,2020-01-23,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181211,Alaska,2020-01-24,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181212,Alaska,2020-01-25,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181213,Alaska,2020-01-26,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217846,Wyoming,2021-12-09,112731.0,1,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,0
217847,Wyoming,2021-12-10,112862.0,1,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,0
217848,Wyoming,2021-12-11,112862.0,1,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,0
217849,Wyoming,2021-12-12,112862.0,1,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,0


### Feature Engineering

In [69]:
moving_average_window = 7  # rows in the centred moving average of DailyCases (must be odd)
delta_t = 7                # rows between the two moving-average values compared by InfectionGrowth
head_trim = 20             # rows discarded at the start of every region once features are computed
tail_trim = 60             # rows discarded at the end of every region once features are computed

if moving_average_window % 2 == 0:
    raise ValueError('moving_average_window must be odd so the centred average has a middle row')


def _region_growth_features(df_region, moving_average_window, delta_t, head_trim, tail_trim):
    """Compute DailyCases, DailyCasesMA and InfectionGrowth for a single region.

    All window sizes and offsets are counted in ROWS, not calendar days; the
    rows are used in the order given (assumed chronological - they are not
    re-sorted here).

    Args:
        df_region: Rows of one region with at least the columns 'Date'
            (datetime) and 'ConfirmedCases' (cumulative count).
        moving_average_window: Odd number of rows averaged around each row.
        delta_t: Row offset between numerator and denominator of the growth ratio.
        head_trim: Number of leading rows dropped from the result.
        tail_trim: Number of trailing rows dropped from the result.

    Returns:
        A new DataFrame (original index labels kept) without 'ConfirmedCases'
        and with the three feature columns appended, or None when the region
        has too few rows to fit a single moving-average window.
    """
    half_window = moving_average_window // 2

    # Append the feature columns in a fixed order; assign() returns a new frame,
    # so every write below targets an object we own (no chained assignment).
    features = df_region.assign(InfectionGrowth=np.nan, DailyCases=np.nan, DailyCasesMA=np.nan)

    # New cases per day: non-negative increase of the cumulative count divided
    # by the number of days elapsed since the previous row.
    day_gap = features['Date'].diff().dt.days
    features['DailyCases'] = features['ConfirmedCases'].diff().clip(lower=0) / day_gap
    features = features.iloc[1:]  # the first row has no predecessor

    if len(features) < moving_average_window:
        return None

    # Centred moving average; 'valid' mode yields values only for rows that have
    # half_window neighbours on both sides, so those edge rows are dropped.
    kernel = np.ones(moving_average_window) / moving_average_window
    moving_average = np.convolve(features['DailyCases'], kernel, 'valid')
    features = features.iloc[half_window:len(features) - half_window].copy()
    features['DailyCasesMA'] = moving_average

    # Growth = log(MA delta_t rows ahead / MA now). The last delta_t rows have
    # no look-ahead value and are dropped.
    n_rows = max(len(features) - delta_t, 0)
    ma_values = features['DailyCasesMA'].to_numpy()
    # 0/0 and x/0 are expected while case counts are zero; the resulting
    # nan/inf are handled below, so the numpy floating-point warnings are muted.
    with np.errstate(divide='ignore', invalid='ignore'):
        growth = np.log(ma_values[delta_t:delta_t + n_rows] / ma_values[:n_rows])
    features = features.iloc[:n_rows].copy()
    features['InfectionGrowth'] = growth

    # Trim both ends, then turn infinities into NaN so a later dropna() can
    # remove them.
    features = features.iloc[head_trim:max(len(features) - tail_trim, 0)]
    features = features.replace([np.inf, -np.inf], np.nan)
    return features.drop(columns=['ConfirmedCases'])


region_names = df['RegionName'].drop_duplicates()
df_regions = []
for region_name in region_names:
    df_region = _region_growth_features(
        df[df['RegionName'] == region_name],
        moving_average_window,
        delta_t,
        head_trim,
        tail_trim,
    )
    if df_region is not None:
        df_regions.append(df_region)
df = pd.concat(df_regions)
df

Unnamed: 0,RegionName,Date,School closing_Level_1,School closing_Level_2,School closing_Level_3,Workplace closing_Level_1,Workplace closing_Level_2,Workplace closing_Level_3,Cancel public events_Level_1,Cancel public events_Level_2,...,Vaccination policy_Level_2,Vaccination policy_Level_3,Vaccination policy_Level_4,Vaccination policy_Level_5,Protection of elderly people_Level_1,Protection of elderly people_Level_2,Protection of elderly people_Level_3,InfectionGrowth,DailyCases,DailyCasesMA
181233,Alaska,2020-02-15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,,0.0,0.000000
181234,Alaska,2020-02-16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,,0.0,0.000000
181235,Alaska,2020-02-17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,,0.0,0.000000
181236,Alaska,2020-02-18,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,,0.0,0.000000
181237,Alaska,2020-02-19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217693,Wyoming,2021-07-09,0,0,0,0,0,0,1,0,...,1,1,1,1,1,0,0,1.509035,101.0,99.000000
217694,Wyoming,2021-07-10,0,0,0,0,0,0,1,0,...,1,1,1,1,1,0,0,1.672500,0.0,84.714286
217695,Wyoming,2021-07-11,0,0,0,0,0,0,1,0,...,1,1,1,1,1,0,0,1.685588,0.0,84.857143
217696,Wyoming,2021-07-12,0,0,0,0,0,0,1,0,...,1,1,1,1,1,0,0,1.618213,172.0,84.285714


In [70]:
# Modelling frame: the date is no longer needed, and rows with any missing
# value (e.g. an undefined InfectionGrowth) are removed.
df_model = df.drop(columns=['Date']).dropna()

# Persist the frame; the file name records the delta_t used for InfectionGrowth.
# The target directory is created when missing so a fresh checkout does not
# fail with an OSError on to_csv.
model_csv_path = f'../Data/Preprocessed/model_{delta_t}_days_delta.csv'
os.makedirs(os.path.dirname(model_csv_path), exist_ok=True)
df_model.to_csv(model_csv_path, index=False)
df_model

Unnamed: 0,RegionName,School closing_Level_1,School closing_Level_2,School closing_Level_3,Workplace closing_Level_1,Workplace closing_Level_2,Workplace closing_Level_3,Cancel public events_Level_1,Cancel public events_Level_2,Restrictions on gatherings_Level_1,...,Vaccination policy_Level_2,Vaccination policy_Level_3,Vaccination policy_Level_4,Vaccination policy_Level_5,Protection of elderly people_Level_1,Protection of elderly people_Level_2,Protection of elderly people_Level_3,InfectionGrowth,DailyCases,DailyCasesMA
181257,Alaska,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.762174,0.0,0.142857
181258,Alaska,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.828314,0.0,0.142857
181259,Alaska,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,4.976734,0.0,0.142857
181260,Alaska,1,0,0,1,0,0,1,0,0,...,0,0,0,0,1,1,0,4.997212,1.0,0.142857
181261,Alaska,1,0,0,1,0,0,1,0,0,...,0,0,0,0,1,1,0,3.504055,0.0,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217693,Wyoming,0,0,0,0,0,0,1,0,0,...,1,1,1,1,1,0,0,1.509035,101.0,99.000000
217694,Wyoming,0,0,0,0,0,0,1,0,0,...,1,1,1,1,1,0,0,1.672500,0.0,84.714286
217695,Wyoming,0,0,0,0,0,0,1,0,0,...,1,1,1,1,1,0,0,1.685588,0.0,84.857143
217696,Wyoming,0,0,0,0,0,0,1,0,0,...,1,1,1,1,1,0,0,1.618213,172.0,84.285714
