In [207]:
# so do an xgboost or random forest and see which type of policy has the greatest impact
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_log_error

from datetime import datetime
from datetime import timedelta

from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings; warnings.simplefilter('ignore')

In [208]:
policy = pd.read_csv('policy_for_analysis.csv')
policy.drop(columns = ['Introduced by'], inplace =True)
policy.head()

Unnamed: 0,Date,Policy,Category,Deemed_sig_by_tracker
0,2020-01-10,Guidance published for health care providers o...,Infection prevention and control,1
1,2020-01-10,Guidance published for clinicians on the inves...,Health and social care settings,1
2,2020-01-10,Public Health England offers advice for travel...,Travel measures,0
3,2020-01-16,An interim recommendation is made to classify ...,Other,0
4,2020-01-21,Guidance published for primary care profession...,Health and social care settings,0


In [209]:
df = pd.read_csv('df_utla.csv')
df.drop(columns = ['Unnamed: 0'], inplace =True)
# drop my cruidly made Restrictions
df = df.drop(columns = 'Restrictions')

# daily per 100,000
df['case_rate_on_day'] = df['Daily_lab-confirmed_cases'] / (df['Population']/100000)

# add days since first infection
df['date'] = pd.to_datetime(df.date)
day_1 = min(df['date'].unique())
df['days_since_first'] = (df['date'] - day_1)
df['days_since_first'] = df['days_since_first'].astype(str)
df['days_since_first'] = df['days_since_first'].str.replace(' days', '')
df['days_since_first'] = df['days_since_first'].astype(int)

In [210]:
# want to display case rate for 2 weeks in the future as the current features will effect case rate 
# means I can't look at two most recent weeks as wont have case rates for it...
df['2_weeks_future_date'] = df['date'] + timedelta(days=14)
temp = df[['date', 'case_rate_on_day', 'Area_code']]
temp.rename(columns = {'case_rate_on_day':'case_rate_in_2_weeks', 'date':'date2'}, inplace = True)
df = pd.merge(df, temp, how='left', left_on=('2_weeks_future_date', 'Area_code'), right_on=('date2', 'Area_code'))

# df = shuffle(df)
# confirming that join was successful
# df[(df['date']== '2020-07-27') & (df['Area_name']=='Warwickshire')]
#df[(df['date']== '2020-08-10') & (df['Area_name']=='Warwickshire')]
# drop NA for case_rate_in_2_weeks Column
df = df[df['case_rate_in_2_weeks'].notna()]

In [211]:
# now need to add in policy info
# i think I will one hot encode to have a each unique date with policy totals based on category

# one hot encode
policy_encoded = pd.get_dummies(policy, columns = ['Category'], drop_first=True)
policy_encoded


# aggregate for date... i.e. each date will have a number of policies for each cat and num of sig policies for the day

policy_encoded = policy_encoded.groupby(by = 'Date').sum().reset_index()
policy_encoded

Unnamed: 0,Date,Deemed_sig_by_tracker,Category_Health and social care settings,Category_Infection prevention and control,Category_Informing the public,Category_Legal change,Category_Legal change.1,"Category_Monitoring, Testing",Category_Non-health and social care settings,Category_Non-health and social care settings.1,...,Category_SAGE papers,Category_Shielding measures,Category_Shielding measures.1,Category_Social distancing measures,Category_Social distancing measures.1,"Category_Testing, surveillance and contact tracing","Category_Testing, surveillance and contact tracing.1",Category_Travel measures,Category_Travel measures.1,Category_Vaccine
0,2020-01-10,2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2020-01-16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-01-21,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-01-22,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2020-01-28,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,2020-10-01,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
144,2020-10-08,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
145,2020-10-12,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
146,2020-10-14,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [212]:
# merge policy with df

df = df.drop(columns = ['date2'])
policy_encoded['Date'] = pd.to_datetime(policy_encoded['Date'])
df = pd.merge(df, policy_encoded, how='left', left_on=('date'), right_on=('Date'))
df = df.drop(columns = ['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33503 entries, 0 to 33502
Data columns (total 34 columns):
 #   Column                                               Non-Null Count  Dtype         
---  ------                                               --------------  -----         
 0   Area_name                                            33503 non-null  object        
 1   Area_code                                            33503 non-null  object        
 2   Area_type                                            33503 non-null  object        
 3   date                                                 33503 non-null  datetime64[ns]
 4   Daily_lab-confirmed_cases                            33503 non-null  int64         
 5   ConfirmedCases                                       33503 non-null  int64         
 6   Cumulative_lab-confirmed_cases_rate                  33503 non-null  float64       
 7   id                                                   33503 non-null  int64         
 

In [213]:
# fill na with zeros if have prefix Category columns and Deemed_sig_by_tracker

Cat_columns = df.filter(regex='^Category_',axis=1)
Cat_columns = Cat_columns.fillna(0)
df.filter(regex='^Category_',axis=1).fillna(0)

df.fillna(0, inplace=True)
df.isna().sum()

Area_name                                              0
Area_code                                              0
Area_type                                              0
date                                                   0
Daily_lab-confirmed_cases                              0
ConfirmedCases                                         0
Cumulative_lab-confirmed_cases_rate                    0
id                                                     0
Population                                             0
case_rate_on_day                                       0
days_since_first                                       0
2_weeks_future_date                                    0
case_rate_in_2_weeks                                   0
Deemed_sig_by_tracker                                  0
Category_Health and social care settings               0
Category_Infection prevention and control              0
Category_Informing the public                          0
Category_Legal change          

In [214]:
df

Unnamed: 0,Area_name,Area_code,Area_type,date,Daily_lab-confirmed_cases,ConfirmedCases,Cumulative_lab-confirmed_cases_rate,id,Population,case_rate_on_day,...,Category_SAGE papers,Category_Shielding measures,Category_Shielding measures.1,Category_Social distancing measures,Category_Social distancing measures.1,"Category_Testing, surveillance and contact tracing","Category_Testing, surveillance and contact tracing.1",Category_Travel measures,Category_Travel measures.1,Category_Vaccine
0,York,E06000014,utla,2020-01-30,1,1,0.5,112403,210618.0,0.474793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,York,E06000014,utla,2020-01-31,0,1,0.5,112402,210618.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,York,E06000014,utla,2020-02-01,0,1,0.5,112401,210618.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,York,E06000014,utla,2020-02-02,0,1,0.5,112400,210618.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,York,E06000014,utla,2020-02-03,0,1,0.5,112399,210618.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33498,Wakefield,E08000036,utla,2020-10-14,163,4698,1348.8,108308,348312.0,46.797124,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
33499,Gloucestershire,E10000013,utla,2020-10-14,64,3217,505.0,87103,637070.0,10.045992,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
33500,Oldham,E08000004,utla,2020-10-14,192,6410,2703.4,98590,237110.0,80.975075,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
33501,Norfolk,E10000020,utla,2020-10-14,78,4611,508.0,96215,907760.0,8.592580,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [215]:
df.to_csv('cases_analysis.csv')

In [216]:
# one hot encode Area_name
df_encoded = pd.get_dummies(df, prefix = 'utla', columns =  ['Area_name'], drop_first=True)

In [217]:
df_encoded

Unnamed: 0,Area_code,Area_type,date,Daily_lab-confirmed_cases,ConfirmedCases,Cumulative_lab-confirmed_cases_rate,id,Population,case_rate_on_day,days_since_first,...,utla_West Sussex,utla_Westminster,utla_Wigan,utla_Wiltshire,utla_Windsor and Maidenhead,utla_Wirral,utla_Wokingham,utla_Wolverhampton,utla_Worcestershire,utla_York
0,E06000014,utla,2020-01-30,1,1,0.5,112403,210618.0,0.474793,0,...,0,0,0,0,0,0,0,0,0,1
1,E06000014,utla,2020-01-31,0,1,0.5,112402,210618.0,0.000000,1,...,0,0,0,0,0,0,0,0,0,1
2,E06000014,utla,2020-02-01,0,1,0.5,112401,210618.0,0.000000,2,...,0,0,0,0,0,0,0,0,0,1
3,E06000014,utla,2020-02-02,0,1,0.5,112400,210618.0,0.000000,3,...,0,0,0,0,0,0,0,0,0,1
4,E06000014,utla,2020-02-03,0,1,0.5,112399,210618.0,0.000000,4,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33498,E08000036,utla,2020-10-14,163,4698,1348.8,108308,348312.0,46.797124,258,...,0,0,0,0,0,0,0,0,0,0
33499,E10000013,utla,2020-10-14,64,3217,505.0,87103,637070.0,10.045992,258,...,0,0,0,0,0,0,0,0,0,0
33500,E08000004,utla,2020-10-14,192,6410,2703.4,98590,237110.0,80.975075,258,...,0,0,0,0,0,0,0,0,0,0
33501,E10000020,utla,2020-10-14,78,4611,508.0,96215,907760.0,8.592580,258,...,0,0,0,0,0,0,0,0,0,0


In [218]:
df_encoded.groupby('date')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014F08EF88C8>

In [219]:
#df = pd.concat([df,pd.get_dummies(df['Area_name'], prefix='ps')],axis=1)

In [220]:
df = df.set_index(['date'])
df

Unnamed: 0_level_0,Area_name,Area_code,Area_type,Daily_lab-confirmed_cases,ConfirmedCases,Cumulative_lab-confirmed_cases_rate,id,Population,case_rate_on_day,days_since_first,...,Category_SAGE papers,Category_Shielding measures,Category_Shielding measures,Category_Social distancing measures,Category_Social distancing measures,"Category_Testing, surveillance and contact tracing","Category_Testing, surveillance and contact tracing",Category_Travel measures,Category_Travel measures,Category_Vaccine
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-30,York,E06000014,utla,1,1,0.5,112403,210618.0,0.474793,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-31,York,E06000014,utla,0,1,0.5,112402,210618.0,0.000000,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-01,York,E06000014,utla,0,1,0.5,112401,210618.0,0.000000,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-02,York,E06000014,utla,0,1,0.5,112400,210618.0,0.000000,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-03,York,E06000014,utla,0,1,0.5,112399,210618.0,0.000000,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-14,Wakefield,E08000036,utla,163,4698,1348.8,108308,348312.0,46.797124,258,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-14,Gloucestershire,E10000013,utla,64,3217,505.0,87103,637070.0,10.045992,258,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-14,Oldham,E08000004,utla,192,6410,2703.4,98590,237110.0,80.975075,258,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-14,Norfolk,E10000020,utla,78,4611,508.0,96215,907760.0,8.592580,258,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [221]:
def create_time_features(df):
    """
    Creates time series features from datetime index
    """
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X


In [222]:
create_time_features(df)

Unnamed: 0_level_0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-30,0,3,1,1,2020,30,30,5
2020-01-31,0,4,1,1,2020,31,31,5
2020-02-01,0,5,1,2,2020,32,1,5
2020-02-02,0,6,1,2,2020,33,2,5
2020-02-03,0,0,1,2,2020,34,3,6
...,...,...,...,...,...,...,...,...
2020-10-14,0,2,4,10,2020,288,14,42
2020-10-14,0,2,4,10,2020,288,14,42
2020-10-14,0,2,4,10,2020,288,14,42
2020-10-14,0,2,4,10,2020,288,14,42


In [223]:
df= pd.get_dummies(df, prefix = 'utla', columns =  ['Area_name'], drop_first=True)


In [228]:
df.drop("date", axis=1, inplace=True) #### maybe maybe not
df = df.reset_index()
df.head()

Unnamed: 0,date,Area_code,Area_type,Daily_lab-confirmed_cases,ConfirmedCases,Cumulative_lab-confirmed_cases_rate,id,Population,case_rate_on_day,days_since_first,...,utla_West Sussex,utla_Westminster,utla_Wigan,utla_Wiltshire,utla_Windsor and Maidenhead,utla_Wirral,utla_Wokingham,utla_Wolverhampton,utla_Worcestershire,utla_York
0,2020-01-30,E06000014,utla,1,1,0.5,112403,210618.0,0.474793,0,...,0,0,0,0,0,0,0,0,0,1
1,2020-01-31,E06000014,utla,0,1,0.5,112402,210618.0,0.0,1,...,0,0,0,0,0,0,0,0,0,1
2,2020-02-01,E06000014,utla,0,1,0.5,112401,210618.0,0.0,2,...,0,0,0,0,0,0,0,0,0,1
3,2020-02-02,E06000014,utla,0,1,0.5,112400,210618.0,0.0,3,...,0,0,0,0,0,0,0,0,0,1
4,2020-02-03,E06000014,utla,0,1,0.5,112399,210618.0,0.0,4,...,0,0,0,0,0,0,0,0,0,1


In [229]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33503 entries, 0 to 33502
Data columns (total 189 columns):
 #   Column                                               Dtype         
---  ------                                               -----         
 0   date                                                 datetime64[ns]
 1   Area_code                                            object        
 2   Area_type                                            object        
 3   Daily_lab-confirmed_cases                            int64         
 4   ConfirmedCases                                       int64         
 5   Cumulative_lab-confirmed_cases_rate                  float64       
 6   id                                                   int64         
 7   Population                                           float64       
 8   case_rate_on_day                                     float64       
 9   days_since_first                                     int32         
 10  2_weeks_f

In [240]:
df_main = df.copy()
# drop all non usable columns
df_main = df_main.loc[:,~df_main.columns.duplicated()]
df_main = df_main.drop(columns = ['date', 'Area_code','Area_type','id','2_weeks_future_date'])
df_main

Unnamed: 0,Daily_lab-confirmed_cases,ConfirmedCases,Cumulative_lab-confirmed_cases_rate,Population,case_rate_on_day,days_since_first,case_rate_in_2_weeks,Deemed_sig_by_tracker,Category_Health and social care settings,Category_Infection prevention and control,...,utla_West Sussex,utla_Westminster,utla_Wigan,utla_Wiltshire,utla_Windsor and Maidenhead,utla_Wirral,utla_Wokingham,utla_Wolverhampton,utla_Worcestershire,utla_York
0,1,1,0.5,210618.0,0.474793,0,0.000000,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0.5,210618.0,0.000000,1,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0.5,210618.0,0.000000,2,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0.5,210618.0,0.000000,3,0.000000,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,0.5,210618.0,0.000000,4,0.000000,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33498,163,4698,1348.8,348312.0,46.797124,258,0.287099,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
33499,64,3217,505.0,637070.0,10.045992,258,0.000000,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
33500,192,6410,2703.4,237110.0,80.975075,258,0.421745,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
33501,78,4611,508.0,907760.0,8.592580,258,0.220323,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#Check the datatypes, they need to all be int, float, or bool
x_train.dtypes.unique()

# there is an object .... needs to be removed

In [241]:
from sklearn.model_selection import train_test_split
# first shuffle 
df_main = shuffle(df_main)

# then train test split 
features = df_main.drop('case_rate_in_2_weeks', axis=1)
labels = df_main['case_rate_in_2_weeks']


# model is built using x_train and y_train, will be assesed wirh X_test, y_test
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

#this splits into validation and test
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)


# then do cross validation with train data 

In [242]:
#test to see if worked
for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(labels), 2))
#so shows of data set train = 60%, validation is 20% and test is 20%

0.6
0.2
0.2


array([dtype('int64'), dtype('float64'), dtype('int32'), dtype('uint8')],
      dtype=object)

In [237]:
# xgb model
reg = xgb.XGBRegressor(n_estimators=1000)

reg.fit(X_train, y_train, verbose=True)

NameError: name 'xgb' is not defined

In [None]:
plot = plot_importance(reg, height=0.9, max_num_features=20)

In [None]:
# can use the days case rate as a feature to predict in two weeks, maybe even case rate from several days prior


In [None]:
# don't need to feature scale with decsion tree models

In [None]:
#  could be good to look at percent change instead of daily rate 
# y_train = train.groupby(["Country/Region"]).Fatalities.pct_change(periods=1)