In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('../data/uk_combined.csv')

# First Look

In [4]:
df.head(2)

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,M1_Wildcard,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,2020-01-31,2.0,2.0,,,,,0.029,0.029,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0,,0.0,,0.0,,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
1,2020-02-01,2.0,0.0,,,,,0.029,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0,,0.0,,0.0,,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0


How much data do we have?

In [5]:
df.shape

(472, 99)

What kind of features do we have?

In [6]:
df.columns

Index(['date', 'total_cases', 'new_cases', 'new_cases_smoothed',
       'total_deaths', 'new_deaths', 'new_deaths_smoothed',
       'total_cases_per_million', 'new_cases_per_million',
       'new_cases_smoothed_per_million', 'total_deaths_per_million',
       'new_deaths_per_million', 'new_deaths_smoothed_per_million',
       'reproduction_rate', 'icu_patients', 'icu_patients_per_million',
       'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
       'new_vaccinations_smoothed', 'total_vaccinations_per_hundred',
       'people_vaccinated_per_hundred

The description for these features are available in the codebooks for OWID and OxCGRT. Copying them below:

| column                                | description                                                                                                                                                                                                                                                                                                                                       |
|:--------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| date                                  | Date of observation                                                                                                                                                                                                                                                                                                                               |
| total_cases                           | Total confirmed cases of COVID-19                                                                                                                                                                                                                                                                                                                 |
| new_cases                             | New confirmed cases of COVID-19                                                                                                                                                                                                                                                                                                                   |
| new_cases_smoothed                    | New confirmed cases of COVID-19 (7-day smoothed)                                                                                                                                                                                                                                                                                                  |
| total_deaths                          | Total deaths attributed to COVID-19                                                                                                                                                                                                                                                                                                               |
| new_deaths                            | New deaths attributed to COVID-19                                                                                                                                                                                                                                                                                                                 |
| new_deaths_smoothed                   | New deaths attributed to COVID-19 (7-day smoothed)                                                                                                                                                                                                                                                                                                |
| total_cases_per_million               | Total confirmed cases of COVID-19 per 1,000,000 people                                                                                                                                                                                                                                                                                            |
| new_cases_per_million                 | New confirmed cases of COVID-19 per 1,000,000 people                                                                                                                                                                                                                                                                                              |
| new_cases_smoothed_per_million        | New confirmed cases of COVID-19 (7-day smoothed) per 1,000,000 people                                                                                                                                                                                                                                                                             |
| total_deaths_per_million              | Total deaths attributed to COVID-19 per 1,000,000 people                                                                                                                                                                                                                                                                                          |
| new_deaths_per_million                | New deaths attributed to COVID-19 per 1,000,000 people                                                                                                                                                                                                                                                                                            |
| new_deaths_smoothed_per_million       | New deaths attributed to COVID-19 (7-day smoothed) per 1,000,000 people                                                                                                                                                                                                                                                                           |
| reproduction_rate                     | Real-time estimate of the effective reproduction rate (R) of COVID-19. See https://github.com/crondonm/TrackingR/tree/main/Estimates-Database                                                                                                                                                                                                     |
| icu_patients                          | Number of COVID-19 patients in intensive care units (ICUs) on a given day                                                                                                                                                                                                                                                                         |
| icu_patients_per_million              | Number of COVID-19 patients in intensive care units (ICUs) on a given day per 1,000,000 people                                                                                                                                                                                                                                                    |
| hosp_patients                         | Number of COVID-19 patients in hospital on a given day                                                                                                                                                                                                                                                                                            |
| hosp_patients_per_million             | Number of COVID-19 patients in hospital on a given day per 1,000,000 people                                                                                                                                                                                                                                                                       |
| weekly_icu_admissions                 | Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week                                                                                                                                                                                                                                                         |
| weekly_icu_admissions_per_million     | Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week per 1,000,000 people                                                                                                                                                                                                                                    |
| weekly_hosp_admissions                | Number of COVID-19 patients newly admitted to hospitals in a given week                                                                                                                                                                                                                                                                           |
| weekly_hosp_admissions_per_million    | Number of COVID-19 patients newly admitted to hospitals in a given week per 1,000,000 people                                                                                                                                                                                                                                                      |
| total_tests                           | Total tests for COVID-19                                                                                                                                                                                                                                                                                                                          |
| new_tests                             | New tests for COVID-19 (only calculated for consecutive days)                                                                                                                                                                                                                                                                                     |
| total_tests_per_thousand              | Total tests for COVID-19 per 1,000 people                                                                                                                                                                                                                                                                                                         |
| new_tests_per_thousand                | New tests for COVID-19 per 1,000 people                                                                                                                                                                                                                                                                                                           |
| new_tests_smoothed                    | New tests for COVID-19 (7-day smoothed). For countries that don't report testing data on a daily basis, we assume that testing changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window                              |
| new_tests_smoothed_per_thousand       | New tests for COVID-19 (7-day smoothed) per 1,000 people                                                                                                                                                                                                                                                                                          |
| positive_rate                         | The share of COVID-19 tests that are positive, given as a rolling 7-day average (this is the inverse of tests_per_case)                                                                                                                                                                                                                           |
| tests_per_case                        | Tests conducted per new confirmed case of COVID-19, given as a rolling 7-day average (this is the inverse of positive_rate)                                                                                                                                                                                                                       |
| tests_units                           | Units used by the location to report its testing data                                                                                                                                                                                                                                                                                             |
| total_vaccinations                    | Total number of COVID-19 vaccination doses administered                                                                                                                                                                                                                                                                                           |
| people_vaccinated                     | Total number of people who received at least one vaccine dose                                                                                                                                                                                                                                                                                     |
| people_fully_vaccinated               | Total number of people who received all doses prescribed by the vaccination protocol                                                                                                                                                                                                                                                              |
| new_vaccinations                      | New COVID-19 vaccination doses administered (only calculated for consecutive days)                                                                                                                                                                                                                                                                |
| new_vaccinations_smoothed             | New COVID-19 vaccination doses administered (7-day smoothed). For countries that don't report vaccination data on a daily basis, we assume that vaccination changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window |
| total_vaccinations_per_hundred        | Total number of COVID-19 vaccination doses administered per 100 people in the total population                                                                                                                                                                                                                                                    |
| people_vaccinated_per_hundred         | Total number of people who received at least one vaccine dose per 100 people in the total population                                                                                                                                                                                                                                              |
| people_fully_vaccinated_per_hundred   | Total number of people who received all doses prescribed by the vaccination protocol per 100 people in the total population                                                                                                                                                                                                                       |
| new_vaccinations_smoothed_per_million | New COVID-19 vaccination doses administered (7-day smoothed) per 1,000,000 people in the total population                                                                                                                                                                                                                                         |
| stringency_index                      | Government Response Stringency Index: composite measure based on 9 response indicators including school closures, workplace closures, and travel bans, rescaled to a value from 0 to 100 (100 = strictest response)                                                                                                                               |
| population                            | Population in 2020                                                                                                                                                                                                                                                                                                                                |
| population_density                    | Number of people divided by land area, measured in square kilometers, most recent year available                                                                                                                                                                                                                                                  |
| median_age                            | Median age of the population, UN projection for 2020                                                                                                                                                                                                                                                                                              |
| aged_65_older                         | Share of the population that is 65 years and older, most recent year available                                                                                                                                                                                                                                                                    |
| aged_70_older                         | Share of the population that is 70 years and older in 2015                                                                                                                                                                                                                                                                                        |
| gdp_per_capita                        | Gross domestic product at purchasing power parity (constant 2011 international dollars), most recent year available                                                                                                                                                                                                                               |
| extreme_poverty                       | Share of the population living in extreme poverty, most recent year available since 2010                                                                                                                                                                                                                                                          |
| cardiovasc_death_rate                 | Death rate from cardiovascular disease in 2017 (annual number of deaths per 100,000 people)                                                                                                                                                                                                                                                       |
| diabetes_prevalence                   | Diabetes prevalence (% of population aged 20 to 79) in 2017                                                                                                                                                                                                                                                                                       |
| female_smokers                        | Share of women who smoke, most recent year available                                                                                                                                                                                                                                                                                              |
| male_smokers                          | Share of men who smoke, most recent year available                                                                                                                                                                                                                                                                                                |
| handwashing_facilities                | Share of the population with basic handwashing facilities on premises, most recent year available                                                                                                                                                                                                                                                 |
| hospital_beds_per_thousand            | Hospital beds per 1,000 people, most recent year available since 2010                                                                                                                                                                                                                                                                             |
| life_expectancy                       | Life expectancy at birth in 2019                                                                                                                                                                                                                                                                                                                  |
| human_development_index               | A composite index measuring average achievement in three basic dimensions of human development—a long and healthy life, knowledge and a decent standard of living. Values for 2019, imported from http://hdr.undp.org/en/indicators/137506                                                                                                        |


- C - containment and closure policies
- E - economic policies
- H - health system policies
- M - miscellaneous policies
Most indicators are recorded on an ordinal scale that represents the level of strictness of the policy. Four of the indicators (E3, E4, H4 and H5) are recorded as a US dollar value of fiscal spending.

Government coronavirus policies often vary by region within countries. We code the most stringent government policy that is in place in a country/territory, as represented by the highest ordinal value. Sometimes the most stringent policy in a country/territory will only apply to a small part of the population. If the most stringent policy is only present in a limited geographic area or sector (eg perhaps only one state has implemented policies at a high level), we use a binary flag variable to denote this limited scope. Ten of the indicators (C1-C7, H1, H6 and H8) have a flag for whether they are "targeted" to a specific geographical region (flag=0) or whether they are a "general" policy that is applied across the whole country/territory (flag=1).) E1 has a flag to describe whether income support is for just formal sector workers (flag=0) or whether it includes informal workers as well (flag=1). H7 has a flag to describe whether vaccine policy is funded at cost to the individual (flag=0) or by government (flag=1).

| ID | Name | Description | Measurement | Coding |
| --- | --- | --- | --- | --- |
| C1 | `C1_School closing` | Record closings of schools and universities | Ordinal scale | 0 - no measures <br/>1 - recommend closing or all schools open with alterations resulting in significant differences compared to non-Covid-19 operations <br/>2 - require closing (only some levels or categories, eg just high school, or just public schools) <br/>3 - require closing all levels <br/>Blank - no data |
| | `C1_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C2 | `C2_Workplace closing` | Record closings of workplaces | Ordinal scale | 0 - no measures <br/>1 - recommend closing (or recommend work from home) <br/>2 - require closing (or work from home) for some sectors or categories of workers <br/>3 - require closing (or work from home) for all-but-essential workplaces (eg grocery stores, doctors) <br/>Blank - no data |
| | `C2_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C3 | `C3_Cancel public events` | Record cancelling public events | Ordinal scale | 0 - no measures <br/>1 - recommend cancelling <br/>2 - require cancelling <br/>Blank - no data |
| | `C3_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C4 | `C4_Restrictions on gatherings` | Record limits on gatherings | Ordinal scale | 0 - no restrictions <br/>1 - restrictions on very large gatherings (the limit is above 1000 people) <br/>2 - restrictions on gatherings between 101-1000 people <br/>3 - restrictions on gatherings between 11-100 people <br/>4 - restrictions on gatherings of 10 people or less <br/>Blank - no data |
| | `C4_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C5 | `C5_Close public transport` | Record closing of public transport | Ordinal scale | 0 - no measures <br/>1 - recommend closing (or significantly reduce volume/route/means of transport available) <br/>2 - require closing (or prohibit most citizens from using it) <br/>Blank - no data |
| | `C5_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C6 | `C6_Stay at home requirements` | Record orders to "shelter-in-place" and otherwise confine to the home | Ordinal scale | 0 - no measures <br/>1 - recommend not leaving house <br/>2 - require not leaving house with exceptions for daily exercise, grocery shopping, and 'essential' trips <br/>3 - require not leaving house with minimal exceptions (eg allowed to leave once a week, or only one person can leave at a time, etc) <br/>Blank - no data |
| | `C6_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C7 | `C7_Restrictions on internal movement` | Record restrictions on internal movement between cities/regions | Ordinal scale | 0 - no measures <br/>1 - recommend not to travel between regions/cities <br/>2 - internal movement restrictions in place <br/>Blank - no data |
| | `C7_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |
| C8 | `C8_International travel controls` | Record restrictions on international travel <br/><br/>Note: this records policy for foreign travellers, not citizens | Ordinal scale | 0 - no restrictions <br/>1 - screening arrivals <br/>2 - quarantine arrivals from some or all regions <br/>3 - ban arrivals from some regions <br/>4 - ban on all regions or total border closure <br/>Blank - no data |

### Economic policies

| ID | Name | Description | Measurement | Coding |
| --- | --- | --- | --- | --- |
| E1 | `E1_Income support` <br/>(for households) | Record if the government is providing direct cash payments to people who lose their jobs or cannot work. <br/><br/>Note: only includes payments to firms if explicitly linked to payroll/salaries | Ordinal scale | 0 - no income support <br/>1 - government is replacing less than 50% of lost salary (or if a flat sum, it is less than 50% median salary) <br/>2 - government is replacing 50% or more of lost salary (or if a flat sum, it is greater than 50% median salary) <br/>Blank - no data |
| | `E1_Flag` | | Binary flag for sectoral scope | 0 - formal sector workers only or informal sector workers only <br/>1 - all workers
| E2 | `E2_Debt/contract relief` <br/>(for households) | Record if the government is freezing financial obligations for households (eg stopping loan repayments, preventing services like water from stopping, or banning evictions) | Ordinal scale | 0 - no debt/contract relief <br/>1 - narrow relief, specific to one kind of contract <br/>2 - broad debt/contract relief |
| E3 | `E3_Fiscal measures` | Announced economic stimulus spending <br/><br/>Note: only record amount additional to previously announced spending | USD | Record monetary value in USD of fiscal stimuli, includes any spending or tax cuts NOT included in E4, H4 or H5 <br/>0 - no new spending that day <br/>Blank - no data |
| E4 | `E4_International support` | Announced offers of Covid-19 related aid spending to other countries <br/><br/>Note: only record amount additional to previously announced spending | USD | Record monetary value in USD <br/>0 - no new spending that day <br/>Blank - no data |

### Health system policies

| ID | Name | Description | Measurement | Coding |
| --- | --- | --- | --- | --- |
| H1 | `H1_Public information campaigns` | Record presence of public info campaigns | Ordinal scale | 0 - no Covid-19 public information campaign <br/>1 - public officials urging caution about Covid-19 <br/>2- coordinated public information campaign (eg across traditional and social media) <br/>Blank - no data |
| | `H1_Flag` | | Binary flag for geographic scope |  0 - targeted <br/>1- general <br/>Blank - no data |
| H2 | `H2_Testing policy` | Record government policy on who has access to testing <br/><br/>Note: this records policies about testing for current infection (PCR tests) not testing for immunity (antibody test) | Ordinal scale | 0 - no testing policy <br/>1 - only those who both (a) have symptoms AND (b) meet specific criteria (eg key workers, admitted to hospital, came into contact with a known case, returned from overseas) <br/>2 - testing of anyone showing Covid-19 symptoms <br/>3 - open public testing (eg "drive through" testing available to asymptomatic people) <br/>Blank - no data |
| H3 | `H3_Contact tracing` | Record government policy on contact tracing after a positive diagnosis <br/><br/>Note: we are looking for policies that would identify all people potentially exposed to Covid-19; voluntary bluetooth apps are unlikely to achieve this | Ordinal scale | 0 - no contact tracing <br/>1 - limited contact tracing; not done for all cases <br/>2 - comprehensive contact tracing; done for all identified cases |
| H4 | `H4_Emergency investment in healthcare` | Announced short term spending on healthcare system, eg hospitals, masks, etc <br/><br/>Note: only record amount additional to previously announced spending | USD | Record monetary value in USD <br/>0 - no new spending that day <br/>Blank - no data |
| H5 | `H5_Investment in vaccines` | Announced public spending on Covid-19 vaccine development <br/><br/>Note: only record amount additional to previously announced spending | USD | Record monetary value in USD <br/>0 - no new spending that day <br/>Blank - no data |
| H6 | `H6_Facial Coverings` | Record policies on the use of facial coverings outside the home <br/> | Ordinal scale | 0 - No policy <br/>1 - Recommended <br/>2 - Required in some specified shared/public spaces outside the home with other people present, or some situations when social distancing not possible <br/>3 - Required in all shared/public spaces outside the home with other people present or all situations when social distancing not possible <br/>4 - Required outside the home at all times regardless of location or presence of other people |
| | `H6_Flag` | | Binary flag for geographic scope |  0 - targeted <br/>1- general <br/>Blank - no data |
| H7 | `H7_Vaccination Policy` | Record policies for vaccine delivery for different groups <br/> | Ordinal scale | 0 - No availability <br/>1 - Availability for ONE of following: key workers/ clinically vulnerable groups (non elderly) / elderly groups <br/>2 - Availability for TWO of following: key workers/ clinically vulnerable groups (non elderly) / elderly groups <br/>3 - Availability for ALL of following: key workers/ clinically vulnerable groups (non elderly) / elderly groups <br/>4 - Availability for all three plus partial additional availability (select broad groups/ages) <br/>5 - Universal availability | 
| | `H7_Flag` | | Binary flag for cost |  0 - At cost to individual (or funded by NGO, insurance, or partially government funded) <br/>1- No or minimal cost to individual (government funded or subsidised) <br/>Blank - no data |
| H8 | `H8_Protection of elderly people` | Record policies for protecting elderly people (as defined locally) in Long Term Care Facilities and/or the community and home setting | Ordinal scale | 0 - no measures <br/>1 - Recommended isolation, hygiene, and visitor restriction measures in LTCFs and/or elderly  people to stay at home <br/>2 - Narrow restrictions for isolation, hygiene in LTCFs, some limitations on external visitors and/or restrictions protecting elderly people at home <br/>3 - Extensive restrictions for isolation and hygiene in LTCFs, all non-essential external visitors prohibited, and/or all elderly people required to stay at home and not leave the home with minimal exceptions, and receive no external visitors <br/>Blank - no data | 
| | `H8_Flag` | | Binary flag for geographic scope | 0 - targeted <br/>1- general <br/>Blank - no data |


### Miscellaneous policies

| ID | Name | Description | Measurement | Coding |
| --- | --- | --- | --- | --- |
| M1 | `M1_Wildcard` | Record policy announcements that do not fit anywhere else | Free text notes field | Note unusual or interesting interventions that are worth flagging  |


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472 entries, 0 to 471
Data columns (total 99 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   date                                   472 non-null    object 
 1   total_cases                            441 non-null    float64
 2   new_cases                              441 non-null    float64
 3   new_cases_smoothed                     436 non-null    float64
 4   total_deaths                           406 non-null    float64
 5   new_deaths                             406 non-null    float64
 6   new_deaths_smoothed                    436 non-null    float64
 7   total_cases_per_million                441 non-null    float64
 8   new_cases_per_million                  441 non-null    float64
 9   new_cases_smoothed_per_million         436 non-null    float64
 10  total_deaths_per_million               406 non-null    float64
 11  new_de

Setting correct type for `date`

In [8]:
df['date'] = pd.to_datetime(df['date'])

Sorting based on the date and reseting index

In [9]:
df.sort_values(by='date', inplace=True)
df.reset_index(drop=True, inplace=True)

In [10]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_cases,441.0,1332177.0,1569738.0,2.0,239579.0,352453.0,2262739.0,4396096.0
new_cases,441.0,9968.472,13464.47,-4787.0,1031.0,4007.0,15579.0,68192.0
new_cases_smoothed,436.0,10064.27,13311.26,0.571,1017.928,4373.215,15801.14,59828.57
total_deaths,406.0,57400.85,36920.11,1.0,39510.0,42008.0,75444.5,127438.0
new_deaths,406.0,313.8867,372.8368,0.0,31.0,155.0,501.75,1826.0
new_deaths_smoothed,436.0,292.1032,334.7118,0.0,22.0,145.2145,454.8568,1253.0
total_cases_per_million,441.0,19623.74,23123.14,0.029,3529.137,5191.836,33331.45,64757.03
new_cases_per_million,441.0,146.8414,198.3394,-70.515,15.187,59.025,229.488,1004.507
new_cases_smoothed_per_million,436.0,148.2526,196.0826,0.008,14.9945,64.42,232.7597,881.309
total_deaths_per_million,406.0,845.5477,543.8545,0.015,582.005,618.802,1111.341,1877.235


# Dropping some date ranges

If we take a look at the full dataset, we will see that the initial 2-3 months have incomplete data (the disease hadnt progressed, there wasnt much testing being done).
The last week or so is also incomplete.
We shall take only the data from 31 Jan 2020 to 6 April 2021 (both ends inclusive) and drop the rest.

Lets check what the index values are for these dates

In [11]:
df[(df['date'] == '2020-01-01') | (df['date'] == '2020-01-30') | (df['date'] == '2021-04-07')]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,M1_Wildcard,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,2020-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,2020-01-30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,0.0,,0.0,,,5.56,5.56,9.52,9.52,5.21,5.21,5.95,5.95,0.0,0.0
462,2021-04-07,4381830.0,2797.0,3121.143,127171.0,45.0,30.857,64546.884,41.201,45.976,1873.302,0.663,0.455,0.73,440.0,6.481,,,,,,,1034088.0,129798698.0,1912.01,15.233,807357.0,11.893,0.004,258.7,tests performed,37899029.0,31807124.0,6091905.0,507926.0,319732.0,55.83,46.85,8.97,4710.0,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,2.0,0.0,3.0,0.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,3.0,2.0,1.0,2.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,0.0,3.0,1.0,4.0,1.0,2.0,1.0,,75.93,75.93,76.19,76.19,76.35,76.35,72.98,72.98,100.0,100.0


In [12]:
df.tail(1)

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,M1_Wildcard,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
471,2021-04-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,66.2,,76.19,,70.89,,66.73,,100.0


In [13]:
# Dropping 0-29 and 462-471
df.drop(labels=range(0, 30) , inplace=True)
df.drop(labels=range(462, 472) , inplace=True)

In [14]:
df.head(2)

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,M1_Wildcard,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
30,2020-01-31,2.0,2.0,,,,,0.029,0.029,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0,,0.0,,0.0,,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
31,2020-02-01,2.0,0.0,,,,,0.029,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0,,0.0,,0.0,,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0


In [15]:
df.tail(2)

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,M1_Wildcard,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
460,2021-04-05,4376629.0,2831.0,3547.571,127106.0,28.0,35.571,64470.27,41.702,52.258,1872.345,0.412,0.524,0.69,461.0,6.791,3239.0,47.712,,,,,741323.0,128000297.0,1885.518,10.92,843988.0,12.432,0.004,237.9,tests performed,37119083.0,31622367.0,5496716.0,105334.0,369827.0,54.68,46.58,8.1,5448.0,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,2.0,0.0,3.0,0.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,3.0,2.0,1.0,2.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,0.0,3.0,1.0,4.0,1.0,2.0,1.0,,75.93,75.93,76.19,76.19,76.35,76.35,72.98,72.98,100.0,100.0
461,2021-04-06,4379033.0,2404.0,3309.429,127126.0,20.0,30.571,64505.682,35.412,48.75,1872.639,0.295,0.45,0.71,453.0,6.673,3124.0,46.018,,,,,631846.0,128712320.0,1896.007,9.307,841159.0,12.391,0.004,254.2,tests performed,37391103.0,31707594.0,5683509.0,272020.0,336744.0,55.08,46.71,8.37,4960.0,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932,2.0,0.0,3.0,0.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,3.0,2.0,1.0,2.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,0.0,3.0,1.0,4.0,1.0,2.0,1.0,,75.93,75.93,76.19,76.19,76.35,76.35,72.98,72.98,100.0,100.0


In [16]:
# index got messed up, reset it
df.reset_index(drop=True, inplace=True)

# Separating Repetitive Data

Here we find those features which have the same value in all rows, excluding null values.

In [17]:
rep_idxs = df.nunique()[df.nunique() == 1].index.tolist()
for i in rep_idxs:
    vc = df[i].value_counts(dropna=False)
    if len(vc) > 1:
        print(vc)
        print('-'*20)

tests performed    372
NaN                 60
Name: tests_units, dtype: int64
--------------------
1.0    383
NaN     49
Name: C5_Flag, dtype: int64
--------------------
NaN    312
1.0    120
Name: H7_Flag, dtype: int64
--------------------


We drop `tests_units` as that is a string field and it is identical for this dataset.
We can fill in values for the other 2 later.

In [18]:
df.drop('tests_units', axis=1, inplace=True)

# Dropping Nulls

Lets look at all the nulls

In [19]:
null_vars = df.isna().sum(axis=0)
null_vars[null_vars >= 1].sort_values()

E3_Fiscal measures                         1
H5_Investment in vaccines                  1
new_cases_smoothed                         5
new_deaths_smoothed_per_million            5
H4_Emergency investment in healthcare      5
new_deaths_smoothed                        5
new_cases_smoothed_per_million             5
reproduction_rate                         32
total_deaths_per_million                  35
new_deaths_per_million                    35
total_deaths                              35
new_deaths                                35
C2_Flag                                   45
C3_Flag                                   46
C6_Flag                                   47
C1_Flag                                   47
C5_Flag                                   49
E1_Flag                                   49
C7_Flag                                   51
H8_Flag                                   52
C4_Flag                                   52
hosp_patients                             56
hosp_patie

Some features here like `H5_Investment in vaccines`  and `H4_Emergency investment in healthcare` are particularly usefull, as they are 0 on most instances (because the govt makes big investments on particular days when the govt has a meeting). It doesn't suffice to just know how much they invested, what we really need is how many hospital beds/ ventilators were added (we have data for this, but its a one time count made some time ago, its the same for all the instances) so the hospital investment thing is useless. Also investing in vaccine wont affect the disease, only the number of people who take the vaccination will.

Hence we shall drop these.

In [20]:
df.drop(['H5_Investment in vaccines', 'H4_Emergency investment in healthcare'], axis=1, inplace=True)

In [21]:
# adjusting nulls list
null_vars = df.isna().sum(axis=0)

In [22]:
all_null_features = df.isna().sum(axis=0)[df.isna().sum(axis=0) == len(df)].index.tolist()
all_null_features

['weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'handwashing_facilities',
 'M1_Wildcard']

In [23]:
df.drop(all_null_features, axis=1, inplace=True)

We will also drop  a few values that we cant impute because they are statistically computed from some other values ie they are an estimate based on the data (`reproduction_number`) and some others because they contain weekly data

In [24]:
df.drop(['weekly_hosp_admissions', 'weekly_hosp_admissions_per_million',
         'reproduction_rate'], axis=1, inplace=True)

# Imputing Values

In [25]:
null_vars[null_vars >= 1].sort_values()

E3_Fiscal measures                         1
new_deaths_smoothed_per_million            5
new_cases_smoothed_per_million             5
new_cases_smoothed                         5
new_deaths_smoothed                        5
reproduction_rate                         32
new_deaths                                35
total_deaths_per_million                  35
new_deaths_per_million                    35
total_deaths                              35
C2_Flag                                   45
C3_Flag                                   46
C6_Flag                                   47
C1_Flag                                   47
E1_Flag                                   49
C5_Flag                                   49
C7_Flag                                   51
C4_Flag                                   52
H8_Flag                                   52
hosp_patients                             56
hosp_patients_per_million                 56
new_tests                                 60
total_test

Some of the death related features have null values initially, we can fill these with 0 because the first death was registered on 6 Mar 2020

## Death Related Features

In [26]:
df[df['total_deaths'] == 1]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
35,2020-03-06,374.0,79.0,45.429,1.0,1.0,0.143,5.509,1.164,0.669,0.015,0.015,0.002,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0


In [27]:
df[df['total_deaths'].isnull()]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,2020-01-31,2.0,2.0,,,,,0.029,0.029,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,,0.0,,0.0,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
1,2020-02-01,2.0,0.0,,,,,0.029,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,,0.0,,0.0,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
2,2020-02-02,2.0,0.0,,,,,0.029,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
3,2020-02-03,8.0,6.0,,,,,0.118,0.088,,,,,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
4,2020-02-04,8.0,0.0,,,,,0.118,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
5,2020-02-05,9.0,1.0,1.286,,,0.0,0.133,0.015,0.019,,,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
6,2020-02-06,9.0,0.0,1.286,,,0.0,0.133,0.0,0.019,,,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
7,2020-02-07,9.0,0.0,1.0,,,0.0,0.133,0.0,0.015,,,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
8,2020-02-08,13.0,4.0,1.571,,,0.0,0.191,0.059,0.023,,,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
9,2020-02-09,14.0,1.0,1.714,,,0.0,0.206,0.015,0.025,,,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0


We can see above that a lot of the death related features during the initial months are null. But the data also tells us that the first death was reported on 6 Mar 2020. So it should be safe(and logical) to fill the related features with 0 till that point as far as this source of data is concerned

In [28]:
df.fillna(value = {'total_deaths': 0, 'new_deaths':0, 'new_deaths_smoothed':0,
                   'total_deaths_per_million':0, 'new_deaths_per_million':0,
                   'new_deaths_smoothed_per_million':0}, inplace=True)

## Flag Features

In [29]:
null_flags = [i for i in null_vars.index if i.endswith('Flag')]
df[null_flags].isnull().sum(axis=0)

C1_Flag     47
C2_Flag     45
C3_Flag     46
C4_Flag     52
C5_Flag     49
C6_Flag     47
C7_Flag     51
E1_Flag     49
H1_Flag      0
H6_Flag     88
H7_Flag    312
H8_Flag     52
dtype: int64

In [30]:
df[df[null_flags].isnull().any(axis=1)].head()

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,2020-01-31,2.0,2.0,,0.0,0.0,0.0,0.029,0.029,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,,0.0,,0.0,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
1,2020-02-01,2.0,0.0,,0.0,0.0,0.0,0.029,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,,0.0,,0.0,,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
2,2020-02-02,2.0,0.0,,0.0,0.0,0.0,0.029,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
3,2020-02-03,8.0,6.0,,0.0,0.0,0.0,0.118,0.088,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
4,2020-02-04,8.0,0.0,,0.0,0.0,0.0,0.118,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,,0.0,,0.0,,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0


Lets tackle `H7_Flag` first. Its relates to who will bear the cost of the vaccination. Since the vaccine only became available to the public on 8th Dec 2020 (according to the [news](https://www.independent.co.uk/news/health/covid-vaccine-uk-coronavirus-b1767802.html))  in the UK, all the people who may have gotten the vaccine before are healthcare workers or vaccine trial volunteers. Surely for these people the government or the pharma firm would have given the vaccine free of charge, so we can fill the null values with 1 (No or minimal cost to individual (government funded or subsidised)). Lets check the date for the last null entry:

In [31]:
df.loc[[df[df['H7_Flag'].isnull()].index.max()]]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
311,2020-12-07,1742528.0,14774.0,15541.714,61531.0,189.0,426.571,25668.443,217.63,228.938,906.387,2.784,6.284,1271.0,18.723,16412.0,241.758,215981.0,43279583.0,637.533,3.182,327126.0,4.819,0.048,21.0,,,,,,,,,,63.89,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,1.0,1.0,2.0,1.0,2.0,1.0,4.0,1.0,1.0,1.0,0.0,,2.0,0.0,2.0,2.0,1.0,2.0,5340000.0,0.0,2.0,1.0,2.0,1.0,3.0,1.0,0.0,,3.0,0.0,63.89,63.89,72.62,72.62,65.62,65.62,60.71,60.71,100.0,100.0


As expected, it is null till the day vaccinations began. So we can fill all the previous dates with national level specificity (ie 1)

In [32]:
df.fillna(value={'H7_Flag': 1}, inplace=True)

`H6_Flag` next. It relates to the region specificity of the mask policy. For the early days, the value of `H6_Facial Coverings` is 0, which means there was no policy. So we can set the flag to 1(general) for all the null values. First lets check the last null:

In [33]:
df.loc[[df[df['H6_Flag'].isnull()].index.max()]]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
87,2020-04-27,163630.0,4704.0,4624.286,24376.0,323.0,755.143,2410.364,69.293,68.118,359.073,4.758,11.124,2460.0,36.237,16662.0,245.441,31453.0,726553.0,10.703,0.463,27761.0,0.409,0.167,6.0,,,,,,,,,,79.63,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,3.0,1.0,3.0,1.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,0.0,2.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,,0.0,1.0,3.0,1.0,79.63,79.63,80.95,80.95,65.62,65.62,60.71,60.71,100.0,100.0


As expected the last null is in the early months of the disease, when the government was less cautious.

In [34]:
df.fillna(value={'H6_Flag': 1}, inplace=True)

We can carry out a similar argument for `H8_Flag` (relates to protection of the elderly) and `E1_Flag` (income support) and fill all the nulls with 1 (general policy).

In [35]:
df.loc[[df[df['H8_Flag'].isnull()].index.max()]]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
51,2020-03-22,10395.0,1388.0,1045.429,289.0,35.0,35.143,153.124,20.446,15.4,4.257,0.516,0.518,,,,,,,,,,,,,,,,,,,,,,62.04,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,3.0,0.0,3.0,1.0,2.0,1.0,0.0,,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,62.04,62.04,72.62,72.62,47.92,47.92,42.26,42.26,87.5,87.5


In [36]:
df.loc[[df[df['E1_Flag'].isnull()].index.max()]]

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
48,2020-03-19,6534.0,1066.0,677.714,164.0,46.0,22.143,96.25,15.703,9.983,2.416,0.678,0.326,,,,,,,,,,,,,,,,,,,,,,31.48,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,3.0,0.0,1.0,1.0,1.0,1.0,0.0,,0.0,,1.0,0.0,0.0,,0.0,0.0,,2.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,31.48,31.48,45.24,45.24,26.04,26.04,22.62,22.62,50.0,50.0


In [37]:
df.fillna(value={'H8_Flag': 1, 'E1_Flag':1}, inplace=True)

Now for all the flags starting with `C`. All these flags relate to the regionwise specificity of the various things relating to transportation, movement, gathering etc. Since there are only about 50 or so missing values for each of these, lets have a look at all of them:

In [38]:
c_null_flags = [i for i in null_flags if i.startswith('C')]
c_nulls = [i.split('_')[0] for i in c_null_flags]
c_null_vals = [i for i in df.columns if i.split('_')[0] in c_nulls]
df[df[c_null_flags].isnull().any(axis=1)][c_null_vals + ['date']]

Unnamed: 0,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,date
0,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-01-31
1,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-01
2,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-02
3,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-03
4,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-04
5,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-05
6,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-06
7,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-07
8,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-08
9,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,2020-02-09


We see that most of the missing values are in the initial part with the exception of `C6_Flag` which seems to be missing some values midway as well. All the restriction level values corresponding to each of the flags are null even though the flags themselves may be null. The last places where `C6_Flag` is null, the restriction levels are 0, since the data gives the highest restriction level in the region, we can assume that is level is generally applicable, so we shall fill that part with 1 (general) as well, just like the rest of the C flags which are null.

In [39]:
print(c_null_flags)

['C1_Flag', 'C2_Flag', 'C3_Flag', 'C4_Flag', 'C5_Flag', 'C6_Flag', 'C7_Flag']


In [40]:
df.fillna(value=dict(zip(c_null_flags, [1]*len(c_null_flags))), inplace=True)

Now that we are done with all the flags, lets have a look at the nulls again:

In [41]:
null_vars = df.isna().sum(axis=0)
null_vars[null_vars >= 1].sort_values()

E3_Fiscal measures                         1
new_cases_smoothed                         5
new_cases_smoothed_per_million             5
hosp_patients                             56
hosp_patients_per_million                 56
new_tests                                 60
total_tests                               60
total_tests_per_thousand                  60
new_tests_per_thousand                    60
icu_patients                              62
icu_patients_per_million                  62
tests_per_case                            67
positive_rate                             67
new_tests_smoothed                        67
new_tests_smoothed_per_thousand           67
new_vaccinations_smoothed                339
new_vaccinations_smoothed_per_million    339
total_vaccinations                       344
people_vaccinated                        344
people_fully_vaccinated                  344
total_vaccinations_per_hundred           344
people_vaccinated_per_hundred            344
people_ful

Filling in the only missing value in `E3_Fiscal measures` with 0

In [42]:
df.fillna(value={'E3_Fiscal measures': 0}, inplace=True)

Now `new_cases_smoothed` is null because the smoothing is over a 7 day window and there is not enough data within the window, we can assume that the new cases before the first report of cases were 0, and fill in the smoothed values accordingly

In [43]:
df.head(7)

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,2020-01-31,2.0,2.0,,0.0,0.0,0.0,0.029,0.029,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
1,2020-02-01,2.0,0.0,,0.0,0.0,0.0,0.029,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,8.33,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,8.33,8.33,9.52,9.52,13.02,13.02,14.88,14.88,0.0,0.0
2,2020-02-02,2.0,0.0,,0.0,0.0,0.0,0.029,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
3,2020-02-03,8.0,6.0,,0.0,0.0,0.0,0.118,0.088,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
4,2020-02-04,8.0,0.0,,0.0,0.0,0.0,0.118,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
5,2020-02-05,9.0,1.0,1.286,0.0,0.0,0.0,0.133,0.015,0.019,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
6,2020-02-06,9.0,0.0,1.286,0.0,0.0,0.0,0.133,0.0,0.019,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0


In [44]:
# filling nulls in the inital part of new_cases_smoothed
FILL_LEN = 5
df.loc[0:FILL_LEN-1, 'new_cases_smoothed'] = df['new_cases'].head(FILL_LEN).rolling(7, min_periods=1).sum()/7

In [45]:
# adjusting new_cases_smoothed_per_million accordingly
temp_df = df.loc[0:FILL_LEN-1]
df.loc[0:FILL_LEN-1,'new_cases_smoothed_per_million'] = temp_df['new_cases_smoothed'] / temp_df['population'] * 1e6

Now we know (from news reports) that UK started public vaccinations on 8 Dec 2020 and any vaccinations before that would have been on a very small scale (to vaccine trial participants and frontline health workers) So we can fill in 0 for all the vaccine related numbers till 8 Dec 2020. This will introduce fractional values, but that is unlikely to harm a model

In [46]:
# checking first valid indices
vac_vars = ['total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
       'new_vaccinations_smoothed', 'total_vaccinations_per_hundred',
       'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred',
       'new_vaccinations_smoothed_per_million']
for i in vac_vars:
    print(i, '- index:', df[i].first_valid_index())

total_vaccinations - index: 338
people_vaccinated - index: 338
people_fully_vaccinated - index: 338
new_vaccinations - index: 346
new_vaccinations_smoothed - index: 339
total_vaccinations_per_hundred - index: 338
people_vaccinated_per_hundred - index: 338
people_fully_vaccinated_per_hundred - index: 338
new_vaccinations_smoothed_per_million - index: 339


In [47]:
df.iloc[336:,:][['date'] + vac_vars].head(15)

Unnamed: 0,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million
336,2021-01-01,,,,,,,,,
337,2021-01-02,,,,,,,,,
338,2021-01-03,1402432.0,1380430.0,22002.0,,,2.07,2.03,0.03,
339,2021-01-04,,,,,182220.0,,,,2684.0
340,2021-01-05,,,,,182220.0,,,,2684.0
341,2021-01-06,,,,,182220.0,,,,2684.0
342,2021-01-07,,,,,182220.0,,,,2684.0
343,2021-01-08,,,,,182220.0,,,,2684.0
344,2021-01-09,,,,,182220.0,,,,2684.0
345,2021-01-10,2677971.0,2286572.0,391399.0,,182220.0,3.94,3.37,0.58,2684.0


Above, we see that the vaccine related data is null till 2 Jan 2021, then there is again about 6 days of nulls and the rest of the data is okay. We shall consider the total vaccinations till 7 Dec 2020 as 0 and then quadratically interpolate the values between 8 Dec 2020 and 9 Jan 2021 and then between 4 Jan 2021 and 9 Jan 2021. Below we fill in the values for `'total_vaccinations','new_vaccinations_smoothed', 'total_vaccinations_per_hundred'`

In [48]:
df.loc[df['date'] <= '2020-12-07', 'total_vaccinations'] = 0
df['total_vaccinations'].interpolate(inplace=True, method='quadratic')
df.loc[:,'total_vaccinations_per_hundred'] = df['total_vaccinations'] / df['population'] * 100

Now we can fill in `new_vaccinations, new_vaccinations_smoothed, new_vaccinations_smoothed_per_million`,  based on the difference accordingly.

In [49]:
df.loc[df['date'] <= '2020-12-07', 'new_vaccinations'] = 0

start_idx = df[df['date'] == '2020-12-08']['date'].index[0]
end_idx = df[df['date'] == '2021-01-10']['date'].index[0]
for i in range(start_idx, end_idx + 1):
    df.loc[i, 'new_vaccinations'] = df.loc[i, 'total_vaccinations'] - df.loc[i-1, 'total_vaccinations']

df.loc[df['date'] <= '2020-12-07', 'new_vaccinations_smoothed'] = 0
df.loc[df['date'] <= '2020-12-07', 'new_vaccinations_smoothed_per_million'] = 0

df.loc[start_idx:end_idx, 'new_vaccinations_smoothed'] = df.loc[start_idx:end_idx, 'new_vaccinations'].rolling(7, min_periods=1).sum()/7
df.loc[start_idx:end_idx,'new_vaccinations_smoothed_per_million'] = df.loc[start_idx:end_idx, 'new_vaccinations_smoothed'] / df.loc[start_idx:end_idx, 'population'] * 1e6

Doing something similar for `people_vaccinated` and then setting the difference of `total_vaccinations` - `people_vaccinated` as `people_fully_vaccinated`. We `people_fully_vaccinated` to be taken to be 0 till the first non null entry.

In [50]:
start_idx = df[df['date'] == '2020-12-08']['date'].index[0]
end_idx = df[df['date'] == '2020-12-30']['date'].index[0]

df.loc[df['date'] <= '2020-12-07', 'people_vaccinated'] = 0
df.loc[start_idx:end_idx, 'people_vaccinated'] = df.loc[start_idx:end_idx, 'total_vaccinations']
df['people_vaccinated'].interpolate(inplace=True, method='quadratic')

df.loc[df['date'] <= '2021-01-02', 'people_fully_vaccinated'] = 0
start_idx = df[df['date'] == '2021-01-04']['date'].index[0]
end_idx = df[df['date'] == '2021-01-09']['date'].index[0]
for i in range(start_idx, end_idx + 1):
    df.loc[i, 'people_fully_vaccinated'] = df.loc[i, 'total_vaccinations'] - df.loc[i, 'people_vaccinated']
    
df.loc[:,'people_vaccinated_per_hundred'] = df.loc[:,'people_vaccinated']/df.loc[:,'population']*100
df.loc[:,'people_fully_vaccinated_per_hundred'] = df.loc[:,'people_fully_vaccinated']/df.loc[:,'population']*100

Now for the testing related data. We know that there were atleast 2 tests carried out on the first day that cases were detected. So we will fill in 2 for `total_tests` that day, extrapolate the data exponentially and then fill in the rest of the testing related data.

In [51]:
df.loc[0, 'total_tests'] = 5
df.loc[0, 'new_tests'] = 5

df['total_tests'] = np.log(df['total_tests'])
df['total_tests'].interpolate(inplace=True, method='linear')
df['total_tests'] = np.exp(df['total_tests'])
df['total_tests_per_thousand'] = df['total_tests'] / df['population'] * 1e3

start_idx = 1
end_idx = df[df['date'] == '2020-03-30']['date'].index[0]
for i in range(start_idx, end_idx + 1):
    df.loc[i, 'new_tests'] = df.loc[i, 'total_tests'] - df.loc[i-1, 'total_tests']

start_idx = 0
end_idx = df[df['date'] == '2020-04-06']['date'].index[0]    
df.loc[start_idx:end_idx, 'new_tests_smoothed'] = df.loc[start_idx:end_idx, 'new_tests'].rolling(7, min_periods=1).sum()/7

df['new_tests_per_thousand'] = df['new_tests'] / df['population'] * 1e3
df['new_tests_smoothed_per_thousand'] = df['new_tests_smoothed'] / df['population'] * 1e3

In [52]:
df.head(30).tail(10)

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H6_Flag,H7_Vaccination policy,H7_Flag,H8_Protection of elderly people,H8_Flag,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
20,2020-02-20,22.0,2.0,0.714,0.0,0.0,0.0,0.324,0.029,0.011,0.0,0.0,0.0,,,,,24.879868,157.127785,0.002315,0.000366,15.7309,0.000232,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
21,2020-02-21,23.0,1.0,0.714,0.0,0.0,0.0,0.339,0.015,0.011,0.0,0.0,0.0,,,,,29.560531,186.688316,0.00275,0.000435,18.690363,0.000275,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
22,2020-02-22,23.0,0.0,0.714,0.0,0.0,0.0,0.339,0.0,0.011,0.0,0.0,0.0,,,,,35.121768,221.810084,0.003267,0.000517,22.20659,0.000327,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
23,2020-02-23,28.0,5.0,1.429,0.0,0.0,0.0,0.412,0.074,0.021,0.0,0.0,0.0,,,,,41.729245,263.539329,0.003882,0.000615,26.384327,0.000389,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
24,2020-02-24,30.0,2.0,1.571,0.0,0.0,0.0,0.442,0.029,0.023,0.0,0.0,0.0,,,,,49.579789,313.119118,0.004612,0.00073,31.348025,0.000462,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
25,2020-02-25,34.0,4.0,2.143,0.0,0.0,0.0,0.501,0.059,0.032,0.0,0.0,0.0,,,,,58.90726,372.026378,0.00548,0.000868,37.245544,0.000549,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
26,2020-02-26,37.0,3.0,2.429,0.0,0.0,0.0,0.545,0.044,0.036,0.0,0.0,0.0,,,,,69.989513,442.015891,0.006511,0.001031,44.252568,0.000652,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
27,2020-02-27,44.0,7.0,3.143,0.0,0.0,0.0,0.648,0.103,0.046,0.0,0.0,0.0,,,,,83.156676,525.172567,0.007736,0.001225,52.577826,0.000775,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
28,2020-02-28,56.0,12.0,4.714,0.0,0.0,0.0,0.825,0.177,0.069,0.0,0.0,0.0,,,,,98.800984,623.973551,0.009191,0.001455,62.469319,0.00092,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0
29,2020-02-29,61.0,5.0,5.429,0.0,0.0,0.0,0.899,0.074,0.08,0.0,0.0,0.0,,,,,117.388464,741.362015,0.010921,0.001729,74.221704,0.001093,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,2.54,81.32,0.932,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,11.11,11.11,14.29,14.29,14.58,14.58,16.67,16.67,0.0,0.0


Filling in `tests_per_case` which according to the variable description is a rolling 7 day average over the the `number of new cases / number of new tests`.

In [53]:
temp = df['new_tests']/df['new_cases']
df['tests_per_case'] = temp.rolling(7, min_periods=1).sum()/7
df['positive_rate'] = 1/df['tests_per_case']

## Hospital and ICU data

Both of these having missing values in the first 3 months or so, but they show a high correlation with the `new_deaths_smoothed`. We think data in this first data range is important because it contains the first wave of the pandemic and might help the model learn/predict such waves.

In [54]:
# for ICU patients
first_val = df['icu_patients'].first_valid_index()
ratio_mean = (df.loc[first_val:, 'icu_patients']/df.loc[first_val:, 'new_deaths_smoothed']).mean()
df.loc[:first_val-1, 'icu_patients'] = ratio_mean * df.loc[:first_val-1, 'new_deaths_smoothed']

# for hospital patients
first_val = df['hosp_patients'].first_valid_index()
ratio_mean = (df.loc[first_val:, 'hosp_patients']/df.loc[first_val:, 'new_deaths_smoothed']).mean()
df.loc[:first_val-1, 'hosp_patients'] = ratio_mean * df.loc[:first_val-1, 'new_deaths_smoothed']

# the per million values
df['icu_patients_per_million'] = df['icu_patients'] / df['population'] * 1e6
df['hosp_patients_per_million'] = df['hosp_patients'] / df['population'] * 1e6

# Notes

The below columns still have some nulls in them (all of them right at the start), so one either drop those columns or drop those rows.

In [55]:
null_vars = df.isna().sum(axis=0)
null_vars[null_vars >= 1].sort_values()

Series([], dtype: int64)

**Note** that a large amount of data from the time period 1 Jan 2020 to 30 Mar 2020 is extrapolated based on some assumptions, if you dont want to use this data then you can drop this date range.

# Uploading Data

In [56]:
df.to_csv('../data/uk_processed.csv', index=False)