In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from nixtamalai.helpers import update_OxCGRT_tests, hampel

# Preprocess function

Pack the preprocessing workflow into a single function. The usual preprocessing steps are:

- Update data and merge with tests
- Add GeoID (`CountryName` + `'__'` + `RegionName`)
- Add NewCases 
- Handle missing data in NewCases
- Handle missing data in NPIs
- Handle missing data in Tests
- Fix outliers
- Return only relevant columns

In [None]:
# Step-by-step version of the preprocessing workflow. Every per-series
# operation is grouped by GeoID and uses transform()/ffill(), whose results
# keep the original row index, so values are written back by label and never
# by position.

# Columns holding the NPI levels
npi_cols = ['C1_School closing',
            'C2_Workplace closing',
            'C3_Cancel public events',
            'C4_Restrictions on gatherings',
            'C5_Close public transport',
            'C6_Stay at home requirements',
            'C7_Restrictions on internal movement',
            'C8_International travel controls',
            'H1_Public information campaigns',
            'H2_Testing policy',
            'H3_Contact tracing',
            'H6_Facial Coverings']
id_cols = ['CountryName',
           'CountryCode',
           'RegionName',
           'GeoID',
           'Date']
# The Hampel-filtered series is part of the result; otherwise the outlier
# step would be computed and then thrown away by the column selection below.
cases_col = ['NewCases', 'NewCasesHampel']

# get updated data merged with tests
df = update_OxCGRT_tests()
# Add GeoID
df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)

# Add NewCases, handling missing data: gaps in the cumulative counts are
# interpolated *before* differencing. Filling the differences with 0 first
# would leave nothing for an interpolation to act on.
# NOTE(review): diff() assumes rows are in date order within each GeoID - confirm.
confirmed = df.groupby('GeoID')['ConfirmedCases'].transform(
    lambda series: series.interpolate())
df['NewCases'] = confirmed.groupby(df['GeoID']).diff().fillna(0)

# Missing data in Tests
tests_columns = [c for c in df.columns if c.startswith('tests')]
for column in tests_columns:
    df[column] = (df.groupby('GeoID')[column]
                    .transform(lambda series: series.interpolate())
                    .fillna(0))

# Missing data in NPIs assuming they are the same as previous day
df[npi_cols] = df.groupby('GeoID')[npi_cols].ffill().fillna(0)

# Hampel filter (default values), one series per GeoID like every other step
df['NewCasesHampel'] = df.groupby('GeoID')['NewCases'].transform(
    lambda series: hampel(series))

df = df[id_cols + cases_col + npi_cols + tests_columns]
df

Let's merge this with OWD data

In [None]:
# Read the per-country OWD table, discard the 'Unnamed: 0' column, and
# attach it on CountryCode. how='left' keeps the rows of df that have no
# match in the OWD table.
owd = pd.read_csv("../data_sources/owd_by_country.csv")
owd = owd.drop(columns='Unnamed: 0')
df = pd.merge(df, owd, how='left', on='CountryCode')
df

Pack it all in a function.

In [None]:
def preprocess(k=7, threshold=3, merge_owd=True):
    """Run the whole preprocessing workflow and return the resulting frame.

    Steps (every per-series operation is computed independently per GeoID):

    1. Fetch the data with ``update_OxCGRT_tests()``.
    2. Add ``GeoID`` as ``CountryName + '__' + RegionName``.
    3. Add ``NewCases``: the row-to-row difference of ``ConfirmedCases``
       after linearly interpolating its gaps; values that are still
       undefined (e.g. the first row of a GeoID) become 0.
    4. Interpolate every column whose name starts with ``tests``; NaN that
       remain become 0.
    5. Forward-fill the NPI columns; NaN that remain become 0.
    6. Add ``NewCasesHampel``: the output of ``hampel`` applied to
       ``NewCases``.
    7. Keep only the id, cases, NPI and tests columns.
    8. Optionally left-merge the OWD per-country table on ``CountryCode``.

    Parameters
    ----------
    k, threshold
        Forwarded positionally to ``hampel(series, k, threshold)``.
        Presumably the window size and the outlier threshold of the filter -
        confirm against ``nixtamalai.helpers.hampel``.
    merge_owd : bool, default True
        When true, read ``../data_sources/owd_by_country.csv`` (relative to
        the current working directory), drop its ``'Unnamed: 0'`` column and
        left-merge it on ``CountryCode``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``CountryName, CountryCode, RegionName, GeoID,
        Date, NewCases, NewCasesHampel``, the NPI columns, the ``tests*``
        columns and, if ``merge_owd`` is true, the columns of the OWD table.
    """
    npi_cols = ['C1_School closing',
                'C2_Workplace closing',
                'C3_Cancel public events',
                'C4_Restrictions on gatherings',
                'C5_Close public transport',
                'C6_Stay at home requirements',
                'C7_Restrictions on internal movement',
                'C8_International travel controls',
                'H1_Public information campaigns',
                'H2_Testing policy',
                'H3_Contact tracing',
                'H6_Facial Coverings']
    id_cols = ['CountryName',
               'CountryCode',
               'RegionName',
               'GeoID',
               'Date']
    # The Hampel-filtered series must be listed here; otherwise the final
    # column selection discards it and k/threshold have no effect at all.
    cases_col = ['NewCases', 'NewCasesHampel']

    # get updated data merged with tests
    df = update_OxCGRT_tests()
    # Add GeoID
    df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)

    # transform()/ffill() return results carrying the original row index, so
    # every assignment below is aligned by label. Re-attaching results by
    # position (reset_index + join) is only correct when the frame happens to
    # be sorted by the grouping key with a 0..n-1 index.

    # Add NewCases, handling missing data: interpolate the cumulative counts
    # *before* differencing. Filling the differences with 0 first would leave
    # nothing for an interpolation to act on.
    # NOTE(review): diff() assumes rows are in date order within each GeoID - confirm.
    confirmed = df.groupby('GeoID')['ConfirmedCases'].transform(
        lambda series: series.interpolate())
    df['NewCases'] = confirmed.groupby(df['GeoID']).diff().fillna(0)

    # Missing data in Tests
    tests_columns = [c for c in df.columns if c.startswith('tests')]
    for column in tests_columns:
        df[column] = (df.groupby('GeoID')[column]
                        .transform(lambda series: series.interpolate())
                        .fillna(0))

    # Missing data in NPIs assuming they are the same as previous day
    df[npi_cols] = df.groupby('GeoID')[npi_cols].ffill().fillna(0)

    # Hampel filter, one series per GeoID like every other step, so that
    # regional and national series are never filtered as a single series.
    df['NewCasesHampel'] = df.groupby('GeoID')['NewCases'].transform(
        lambda series: hampel(series, k, threshold))

    df = df[id_cols + cases_col + npi_cols + tests_columns]
    if merge_owd:
        owd = pd.read_csv("../data_sources/owd_by_country.csv").drop('Unnamed: 0', axis=1)
        df = df.merge(owd, on='CountryCode', how='left')
    return df

In [None]:
# Run the packed pipeline, spelling out the defaults declared by preprocess.
df = preprocess(k=7, threshold=3, merge_owd=True)
df