This notebook merges several sources of data (in particular, the Johns Hopkins dataset for confirmed cases and deaths, the OxCGRT dataset for some additional NPIs, and our own NPI data) and consolidates them into a single merged data file.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
### Initial imports
import logging
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as T
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "ticks" style for plots.
sns.set_style("ticks")

# Configure logging at INFO level and create the logger used by this notebook.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

import warnings
# Suppress FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)

from epimodel.preprocessing.data_merger import _merge_data


%matplotlib inline

In [None]:
# Regions to include in the merged dataset, one tuple per region:
#   (display name, epidemicforecasting.org code, OxCGRT code)
# NOTE(review): the original note here read "+ lithuania"; Lithuania ("LT")
# is listed below -- confirm whether the note is still needed.
region_info = [
    ("Andorra", "AD", "AND"),
    ("Austria", "AT", "AUT"),
    ("Albania", "AL", "ALB"),
    ("Bosnia and Herzegovina", "BA", "BIH"),
    ("Belgium", "BE", "BEL"),
    ("Bulgaria", "BG", "BGR"),
    ("Switzerland", "CH", "CHE"),
    ("Czech Republic", "CZ", "CZE"),
    ("Germany", "DE", "DEU"),
    ("Denmark", "DK", "DNK"),
    ("Estonia", "EE", "EST"),
    ("Spain", "ES", "ESP"),
    ("Finland", "FI", "FIN"),
    ("France", "FR", "FRA"),
    ("United Kingdom", "GB", "GBR"),
    ("Georgia", "GE", "GEO"),
    ("Greece", "GR", "GRC"),
    ("Croatia", "HR", "HRV"),
    ("Hungary", "HU", "HUN"),
    ("Ireland", "IE", "IRL"),
    ("Israel", "IL", "ISR"),
    ("Iceland", "IS", "ISL"),
    ("Italy", "IT", "ITA"),
    ("Lithuania", "LT", "LTU"),
    ("Latvia", "LV", "LVA"),
    ("Malta", "MT", "MLT"),
    ("Morocco", "MA", "MAR"),
    ("Mexico", "MX", "MEX"),
    ("Malaysia", "MY", "MYS"),
    ("Netherlands", "NL", "NLD"),
    ("Norway", "NO", "NOR"),
    ("New Zealand", "NZ", "NZL"),
    ("Poland", "PL", "POL"),
    ("Portugal", "PT", "PRT"),
    ("Romania", "RO", "ROU"),
    ("Serbia", "RS", "SRB"),
    ("Sweden", "SE", "SWE"),
    ("Singapore", "SG", "SGP"),
    ("Slovenia", "SI", "SVN"),
    ("Slovakia", "SK", "SVK"),
    ("South Africa", "ZA", "ZAF"),
]

# Sort in place by display name so the three parallel lists below share one
# alphabetical ordering.
region_info.sort(key=lambda region: region[0])
region_names = [name for name, _, _ in region_info]
regions_epi = [epi_code for _, epi_code, _ in region_info]
regions_oxcgrt = [oxcgrt_code for _, _, oxcgrt_code in region_info]

# OxCGRT columns handed to the merger. The integer in each filter entry below
# appears to be a position in this list -- TODO confirm against `_merge_data`.
oxcgrt_cm_cols = [
    "H2_Testing policy",                     # 0
    "C8_International travel controls",      # 1
    "C5_Close public transport",             # 2
    "C5_Flag",                               # 3
    "C7_Restrictions on internal movement",  # 4
    "C7_Flag",                               # 5
    "H1_Public information campaigns",       # 6
    "H1_Flag",                               # 7
]

# One entry per OxCGRT-derived feature:
#   (feature name, [(column position, [listed values]), ...])
# NOTE(review): presumably a feature counts as active when every listed
# column takes one of its listed values -- verify in `_merge_data`.
oxcgrt_filter = [
    ("Symptomatic Testing", [(0, [2, 3])]),
    ("Travel Screen/Quarantine", [(1, [1, 2, 3, 4])]),
    ("Travel Bans", [(1, [3, 4])]),
    ("Public Transport Limited", [(2, [1, 2]), (3, [1])]),
    ("Internal Movement Limited", [(4, [1, 2]), (5, [1])]),
    ("Public Information Campaigns", [(6, [1, 2]), (7, [1])]),
]

# Features taken from our own NPI data. Every name currently maps to itself,
# so the dict is built from a single list of names (insertion order kept).
epifor_features = {
    feature: feature
    for feature in [
        "Mask Wearing",
        "Gatherings <1000",
        "Gatherings <100",
        "Gatherings <10",
        "Some Businesses Suspended",
        "Most Businesses Suspended",
        "School Closure",
        "University Closure",
        "Stay Home Order",
    ]
}


# Feature names passed to the merger as `final_features`, in this order.
final_features = [
    "Mask Wearing",
    "Symptomatic Testing",
    "Gatherings <1000",
    "Gatherings <100",
    "Gatherings <10",
    "Some Businesses Suspended",
    "Most Businesses Suspended",
    "School Closure",
    "University Closure",
    "Stay Home Order",
    "Travel Screen/Quarantine",
    "Travel Bans",
    "Public Transport Limited",
    "Internal Movement Limited",
    "Public Information Campaigns",
]

In [None]:
# Run the merge with the configuration defined above.
# NOTE(review): `_merge_data` comes from epimodel.preprocessing.data_merger;
# the file names are presumably resolved relative to "../data" -- confirm.
_merge_data(
    "../data",
    region_info,
    oxcgrt_filter,
    oxcgrt_cm_cols,
    epifor_features,
    final_features,
    johnhop_fname="john-hop_updated.csv",
    oxcgrt_fname="OxCGRT_latest_2810.csv",
    output_name="data_final_0111.csv",
    episet_fname="double_entry_0111.csv",
    start_date="2020-01-22",
)