In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

## Merge the testing data with the Oxford (OxCGRT) database

In [None]:
# Load the testing data; `Date` is parsed to datetimes so it can later be used
# as a join key.
covid_tests = pd.read_csv(
    "../data_sources/daily-tests-per-thousand-people-smoothed-7-day.csv",
    parse_dates=['Date'],
    encoding="ISO-8859-1",
    dtype={"RegionName": str,
           "RegionCode": str},
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # "warn" keeps the same behaviour: malformed rows are skipped with a warning.
    on_bad_lines="warn",
)
# drop rows with null Code (they have no key to match on)
covid_tests = covid_tests[covid_tests.Code.notna()]
covid_tests.info()

In [None]:
# Load the Oxford (OxCGRT) data. The region columns are forced to `str` and
# `Date` is parsed to datetimes so it can later be used as a join key.
oxford = pd.read_csv(
    "../data_sources/OxCGRT_latest.csv",
    parse_dates=['Date'],
    encoding="ISO-8859-1",
    dtype={"RegionName": str,
           "RegionCode": str},
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # "warn" keeps the same behaviour: malformed rows are skipped with a warning.
    on_bad_lines="warn",
)
oxford.info()

We have to merge on two levels, country code and date, so let's index both dataframes accordingly.

In [None]:
# Key the testing data by (country code, date), the two levels used for the merge.
# NOTE: this rebinds `covid_tests`, so the cell cannot be re-run without reloading the data.
covid_tests = covid_tests.set_index(keys=['Code', 'Date'])
covid_tests.info()

In [None]:
# Key the Oxford data by (country code, date), the two levels used for the merge.
oxford = oxford.set_index(keys=['CountryCode', 'Date'])

In [None]:
# Left-join the testing figures onto the Oxford frame. The index levels of the
# tests frame are renamed to match Oxford's so the join aligns on both levels.
oxford_tests = (
    oxford
    .join(covid_tests.rename_axis(oxford.index.names), how='left')
    .drop(columns=['Entity', 'new_tests_per_thousand_7day_smoothed Annotations'])
    # `columns=` is required here: a bare mapping is applied to the row index,
    # which would silently leave the column name unchanged.
    .rename(columns={'new_tests_per_thousand_7day_smoothed': 'covid_tests'})
)
oxford_tests

Of course, this has lots of NaN values in the tests column, but we can find a way to deal with those later, when we use it in a predictor. For the time being, let's save it.

In [None]:
# Persist the merged frame alongside the source data files.
oxford_tests.to_csv(path_or_buf="../data_sources/OxCGRT_latest_with_tests.csv")

Now let's pack it all up in a function.

In [None]:
def _read_dated_csv(path):
    """Read a CSV that has a ``Date`` column, using the shared parsing options."""
    return pd.read_csv(
        path,
        parse_dates=['Date'],
        encoding="ISO-8859-1",
        dtype={"RegionName": str,
               "RegionCode": str},
        # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
        # 2.0. "warn" keeps the same behaviour: skip malformed rows and warn.
        on_bad_lines="warn",
    )


def add_test_data(oxford_path, tests_path):
    """Attach the testing series to the Oxford data.

    Both files are read as CSV and left-joined on (country code, date), so
    every Oxford row is kept and rows without a matching test figure get NaN.

    Parameters
    ----------
    oxford_path : str or path-like
        CSV with at least ``CountryCode`` and ``Date`` columns.
    tests_path : str or path-like
        CSV with ``Entity``, ``Code``, ``Date``,
        ``new_tests_per_thousand_7day_smoothed`` and
        ``new_tests_per_thousand_7day_smoothed Annotations`` columns.

    Returns
    -------
    pandas.DataFrame
        The Oxford rows with ``CountryCode`` and ``Date`` as ordinary columns
        plus a ``covid_tests`` column. ``Entity`` and the annotations column
        are dropped.
    """
    covid_tests = _read_dated_csv(tests_path)
    # drop rows with null Code: they have no key to match against CountryCode
    covid_tests = covid_tests[covid_tests.Code.notna()]
    covid_tests = covid_tests.set_index(['Code', 'Date'])

    oxford = _read_dated_csv(oxford_path).set_index(['CountryCode', 'Date'])

    oxford_tests = (
        oxford
        # Rename the tests index levels so the join aligns on both levels.
        .join(covid_tests.rename_axis(oxford.index.names), how='left')
        .drop(columns=['Entity', 'new_tests_per_thousand_7day_smoothed Annotations'])
        # `columns=` is required: a bare mapping would target the row index
        # and leave the column name unchanged.
        .rename(columns={'new_tests_per_thousand_7day_smoothed': 'covid_tests'})
    )
    return oxford_tests.reset_index()

In [None]:
# Rebuild the merged frame through the helper, reading both source files from disk.
oxford_tests = add_test_data(
    oxford_path="../data_sources/OxCGRT_latest.csv",
    tests_path="../data_sources/daily-tests-per-thousand-people-smoothed-7-day.csv",
)
oxford_tests