In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm

### Read the data

In [2]:
daily_confirmes_cases_url = \
    "https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
daily_testing_url = \
    "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-all-observations.csv"

daily_confirmed_cases = pd.read_csv(daily_confirmes_cases_url).drop(columns=["Lat", "Long"]).melt(
    id_vars=["Province/State", "Country/Region"],
    var_name="Date",
    value_name="Confirmed cases"
)

daily_confirmed_cases["Date"] = pd.to_datetime(daily_confirmed_cases.Date)
daily_confirmed_cases.loc[daily_confirmed_cases["Province/State"] == "Hong Kong", "Country/Region"] = "Hong Kong"
daily_confirmed_cases = daily_confirmed_cases.groupby(
                            ["Country/Region", "Date"]
                        ).sum().reset_index()

daily_testing = pd.read_csv(daily_testing_url)
daily_testing["Date"] = pd.to_datetime(daily_testing.Date)
daily_testing = daily_testing.join(
    pd.DataFrame(np.array(daily_testing.Entity.str.split(" - ").to_list()), columns=["Country/Region", "Type"])
)

cases_and_testing = daily_confirmed_cases.merge(
    daily_testing,
    on=["Date", "Country/Region"]
)
cases_and_testing = cases_and_testing.sort_values(by=["Country/Region", "Date"])

cases_and_testing["Daily confirmed cases"] = 0
for country_region in cases_and_testing["Country/Region"].unique():
    cases_and_testing.loc[cases_and_testing["Country/Region"] == country_region, "Daily confirmed cases"] = \
        cases_and_testing.loc[cases_and_testing["Country/Region"] == country_region, "Confirmed cases"].diff()
    # Gaussian smoothing
    cases_and_testing.loc[cases_and_testing["Country/Region"] == country_region, "Daily confirmed cases"].rolling(
        7, win_type='gaussian').mean(std=3).dropna().round()

tests_per_confirmed_case = pd.read_csv("../data/number-of-covid-19-tests-per-confirmed-case.csv")
tests_per_confirmed_case.Date = pd.to_datetime(tests_per_confirmed_case.Date)
tests_per_confirmed_case["Country/Region"] = tests_per_confirmed_case["Entity"]

cases_and_testing = cases_and_testing.merge(tests_per_confirmed_case[
    ["Country/Region", "Date", "Tests per confirmed case – total (tests per confirmed case)"]],
                        on=["Country/Region", "Date"])

global_population = pd.read_csv("../data/global_population_numbers.csv", sep=";")
global_population["Population"] = global_population["Most Recent Value (Thousands)"].str.replace(",", "").astype(float)*1000
global_population["Country/Region"] = global_population["Country"]

cases_and_testing = cases_and_testing.merge(global_population[["Country/Region", "Population"]], on="Country/Region")

cases_and_testing["Daily confirmed cases per million"] = \
    (cases_and_testing["Daily confirmed cases"] / cases_and_testing["Population"] * 1000000).fillna(0)
    
cases_and_testing.head(5)

Unnamed: 0,Country/Region,Date,Confirmed cases,Entity,ISO code,Source URL,Source label,Notes,Cumulative total,Daily change in cumulative total,Cumulative total per thousand,Daily change in cumulative total per thousand,7-day smoothed daily change,7-day smoothed daily change per thousand,Type,Daily confirmed cases,Tests per confirmed case – total (tests per confirmed case),Population,Daily confirmed cases per million
0,Argentina,2020-04-08,1715,Argentina - tests performed,ARG,https://www.argentina.gob.ar/sites/default/fil...,Government of Argentina,,13330.0,,0.295,,,,tests performed,,7.773,44494500.0,0.0
1,Argentina,2020-04-09,1795,Argentina - tests performed,ARG,https://www.argentina.gob.ar/sites/default/fil...,Government of Argentina,,14850.0,1520.0,0.329,0.034,,,tests performed,80.0,8.273,44494500.0,1.797975
2,Argentina,2020-04-10,1975,Argentina - tests performed,ARG,https://www.argentina.gob.ar/sites/default/fil...,Government of Argentina,,16379.0,1529.0,0.362,0.034,,,tests performed,180.0,8.648,44494500.0,4.045444
3,Argentina,2020-04-11,1975,Argentina - tests performed,ARG,https://www.argentina.gob.ar/sites/default/fil...,Government of Argentina,,18027.0,1648.0,0.399,0.036,,,tests performed,0.0,9.128,44494500.0,0.0
4,Argentina,2020-04-13,2208,Argentina - tests performed,ARG,https://www.argentina.gob.ar/sites/default/fil...,Government of Argentina,,19758.0,,0.437,,,,tests performed,66.0,8.969,44494500.0,1.483329


In [4]:
X = cases_and_testing[["Daily change in cumulative total per thousand",
                       "Tests per confirmed case – total (tests per confirmed case)"]].fillna(0)
X = sm.add_constant(X)

model = sm.OLS(cases_and_testing["Daily confirmed cases per million"].fillna(0), X)
res = model.fit()

In [5]:
print(res.summary())

                                    OLS Regression Results                                   
Dep. Variable:     Daily confirmed cases per million   R-squared:                       0.170
Model:                                           OLS   Adj. R-squared:                  0.169
Method:                                Least Squares   F-statistic:                     452.7
Date:                               Tue, 26 May 2020   Prob (F-statistic):          1.20e-179
Time:                                       10:38:14   Log-Likelihood:                -22723.
No. Observations:                               4439   AIC:                         4.545e+04
Df Residuals:                                   4436   BIC:                         4.547e+04
Df Model:                                          2                                         
Covariance Type:                           nonrobust                                         
                                                            