In [38]:
# As usual, importing the libraries we need
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

In [39]:
# Rebuild the cleaned Belgian dataset (same pipeline as exercise 03):
# per-capita case counts per region and day, joined with the weather data.
corona_df = pd.read_csv("../data/raw/corona/be_corona.csv", sep = "\t")

with open("../data/raw/metadata/be_metadata.json", 'r') as f:
    country_metadata = json.load(f)

# Each metadata entry describes one region; build both lookup tables from it.
region_entries = country_metadata["country_metadata"]
region_map = {entry["covid_region_code"]: entry["iso3166-2_code"] for entry in region_entries}
population_map = {entry["iso3166-2_code"]: entry["population"] for entry in region_entries}

# PROVINCE code -> ISO 3166-2 code -> population -> cases per capita.
corona_df["region"] = corona_df["PROVINCE"].map(region_map)
corona_df["population"] = corona_df["region"].map(population_map)
corona_df["cases_pc"] = corona_df["CASES"].div(corona_df["population"])

weather_df = pd.read_csv("../data/raw/weather/weather.csv", sep = "\t")

# Shift the temperature by 273.15 (presumably Kelvin -> Celsius; confirm the
# unit of the raw file) and keep only the rows whose region code starts with "BE".
weather_df["TemperatureAboveGround"] = weather_df["TemperatureAboveGround"].sub(273.15)
is_belgian = weather_df["iso3166-2"].str.startswith("BE")
weather_df = weather_df[is_belgian]

# Default (inner) join on day and region, then drop the now-redundant
# corona-side key columns.
df = pd.merge(
    corona_df,
    weather_df,
    left_on = ["DATE", "region"],
    right_on = ["date", "iso3166-2"],
)
df = df.drop(columns = ["DATE", "PROVINCE", "region"])

In [40]:
# Bring external information into the picture: 0/1 indicator columns for the
# Belgian counter-measures (when each started and ended), plus weekends and
# public holidays.

# First wave: start date per measure; all of them end on the same day.
# Data from https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Belgium#Government_response
first_wave_start = {
    "school_closed": "2020-03-13",
    "lockdown": "2020-03-17",
    "travel_ban": "2020-03-20",
}
first_wave_end = "2020-05-03"

# Second wave: every measure is switched on from this day onwards.
# Data from https://www.politico.eu/article/belgium-announces-second-coronavirus-lockdown/
second_wave_start = "2020-11-02"

for measure, start_day in first_wave_start.items():
    df[measure] = 0
    # `between` is inclusive on both ends, i.e. start_day <= date <= first_wave_end.
    during_first_wave = df["date"].between(start_day, first_wave_end)
    during_second_wave = df["date"] >= second_wave_start
    df.loc[during_first_wave | during_second_wave, measure] = 1

# Let's also keep track of when the weekends were (weekday 5 = Saturday, 6 = Sunday)
parsed_dates = pd.to_datetime(df["date"], format = "%Y-%m-%d")
df["weekend"] = parsed_dates.dt.weekday.ge(5).astype(int)

# And of various vacation days
holidays = {
    "2020-04-13": "Easter",
    "2020-05-01": "Labour",
    "2020-05-21": "Ascension",
    "2020-06-01": "Whit",
    "2020-07-21": "National",
    "2020-08-15": "Assumption",
    "2020-11-01": "All Saints",
    "2020-11-11": "Armistice",
}
df["holiday"] = 0
for holiday_day in holidays:
    df.loc[df["date"] == holiday_day, "holiday"] = 1

df

Unnamed: 0,CASES,population,cases_pc,date,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed,school_closed,lockdown,travel_ban,weekend,holiday
0,1,1857986,5.382172e-07,2020-03-01,BE-VAN,71.268604,5.654529e+06,2.377257e+06,7.028275,0.000283,6.305556,5.901631,0,0,0,1,0
1,6,1208542,4.964660e-06,2020-03-01,BE-BRU,71.161766,4.395659e+06,2.369199e+06,6.913354,0.000604,6.088235,6.268015,0,0,0,1,0
2,1,874048,1.144102e-06,2020-03-01,BE-VLI,73.492234,5.385017e+06,2.364463e+06,6.840184,0.002273,5.945087,6.277830,0,0,0,1,0
3,3,1106992,2.710047e-06,2020-03-01,BE-WLG,80.041325,4.336106e+06,2.303546e+06,4.735473,0.009123,5.766355,6.309143,0,0,0,1,0
4,1,1515064,6.600381e-07,2020-03-01,BE-VOV,75.257202,4.937132e+06,2.376949e+06,6.555772,0.000332,6.529002,6.814743,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2783,111,284638,3.899690e-04,2020-11-14,BE-WLX,86.802651,3.450874e+06,2.337415e+06,10.673603,0.000024,3.749169,3.784905,1,1,1,1,0
2784,158,494325,3.196278e-04,2020-11-14,BE-WNA,81.910109,3.525320e+06,2.372435e+06,12.064028,0.000574,4.026616,4.847877,1,1,1,1,0
2785,294,1515064,1.940512e-04,2020-11-14,BE-VOV,85.082627,1.990421e+06,2.425690e+06,13.012328,0.001301,3.213457,5.063612,1,1,1,1,0
2786,124,1146175,1.081859e-04,2020-11-14,BE-VBR,81.428593,2.899614e+06,2.418138e+06,13.214198,0.001101,3.923754,4.696823,1,1,1,1,0


In [50]:
# Replicate the analysis from exercise 04: the regressors are the seven weather
# variables plus the intercept column "const" that sm.add_constant adds to df.
Xs = [
    'RelativeHumiditySurface',
    'SolarRadiation',
    'Surfacepressure',
    'TemperatureAboveGround',
    'Totalprecipitation',
    'UVIndex',
    'WindSpeed',
    'const',
]
df = sm.add_constant(df)

In [51]:
# Baseline model: log of per-capita cases regressed on the columns listed in Xs
# (the intercept is the explicit "const" column, hence hasconst = True).
log_cases_pc = np.log(df["cases_pc"])
baseline_design = df[Xs]
est = sm.OLS(log_cases_pc, baseline_design, hasconst = True).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.479
Model:                            OLS   Adj. R-squared:                  0.478
Method:                 Least Squares   F-statistic:                     365.5
Date:                Mon, 01 Mar 2021   Prob (F-statistic):               0.00
Time:                        11:10:32   Log-Likelihood:                -4553.4
No. Observations:                2788   AIC:                             9123.
Df Residuals:                    2780   BIC:                             9170.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
RelativeHumiditySurface   

In [52]:
# Add the new confounders: government counter-measures and calendar effects.
# Only names that are not already in Xs are appended, so re-running this cell
# cannot feed duplicated regressor columns into the fits that follow.
confounders = ["school_closed", "lockdown", "travel_ban", "weekend", "holiday"]
Xs.extend([column for column in confounders if column not in Xs])

In [53]:
# Same specification as before, but Xs now also holds the governmental
# counter-measure, weekend and holiday indicators, so the fit controls for them.
log_cases_pc = np.log(df["cases_pc"])
controlled_design = df[Xs]
est = sm.OLS(log_cases_pc, controlled_design, hasconst = True).fit()
# Note that the R-squared goes up compared to the weather-only model: the new
# variables do explain part of the trend in cases. That is actually bad news,
# because the explanatory power of the weather data is now less obvious to
# interpret than we would have thought. Also, be careful about taking these
# coefficients at face value without real-world knowledge! For instance: why is
# there such a strong effect for weekends and holidays? Is the virus less
# contagious on weekends?
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.572
Model:                            OLS   Adj. R-squared:                  0.570
Method:                 Least Squares   F-statistic:                     309.0
Date:                Mon, 01 Mar 2021   Prob (F-statistic):               0.00
Time:                        11:10:37   Log-Likelihood:                -4280.2
No. Observations:                2788   AIC:                             8586.
Df Residuals:                    2775   BIC:                             8663.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
RelativeHumiditySurface   

In [54]:
# Here we add "dummy" variables: a region fixed effect, i.e. one 0/1 column per
# region marking which rows belong to it. These dummies absorb every possible
# omitted variable that distinguishes a region from all other regions.
#
# One region is left out as the reference category; with an intercept in the
# model, a dummy for every region would be perfectly collinear with "const".
reference_region = "BE-WBR"
regions = ["const",]

# Iterate in sorted order: a plain set of strings has no stable iteration order
# across kernel sessions, which would shuffle the regressors between runs.
for region in sorted(df["iso3166-2"].dropna().unique()):
    if region == reference_region:
        continue
    df[region] = (df["iso3166-2"] == region).astype(int)
    regions.append(region)
    # Guard against duplicates so that re-running this cell leaves Xs unchanged.
    if region not in Xs:
        Xs.append(region)

In [55]:
# Region-only model: intercept plus the region dummies, nothing else.
log_cases_pc = np.log(df["cases_pc"])
region_design = df[regions]
est = sm.OLS(log_cases_pc, region_design, hasconst = True).fit()
# Let's first see how the regions did overall. No real differences, except
# maybe Brussels (BRU) doing poorly and East Flanders (VOV) doing well.
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     2.591
Date:                Mon, 01 Mar 2021   Prob (F-statistic):            0.00398
Time:                        11:10:42   Log-Likelihood:                -5450.1
No. Observations:                2788   AIC:                         1.092e+04
Df Residuals:                    2777   BIC:                         1.099e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -10.0734      0.108    -93.198      0.0

In [56]:
# Full model: every column currently listed in Xs, which by now also contains
# the region dummies.
log_cases_pc = np.log(df["cases_pc"])
full_design = df[Xs]
est = sm.OLS(log_cases_pc, full_design, hasconst = True).fit()
# The coefficients and p-values of the dummy variables are not of interest in
# themselves; the dummies are there to hold fixed the actions of local
# governments when these differ from the national counter-measures.
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.580
Model:                            OLS   Adj. R-squared:                  0.577
Method:                 Least Squares   F-statistic:                     173.5
Date:                Mon, 01 Mar 2021   Prob (F-statistic):               0.00
Time:                        11:10:45   Log-Likelihood:                -4254.1
No. Observations:                2788   AIC:                             8554.
Df Residuals:                    2765   BIC:                             8691.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
RelativeHumiditySurface   