In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import find_peaks

# Data
The data were taken from https://ourworldindata.org/, which in turn compiled them from various official Greek sources.
Their coronavirus dataset is freely available at https://github.com/owid/covid-19-data/tree/master/public/data.

In [None]:
!wget https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv

--2020-12-20 14:56:08--  https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 

# Take a look at the dataset

In [None]:
# Read the OWID CSV from the working directory into a DataFrame.
path = "owid-covid-data.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Last expression of the cell: list the columns the dataset provides.
df.columns

In [None]:
# Keep only the rows whose "location" is Greece.
is_greece = df["location"].eq("Greece")
gr = df.loc[is_greece]

In [None]:
# Show the Greek subset with display limits of 50 rows / 60 columns; the limits apply
# only inside the `with` block and are restored afterwards.
display_limits = ("display.max_rows", 50, "display.max_columns", 60)
with pd.option_context(*display_limits):
    display(gr)

In [None]:
# Narrow view of the testing-related columns. display() is called explicitly because a
# frame evaluated inside a `with` block is not rendered automatically by Jupyter.
testing_columns = ["location", "date", "new_tests", "total_tests", "new_cases_smoothed"]
with pd.option_context("display.max_rows", 50, "display.max_columns", 60):
    display(gr.loc[:, testing_columns])

In [None]:
# Independent working copy restricted to the columns used for the inference, so that
# columns added or overwritten on `inf_gr` never touch `gr`.
inference_columns = ["date", "new_tests", "total_tests", "new_cases"]
inf_gr = gr.loc[:, inference_columns].copy()

# Inference
Data appear to be missing in both the "new_tests" column and the "total_tests" column. We use the "total_tests" column to infer "new_tests": between two consecutive dates that both report the cumulative number of tests, we assume that the increase in the total was spread evenly over the days after the earlier date up to and including the later date, i.e. that the same number of tests was performed on each of those days.

In [None]:
def infer_daily_tests(total_tests):
    """Infer a per-day test count from a cumulative series that has gaps.

    Every reported (non-NaN) cumulative value closes a run of days: the increase
    since the previously reported value is split evenly over all days of the run
    (the unreported days before it plus the reporting day itself). The first run
    is measured from a cumulative count of 0.

    Parameters
    ----------
    total_tests : iterable of float
        Cumulative totals in row order; NaN marks a day without a report.

    Returns
    -------
    list of float
        One value per input element. Days after the last reported total cannot
        be inferred and are NaN.
    """
    daily = []
    pending_days = 0   # days seen since the last reported total
    last_total = 0     # most recent reported cumulative value
    for total in total_tests:
        pending_days += 1
        if np.isnan(total):
            continue
        daily.extend([(total - last_total) / pending_days] * pending_days)
        last_total = total
        pending_days = 0
    # Trailing days without a later report: pad with NaN so the output stays
    # aligned with the input.
    daily.extend([np.nan] * pending_days)
    return daily


num_tests = infer_daily_tests(inf_gr["total_tests"])

# The list is assigned back as a column, so it must have one entry per row.
assert len(num_tests) == len(inf_gr)

In [None]:
# Number of rows in the "new_tests" column (the inferred list has to match this length
# before it can be assigned as a column).
inf_gr["new_tests"].shape[0]

In [None]:
# Replace the sparsely reported "new_tests" column with the inferred per-day values.
inf_gr = inf_gr.assign(new_tests=num_tests)

In [None]:
# Inferred daily counts above this value are reported as extreme peaks.
PEAK_MIN_HEIGHT = 30000

tests = np.array(num_tests)

# scipy warns that find_peaks may return unexpected results on NaN input, and the
# inferred series is NaN for the days after the last reported total. Only the
# non-NaN samples are searched; the hits are then mapped back to row positions.
valid_positions = np.flatnonzero(~np.isnan(tests))
peaks_in_valid, _ = find_peaks(tests[valid_positions], height=PEAK_MIN_HEIGHT)
argpeaks = valid_positions[peaks_in_valid]   # row positions of the peaks
peaks = tests[argpeaks]                      # inferred test counts at those rows

print("Extreme peaks were found for the dates:")
print(inf_gr["date"].array[argpeaks])

In [None]:
# Each artist carries its own label and the legend is built from those labels.
# A positional `legend([...])` call pairs labels with artists purely by drawing order,
# which puts the scatter between the two lines and can mislabel the entries.
fig, ax = plt.subplots(figsize=(15,7))
g = sns.lineplot(data=inf_gr, x="date", y="new_tests", label="new tests each day", ax=ax)
g.set(xticks=list(range(0, len(inf_gr), 30)))   # one tick every 30 rows

# argpeaks are row positions, not dates.
# NOTE(review): this lines up with the x axis only if "date" is plotted as string
# categories at positions 0..n-1 -- confirm the column is not datetime.
ax.scatter(argpeaks, peaks, marker='x', color='orange', s=100, label="peaks")
plt.title("Infered number of tests each day & \n7 day rolling average of tests each day")

# Centered 7-day mean: rolling(7) is right-aligned, shift(-3) moves each window's
# value onto its middle day.
inf_gr["7d_rolling_tests"] = inf_gr["new_tests"].rolling(7).mean().shift(-3)

g = sns.lineplot(data=inf_gr, x="date", y="7d_rolling_tests", color='red',
                 label="rolling 7 day average", ax=ax)
ax.legend();

In [None]:
# Centered 7-day mean of the daily new cases: right-aligned rolling window, then
# shifted back three rows so each value sits on the middle day of its window.
centered_case_mean = inf_gr["new_cases"].rolling(7).mean().shift(-3)
inf_gr["7d_rolling_cases"] = centered_case_mean
plt.figure(figsize=(15,7))

g = sns.lineplot(data=inf_gr, x="date", y="7d_rolling_cases")
every_30th_row = list(range(0, len(inf_gr), 30))
g.set(xticks=every_30th_row)
plt.title("7 day rolling average of new cases each day")
g.legend(["rolling 7 day average new cases"])

In [None]:
# The cases-to-tests ratio is only defined for days with a positive inferred test count.
# Masking the other days yields NaN instead of +/-inf from a division by zero, which
# would otherwise propagate into the rolling mean computed below.
daily_tests = inf_gr["new_tests"]
inf_gr["cases_tests_ratio"] = inf_gr["new_cases"] / daily_tests.where(daily_tests > 0)

fig, ax = plt.subplots(figsize=(15,7))
# Labels are attached to the artists themselves so the legend does not depend on
# the order in which matplotlib enumerates them.
g = sns.lineplot(data=inf_gr, x="date", y="cases_tests_ratio",
                 label="daily cases-to-tests ratio", ax=ax)
g.set(xticks=list(range(0, len(inf_gr), 30)))   # one tick every 30 rows
plt.title("Infered ratio of cases to new tests each day & \n7 day rolling average of the c-to-t ratio")

# Centered 7-day mean: right-aligned window shifted back by three rows.
inf_gr["7d_rolling_ctr"] = inf_gr["cases_tests_ratio"].rolling(7).mean().shift(-3)

g = sns.lineplot(data=inf_gr, x="date", y="7d_rolling_ctr", color='red',
                 label="rolling 7 day average of c-t-r", ax=ax)
ax.legend();

# Visualizing curves with different range
In order to compare the cases-to-tests ratio with the number of tests on a single axis (both for the daily values and for their 7 day rolling averages), the ratio curve is rescaled onto the range of the tests curve: it is first normalized (its mean is subtracted and the result is divided by its standard deviation), then multiplied by the standard deviation of the tests curve, and finally the mean of the tests curve is added, i.e. $\tilde{r} = \frac{r - \mu_r}{\sigma_r}\,\sigma_t + \mu_t$, where $r$ is the ratio and $t$ the number of tests.

In [None]:
def rescale_to_reference(values, reference):
    """Standardize `values` and re-express them on the scale of `reference`.

    The NaN-ignoring mean of `values` is subtracted and the result divided by
    their NaN-ignoring standard deviation; it is then multiplied by the standard
    deviation of `reference` and shifted by the mean of `reference`.

    Parameters
    ----------
    values, reference : array-like of float
        One-dimensional data; NaN entries are ignored in the statistics and
        stay NaN in the output.

    Returns
    -------
    numpy.ndarray
        Array with the same length as `values`.
    """
    values = np.asarray(values, dtype=float)
    reference = np.asarray(reference, dtype=float)
    standardized = (values - np.nanmean(values)) / np.nanstd(values)
    return standardized * np.nanstd(reference) + np.nanmean(reference)


def plot_tests_with_scaled_ratio(frame, tests_col, ratio_col, tests_label, ratio_label, title):
    """Draw a tests column and a rescaled ratio column of `frame` against "date".

    Returns the created ``(fig, ax)`` pair. Each line carries its own label, so
    the legend does not depend on the order in which artists are enumerated.
    """
    fig, ax = plt.subplots(figsize=(15,7))
    sns.lineplot(data=frame, x="date", y=tests_col, label=tests_label, ax=ax)
    sns.lineplot(data=frame, x="date", y=ratio_col, label=ratio_label, ax=ax)
    ax.set(xticks=list(range(0, len(frame), 30)))   # one tick every 30 rows
    ax.set_title(title)
    ax.legend()
    return fig, ax


# Daily values. The ratio is only defined where the test count is positive; masking
# the rest avoids +/-inf, which would turn the NaN-aware mean/std (and therefore the
# whole rescaled curve) into inf/NaN.
daily_tests = inf_gr["new_tests"]
daily_ratio = inf_gr["new_cases"] / daily_tests.where(daily_tests > 0)
inf_gr["scaled_ctr"] = rescale_to_reference(daily_ratio, daily_tests)

fig, ax = plot_tests_with_scaled_ratio(
    inf_gr, "new_tests", "scaled_ctr",
    tests_label="New Tests",
    ratio_label="Scaled case to test ratio",
    title="Scaled ratio of new cases to tests each day over daily new tests",
)

## moving average
rolling_tests = inf_gr["7d_rolling_tests"]
rolling_ratio = inf_gr["7d_rolling_cases"] / rolling_tests.where(rolling_tests > 0)
inf_gr["7d_rolling_scaled_ctr"] = rescale_to_reference(rolling_ratio, rolling_tests)

fig, ax = plot_tests_with_scaled_ratio(
    inf_gr, "7d_rolling_tests", "7d_rolling_scaled_ctr",
    tests_label="7d m.a. New Tests",
    ratio_label="7d m.a. Scaled case to test ratio",
    title="7 day rolling average: scaled ratio of new cases to tests over daily new tests",
);