# Read in xlsx

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go

# Spreadsheet folder: <parent of the current working directory>/data/private.
DATA = os.path.join(os.path.dirname(os.getcwd()), "data", "private")

# Full paths of every entry in DATA whose name ends in "xlsx", in whatever
# order os.listdir() reports them.
FILES = [
    os.path.join(DATA, entry)
    for entry in os.listdir(DATA)
    if entry.endswith("xlsx")
]

# Print only the last path component of each file for a quick overview.
for f in FILES:
    print(f.rsplit(os.sep, 1)[-1])

# Keep a handle on the fourth workbook in the listing.
f = FILES[3]

statistic_id1098721_covid-19-confirmed-and-death-case-development-south-korea-2020.xlsx
statistic_id1102777_covid-19-daily-new-cases-south-korea-2020.xlsx
statistic_id1102818_covid-19-test-case-total-number-south-korea-2020.xlsx
statistic_id1095848_covid-19-confirmed-recovered-and-test-cases-south-korea-2020.xlsx


In [8]:
# Keyword arguments for pd.read_excel, one dict per workbook that gets loaded.
# Every entry reads the second sheet (sheet_name=1 is zero-based), skips the
# first four rows, replaces the header with `names`, and uses the first
# selected column ("date") as the index.
read_opts = [
    # date + two value columns
    {
        "sheet_name": 1,
        "skiprows": 4,
        "usecols": [1, 2, 3],
        "index_col": 0,
        "names": ["date", "n_positive", "n_deaths"],
    },
    # date + one value column
    {
        "sheet_name": 1,
        "skiprows": 4,
        "usecols": [1, 2],
        "index_col": 0,
        "names": ["date", "delta_n_positive"],
    },
    # date + one value column
    {
        "sheet_name": 1,
        "skiprows": 4,
        "usecols": [1, 2],
        "index_col": 0,
        "names": ["date", "n_tests"],
    },
]

In [9]:
# Load the three workbooks described by `read_opts` and join them on the date
# index into one frame `df` with integer columns.
#
# Files are matched to their read options by the statistic id embedded in the
# file name, not by position: os.listdir() returns entries in arbitrary order,
# so zip(FILES, read_opts) could silently apply the wrong options to a file.
# NOTE(review): the id -> options mapping below is inferred from the file names
# (confirmed/death development, daily new cases, total tests) and the column
# names in `read_opts` -- confirm it matches the intended pairing.
stat_ids = ["1098721", "1102777", "1102818"]  # same order as read_opts

dfs = []
for stat_id, kwargs in zip(stat_ids, read_opts):
    matches = [
        path
        for path in FILES
        if f"statistic_id{stat_id}_" in os.path.basename(path)
    ]
    if len(matches) != 1:
        raise FileNotFoundError(
            f"Expected exactly one xlsx file for statistic id {stat_id}, "
            f"found {len(matches)}"
        )
    dfs.append(pd.read_excel(matches[0], **kwargs))

# Left-join every further table onto the first one via the shared date index.
df = dfs[0]
for ddf in dfs[1:]:
    df = df.join(ddf)

# The index labels carry no year; append " 2020" before parsing them as dates.
df.index = pd.to_datetime(pd.Series(df.index.astype(str)).apply(lambda x: x + " 2020"))

# Missing values become 0 so that every column can be cast to int.
# NOTE(review): this also zeroes missing days in n_tests (see the 0 entries in
# the displayed head) -- confirm that is acceptable for downstream use.
df = df.fillna(0).astype(int).reset_index()
df.head()

Unnamed: 0,date,n_positive,n_deaths,delta_n_positive,n_tests
0,2020-01-20,1,0,1,0
1,2020-01-21,1,0,0,0
2,2020-01-22,1,0,0,0
3,2020-01-23,1,0,0,21
4,2020-01-24,2,0,1,0


In [63]:
# Aggregate the daily table into intervals of length `freq` and plot the
# death count per interval on a log axis.
freq = "3D"

# Use `freq` for the grouping too, so the bin width and the plot titles that
# interpolate `freq` cannot drift apart.
group = df.groupby(pd.Grouper(key="date", freq=freq))

# n_positive / n_deaths / n_tests take the maximum within each interval;
# delta_n_positive is summed over the interval.
df1 = group.agg(
    {"n_positive": "max", "n_deaths": "max", "delta_n_positive": "sum", "n_tests": "max"}
)
df1 = df1.fillna(0)

# Plot straight from df1 (dates are its index). The previous version read
# `df2`, which is only created in a later cell and therefore fails on a fresh
# kernel. Unlike df2, df1 is not filtered on n_tests.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df1.index,
        y=df1.n_deaths,
        mode="lines",
    )
)
fig.update_layout(yaxis_type="log", title=f"Number of deaths {freq}")
fig.show()

In [49]:
# Derive per-interval increments of n_tests and n_deaths from df1, keep only
# intervals with more than 100 tests, and plot the test increments.
df2 = df1.copy()

# Difference to the previous interval; the first interval has no predecessor
# and is set to 0. diff() avoids the wrap-around of np.roll (last row leaking
# into the first) and the need to patch a hard-coded "2020-01-20" label.
df2["delta_n_tests"] = df2["n_tests"].diff().fillna(0)
df2["delta_n_deaths"] = df2["n_deaths"].diff().fillna(0)

# Drop intervals with n_tests <= 100, cast to int and turn the date index
# back into a regular column.
df2 = df2.query("n_tests > 100").astype(int).reset_index()

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df2.date,
        y=df2.delta_n_tests,
        mode="lines",
    )
)
fig.update_layout(yaxis_type="log", title=f"Daily tests in interval {freq}")
fig.show()

In [50]:
# Positive-test fractions: per interval (delta_*) and cumulative (p_*).
df3 = df2.copy()

# New positives divided by new tests in the same interval. Intervals without
# new tests have no defined rate, so a zero denominator becomes NaN rather
# than producing +/-inf.
df3["delta_p_positive"] = df3["delta_n_positive"] / df3["delta_n_tests"].replace(0, np.nan)
df3["delta_p_negative"] = 1 - df3["delta_p_positive"]

# Cumulative positives divided by cumulative tests.
df3["p_positive"] = df3["n_positive"] / df3["n_tests"]
df3["p_negative"] = 1 - df3["p_positive"]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df3.date,
        y=df3.p_positive,
        mode="lines",
        name="$P(+|T)$ average over all previous week",
    )
)
fig.add_trace(
    go.Scatter(
        x=df3.date,
        y=df3.delta_p_positive,
        mode="lines",
        # Raw string: "\D" is not a valid escape sequence in a normal literal.
        # The resulting text is unchanged.
        name=r"$\Delta P(+|T)$ for new cases per week",
    )
)
fig.update_layout(yaxis_type="log", title=f"Positive test rates in interval {freq}")
fig.show()

I assume that the COVID-19 mortality rate is constant:
$$\gamma = \frac{n^\dagger(t+\Delta t)}{n_+(t)} = \text{const} \, .$$
This might be a sufficient approximation if hospitals are not yet at their limit.

The total number of deaths caused by COVID-19, $n^\dagger(t)$, and the total number of COVID-19 infections, $n_+(t)$, are functions of time. $\Delta t$ is the time it takes for a new infection to become lethal. As symptoms start to appear after 1-2 weeks, I assume that $\Delta t \sim 3$ weeks.

Furthermore, I assume that $n^\dagger(t)$ is known more accurately than $n_+(t)$, as not all infected people are tested. That is,
$$
    n_+(t) = n_{tot} P_t(+) \neq P_t(+|T) n_{tot}
$$
This follows from Bayes theorem
$$
    P_t(+) = \frac{P_t(+|T) P_t(T)}{P_t(T|+)}
$$
where the probability that an infected person will take a test differs from the overall testing probability, $P_t(T|+) \neq P_t(T)$ (non-infected persons also take tests).

To estimate $P_t(T|+)$, I solve the above set of equations for constant $\gamma$. This results in
$$
    \text{const} = \gamma = \frac{1}{n_{tot}} \frac{n^\dagger(t+\Delta t)}{P_t(+|T) P_t(T)} P_t(T|+)
$$
or in other words
$$
    P_t(T|+) 
    \equiv \gamma \, n_{tot} \, \frac{P_t(+|T) P_t(T)}{n^\dagger(t+\Delta t)}
    = \gamma \, P_t(+|T)  \frac{n_{T}(t)}{n^\dagger(t+\Delta t)}
    = \gamma \, \frac{n_{+|T}(t)}{n^\dagger(t+\Delta t)}
$$
where $n_T(t) = n_{tot} P_t(T)$ is the number of people tested and $n_{+|T}(t) = P_t(+|T) \, n_T(t)$ is the number of positive tests. Since, by assumption, $\gamma$ is independent of geographic or sociological factors, this allows testing probabilities to be compared.

In [62]:
# Estimate P(T|+)/gamma as positives at time t divided by deaths at t + lag.
# NOTE(review): presumably the total population of South Korea; it is not
# used anywhere in this cell.
n_total_pop = 51.269e6
df4 = df3.copy()

# Number of rows (aggregation intervals) to look ahead for the death count.
death_lag_intervals = 9

# shift(-k) aligns each row with the death count k rows later and leaves the
# last k rows as NaN. np.roll(..., -k) would wrap the first k death counts
# onto the last k rows and produce spurious ratios there.
# NOTE(review): the factor 100 is not explained by the derivation -- confirm.
df4["p_test_positive"] = (
    df4["n_positive"] / df4["n_deaths"].shift(-death_lag_intervals) * 100
)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df4.date,
        y=df4.p_test_positive,
        mode="lines",
        # Raw string: "\g" is not a valid escape sequence in a normal literal.
        # The resulting text is unchanged.
        name=r"$P(T|+)/\gamma$",
    )
)
fig.update_layout(yaxis_type="log", title="Positive test rates")
fig.show()