# EDA Covid 19
***

In [133]:
# add index

#### Quality Notes
* Recovered cases are not reported consistently by all countries (e.g. the numbers for NL/BE - the two countries we can observe - are lower than reality)
* The 7-day moving average of the daily reported numbers shows a cleaner trend
* Comparing absolute numbers between countries does not make much sense given their different populations; it is best to scale to numbers per million inhabitants (as is common for these datasets)


## Introduction
***
Analyse the data provided using exploratory visuals. Key aspects:
* What is the overall status today (totals)?
* How are countries performing (and how has Covid progressed)?
* What is the total daily increase? (with moving average)
* What is the effect scaled to the population?
* What is the mortality rate?
* Metric A: per country scaling over log-scale (minutephysics plot)
* Metric B: growth KPI assessment (Stacey Barr)

Interesting countries:
* Netherlands
* Belgium
* Italy
* Spain
* China
* Malaysia
* United Kingdom
* United States


In [134]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

from sqlalchemy import create_engine
%load_ext sql

%load_ext autoreload
%autoreload 2

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [392]:
# load bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show 
from bokeh.models import Range1d, ColumnDataSource
output_notebook()

In [393]:
# project helper that wraps database access
from src.data.query_db import queryDB

# handle on the processed SQLite database; the query cells below all go through it
qdb = queryDB("sqlite", "../data/processed/covid.sqlite")

sqlite:///../data/processed/covid.sqlite


In [403]:
# sanity check: peek at the first five rows of the raw `stats` table
qdb.execute_query("SELECT * FROM stats LIMIT 5;")

5 rows affected


Unnamed: 0,country,date,confirmed,death,recovered
0,Afghanistan,2020-01-22,0,0,0
1,Albania,2020-01-22,0,0,0
2,Algeria,2020-01-22,0,0,0
3,Andorra,2020-01-22,0,0,0
4,Angola,2020-01-22,0,0,0


#### 1. Total Numbers

In [394]:
# World-wide totals per calendar day (all countries summed), newest day first.
# The grouping key is spelled out as DATE(date) instead of the bare name `date`:
# `date` is both a column of `stats` and the alias of DATE(date) in this SELECT,
# so grouping on the bare name depends on which of the two the engine picks.
# Grouping on the expression guarantees exactly one row per day.
query = """
SELECT
    DATE(date) AS date,
    SUM(confirmed) AS confirmed,
    SUM(death) AS death,
    SUM(recovered) AS recovered
FROM stats
GROUP BY DATE(date)
ORDER BY date DESC;
"""

overall = qdb.execute_query(query)

110 rows affected


In [395]:
# wrap the per-day totals so the glyphs below can refer to columns by name
source = ColumnDataSource(overall)

# one datetime-axis figure shared by all three series
p = figure(
    title='Covid Progress',
    x_axis_type="datetime",
    x_axis_label='Date',
    y_axis_label='Persons',
    plot_height=400,
    plot_width=700,
)

# each series is read from the column of the same name, which doubles as its legend entry
for series, colour in (('confirmed', 'red'), ('death', 'blue'), ('recovered', 'green')):
    p.line(x='date', y=series, line_width=2, source=source, color=colour, legend_label=series)

# legend in the top-left corner; clicking an entry hides that line
p.legend.click_policy = "hide"
p.legend.location = "top_left"

show(p)

#### 2. Top countries
***

In [396]:
# Top 10 countries by confirmed cases on the most recent date in `stats`.
# The sub-select picks the latest date; rows come back sorted by confirmed, largest first.
query = """
SELECT
    country, 
    confirmed,
    death,
    recovered
FROM stats
WHERE date = (SELECT MAX(date) FROM stats)
ORDER BY confirmed DESC
LIMIT 10;
"""

top10 = qdb.execute_query(query)

10 rows affected


In [397]:
# set source
source = ColumnDataSource(top10)

# top10 arrives sorted by confirmed cases, largest first; the country order is
# reversed for the categorical y-axis so the largest bar ends up at the top.
# Axis labels are set so the chart can be read on its own, like the other
# figures in this notebook.
p = figure(y_range=list(top10['country'].iloc[::-1]),
           plot_height=350,
           title="Top 10 countries with Covid",
           x_axis_label='Confirmed cases',
           y_axis_label='Country',
           toolbar_location=None, tools="")

# one horizontal bar per country, its length given by the confirmed count
p.hbar(y='country', right='confirmed', source=source, height=0.8)

p.ygrid.grid_line_color = None  # horizontal grid lines add nothing on a categorical axis
p.x_range.start = 0             # bars start at zero

show(p)

In [142]:
# potential visual: show how the top 10 propagates per day (popular barplot)

#### 3. Daily Increment
***

In [236]:
%%sql
DROP TABLE IF EXISTS daily_stats;

-- Rebuild daily_stats with per-country day-over-day increments of the
-- cumulative counters in stats, plus a moving average over the current
-- row and the 6 rows before it (per country, ordered by date).
CREATE TABLE daily_stats AS
SELECT
    country,
    DATE(date) AS date,
    conf_cur - conf_prev AS confirmed,
    death_cur - death_prev AS death,
    recov_cur - recov_prev AS recovered,
    AVG(conf_cur - conf_prev) OVER (PARTITION BY country ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS confirmed_ma,
    AVG(death_cur - death_prev) OVER (PARTITION BY country ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS death_ma,
    AVG(recov_cur - recov_prev) OVER (PARTITION BY country ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS recovered_ma
FROM (
    -- pair every row with the same country's row of the previous calendar day
    -- (inner join, so rows without a previous-day row are left out)
    SELECT
        cur.country,
        cur.date,
        cur.confirmed AS conf_cur,
        cur.death AS death_cur,
        cur.recovered AS recov_cur,
        prev.confirmed AS conf_prev,
        prev.death AS death_prev,
        prev.recovered AS recov_prev
    FROM stats AS cur
    JOIN stats AS prev
        ON cur.country = prev.country
       AND DATE(prev.date) = DATE(cur.date,'-1 day')) sub;

 * sqlite:///../data/processed/covid.sqlite
Done.
Done.


[]

In [250]:
%%sql
SELECT *
FROM daily_stats
WHERE country = 'Belgium'
ORDER BY date  -- explicit order so the preview is reproducible
LIMIT 10;      -- a bounded preview instead of the whole history

 * sqlite:///../data/processed/covid.sqlite
Done.


country,date,confirmed,death,recovered,confirmed_ma,death_ma,recovered_ma
Belgium,2020-01-23,0,0,0,0.0,0.0,0.0
Belgium,2020-01-24,0,0,0,0.0,0.0,0.0
Belgium,2020-01-25,0,0,0,0.0,0.0,0.0
Belgium,2020-01-26,0,0,0,0.0,0.0,0.0
Belgium,2020-01-27,0,0,0,0.0,0.0,0.0
Belgium,2020-01-28,0,0,0,0.0,0.0,0.0
Belgium,2020-01-29,0,0,0,0.0,0.0,0.0
Belgium,2020-01-30,0,0,0,0.0,0.0,0.0
Belgium,2020-01-31,0,0,0,0.0,0.0,0.0
Belgium,2020-02-01,0,0,0,0.0,0.0,0.0


#### barplot with actual cases + line for MA
source: https://medium.com/y-data-stories/the-beginners-guide-to-creating-interactive-dashboards-with-python-and-bokeh-part-i-3826ea197a1b

In [404]:
# per-day figures for one country, used by the plots below
# (expects the columns date, confirmed/death/recovered and their *_ma variants)
res = qdb.get_daily_stats_country("United States")

109 rows affected


In [405]:
#set source
source = ColumnDataSource(res)

# use a time-delta for the width (the x-axis is a datetime axis)
bar_w = pd.Timedelta(hours = 12)


def plot_daily_metric(source, metric, color, bar_label,
                      bar_width=None, bar_alpha=0.1, title='Covid Progress'):
    """Build a figure with daily counts as bars and their moving average as a line.

    The bar-plus-line recipe is the same for every metric, so it lives in one
    place and can be reused instead of copying the cell.

    Parameters
    ----------
    source : ColumnDataSource
        Must provide the columns 'date', ``metric`` and ``metric + '_ma'``.
    metric : str
        Column drawn as bars, e.g. 'confirmed'.
    color : str
        Colour shared by the bars and the line.
    bar_label : str
        Legend entry for the bars; the line is labelled '7-day MA'.
    bar_width : pd.Timedelta, optional
        Width of one bar on the datetime axis; defaults to 12 hours.
    bar_alpha : float, optional
        Opacity of the bars.
    title : str, optional
        Figure title.

    Returns
    -------
    The bokeh figure; the caller decides when to ``show`` it.
    """
    if bar_width is None:
        bar_width = pd.Timedelta(hours=12)

    fig = figure(title=title,
                 x_axis_type="datetime",
                 x_axis_label='Date',
                 y_axis_label='Persons',
                 plot_height=400,
                 plot_width=700)

    # raw daily numbers as faint bars, smoothed trend as a solid line on top
    fig.vbar(x='date', width=bar_width, top=metric, source=source,
             color=color, alpha=bar_alpha, legend_label=bar_label)
    fig.line(x='date', y=metric + '_ma', line_width=2, source=source,
             color=color, legend_label='7-day MA')

    fig.y_range.start = 0
    fig.legend.location = "top_left"
    return fig


# confirmed cases
p = plot_daily_metric(source, 'confirmed', 'red', 'confirmed cases', bar_width=bar_w)

show(p)

In [400]:
# deaths: daily numbers as bars, moving average as a line
source = ColumnDataSource(res)

# bar width is a time-delta because the x-axis is a datetime axis
bar_w = pd.Timedelta(hours=12)

# create the plot
p = figure(
    title='Covid Progress',
    x_axis_type="datetime",
    x_axis_label='Date',
    y_axis_label='Persons',
    plot_height=400,
    plot_width=700,
)

# bars first, so the line is drawn on top of them
p.vbar(x='date', top='death', width=bar_w, source=source,
       color='blue', alpha=0.1, legend_label='deaths')
p.line(x='date', y='death_ma', source=source,
       color='blue', line_width=2, legend_label='7-day MA')

p.legend.location = "top_left"
p.y_range.start = 0

show(p)

In [401]:
# recovered: daily numbers as bars, moving average as a line
source = ColumnDataSource(res)

# half-day wide bars, given as a time-delta to match the datetime x-axis
bar_w = pd.Timedelta(hours=12)

# create the plot
p = figure(title='Covid Progress', x_axis_type="datetime",
           x_axis_label='Date', y_axis_label='Persons',
           plot_height=400, plot_width=700)

# faint bars for the raw daily values
p.vbar(
    x='date',
    top='recovered',
    width=bar_w,
    source=source,
    color='green',
    alpha=0.1,
    legend_label='recovered',
)
# solid line for the smoothed trend
p.line(
    x='date',
    y='recovered_ma',
    source=source,
    color='green',
    line_width=2,
    legend_label='7-day MA',
)

p.legend.location = "top_left"
p.y_range.start = 0

show(p)

In [402]:
# combined graph: confirmed cases and deaths in one figure
source = ColumnDataSource(res)

# bar width as a time-delta, matching the datetime x-axis
bar_w = pd.Timedelta(hours=12)

# create the plot
p = figure(
    title='Covid Progress',
    x_axis_type="datetime",
    x_axis_label='Date',
    y_axis_label='Persons',
    plot_height=400,
    plot_width=700,
)

# for each metric: bars for the daily values (with a legend entry),
# then the moving-average line in the same colour (no legend entry)
for metric, colour, label in (('confirmed', 'red', 'confirmed cases'),
                              ('death', 'blue', 'deaths')):
    p.vbar(x='date', width=bar_w, top=metric, source=source,
           color=colour, alpha=0.2, legend_label=label)
    p.line(x='date', y=metric + '_ma', line_width=2, source=source, color=colour)

p.legend.location = "top_left"
p.y_range.start = 0

show(p)

In [350]:
# processing: automate data gathering, cleaning and database creation

#### 4. Mortality Rate
***
* Can we predict deaths based on confirmed cases (MA)? If so, can this tell us (A) the lag and (B) the mortality rate?
* Use: start with a correlation. Shift death by x-days and check correlation.
* Auto-regressive Model

#### 5. MinutePhysics Metric
***
Source: https://www.youtube.com/watch?v=54XLXg4fYsc

#### 6. KPI Metric
***
Source: https://www.staceybarr.com/measure-up/an-insight-from-the-covid-19-metric-growth-factor/