# EDA Covid 19
***

In [26]:
# add index

#### Quality Notes
* Recovered not always reported consistently by countries (i.e. lower than reality for NL/BE - two countries we can observe)
* 7-Day Moving Average on daily reported numbers show a cleaner trend
* Overall comparison between countries does not make much sense given their different population sizes; it is best to scale to numbers per million inhabitants (as is common for these datasets)


## Introduction
***
Analyse the data provided using exploratory visuals. Key aspects:
* What is the overall status today (totals)?
* How are countries performing (and how has Covid progressed)?
* What is the total daily increase? (with moving average)
* What is the effect scaled to the population?
* What is the mortality rate?
* Metric A: per country scaling over log-scale (minutephysics plot)
* Metric B: growth KPI assessment (Stacey Barr)

Interesting countries:
* Netherlands
* Belgium
* Italy
* Spain
* China
* Malaysia
* United Kingdom
* United States


In [148]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

from sqlalchemy import create_engine
%load_ext sql

%load_ext autoreload
%autoreload 2

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [213]:
# load bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show 
from bokeh.models import Range1d, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import RdBu3

output_notebook()

In [150]:
# project helper used to run SQL against the processed SQLite database;
# the resulting `qdb` object is reused by the query cells below
from src.data.query_db import queryDB
qdb = queryDB('sqlite','../data/processed/covid.sqlite')

sqlite:///../data/processed/covid.sqlite


In [151]:
# quick check: peek at the first five rows of the `stats` table
qdb.execute_query('SELECT * FROM stats LIMIT 5;')

5 rows affected


Unnamed: 0,country,date,confirmed,death,recovered
0,Afghanistan,2020-01-22,0,0,0
1,Albania,2020-01-22,0,0,0
2,Algeria,2020-01-22,0,0,0
3,Andorra,2020-01-22,0,0,0
4,Angola,2020-01-22,0,0,0


### 1. Total Numbers

In [6]:
# worldwide totals per date: sums confirmed / death / recovered over all rows
# of `stats` that share a date, ordered newest date first
query = """
SELECT 
    DATE(date) AS date,
    SUM(confirmed) AS confirmed,
    SUM(death) AS death,
    SUM(recovered) AS recovered
FROM stats
GROUP BY date
ORDER BY date DESC;
"""

overall = qdb.execute_query(query)

112 rows affected


In [7]:
# wrap the per-date totals so glyphs can refer to columns by name
source = ColumnDataSource(overall)

# time-series canvas
p = figure(
    plot_width = 700,
    plot_height = 400,
    title = 'Covid Progress',
    x_axis_type = "datetime",
    x_axis_label = 'Date',
    y_axis_label = 'Persons',
)

# one line per metric; the column name doubles as its legend entry
for metric, colour in (('confirmed', 'red'), ('death', 'blue'), ('recovered', 'green')):
    p.line(x='date', y=metric, source=source, line_width=2, color=colour, legend_label=metric)

# legend in the top-left corner; clicking an entry hides that line
p.legend.click_policy = "hide"
p.legend.location = "top_left"

show(p)

#### 2. Top countries
***

In [8]:
# barplot with top-10 countries
def plotTop10(data, vals = 'confirmed', title = 'Top 10 countries with Covid'):
    """
    Show a horizontal bar chart with one bar per row of `data`.

    Parameters
    ----------
    data : DataFrame with a 'country' column and the column named by `vals`
    vals : name of the column that sets the bar length
    title : chart title

    Returns
    -------
    None; the chart is displayed via `show`.
    """
    # categorical axis takes the countries in reverse row order
    # (presumably so the first row ends up at the top -- confirm)
    countries_reversed = data['country'].iloc[::-1]

    chart = figure(
        y_range=countries_reversed,
        title=title,
        plot_height=350,
        tools="",
        toolbar_location=None,
    )

    chart.hbar(y='country', right=vals, height=0.8, source=ColumnDataSource(data))

    # bars start at zero; no horizontal grid lines between categories
    chart.x_range.start = 0
    chart.ygrid.grid_line_color = None

    show(chart)

In [9]:
# ten countries with the most confirmed cases on the latest date in `stats`
query = """
SELECT
    country, 
    confirmed,
    death,
    recovered
FROM stats
WHERE date = (SELECT MAX(date) FROM stats)
ORDER BY confirmed DESC
LIMIT 10;
"""

top10 = qdb.execute_query(query)

10 rows affected


In [10]:
# bar chart of the ten rows fetched above, using the default column ('confirmed') and title
plotTop10(top10)

In [11]:
# potential visual: show how the top 10 propagates per day (popular animated barplot)

#### 3. Daily Increment
***

In [12]:
# ten countries with the highest `confirmed` value on the latest date in `daily_stats`
query = """
SELECT
    country, 
    confirmed,
    death,
    recovered
FROM daily_stats
WHERE date = (SELECT MAX(date) FROM daily_stats)
ORDER BY confirmed DESC
LIMIT 10;
"""

top10_daily = qdb.execute_query(query)

10 rows affected


In [13]:
# same bar chart as for the totals, now fed with the latest-day rows from `daily_stats`
plotTop10(top10_daily, vals = 'confirmed', title = 'most cases today')

#### barplot with actual cases + line for MA
source: https://medium.com/y-data-stories/the-beginners-guide-to-creating-interactive-dashboards-with-python-and-bokeh-part-i-3826ea197a1b

In [403]:
# daily stats for the Netherlands; the plots below read its `date`,
# `confirmed` / `death` / `recovered` and matching `*_ma` columns
res = qdb.get_daily_stats_country('Netherlands')

113 rows affected


In [404]:
# confirmed cases: translucent daily bars plus the moving-average line
source = ColumnDataSource(res)

# bar width is a time delta (half a day) because the x-axis is a datetime axis
bar_w = pd.Timedelta(hours = 12)

p = figure(
    plot_width = 700,
    plot_height = 400,
    title = 'Covid Progress',
    x_axis_type = "datetime",
    x_axis_label = 'Date',
    y_axis_label = 'Persons',
)

p.vbar(x='date', top='confirmed', width=bar_w, source=source,
       color='red', alpha=0.1, legend_label='confirmed cases')
p.line(x='date', y='confirmed_ma', source=source,
       color='red', line_width=2, legend_label='7-day MA')

# legend top-left, y-axis anchored at zero
p.legend.location = "top_left"
p.y_range.start = 0

show(p)

In [405]:
# deaths: translucent daily bars plus the moving-average line
source = ColumnDataSource(res)

# bar width is a time delta (half a day) because the x-axis is a datetime axis
bar_w = pd.Timedelta(hours = 12)

p = figure(
    plot_width = 700,
    plot_height = 400,
    title = 'Covid Progress',
    x_axis_type = "datetime",
    x_axis_label = 'Date',
    y_axis_label = 'Persons',
)

p.vbar(x='date', top='death', width=bar_w, source=source,
       color='blue', alpha=0.1, legend_label='deaths')
p.line(x='date', y='death_ma', source=source,
       color='blue', line_width=2, legend_label='7-day MA')

# legend top-left, y-axis anchored at zero
p.legend.location = "top_left"
p.y_range.start = 0

show(p)

In [406]:
# recovered: translucent daily bars plus the moving-average line
source = ColumnDataSource(res)

# bar width is a time delta (half a day) because the x-axis is a datetime axis
bar_w = pd.Timedelta(hours = 12)

p = figure(
    plot_width = 700,
    plot_height = 400,
    title = 'Covid Progress',
    x_axis_type = "datetime",
    x_axis_label = 'Date',
    y_axis_label = 'Persons',
)

p.vbar(x='date', top='recovered', width=bar_w, source=source,
       color='green', alpha=0.1, legend_label='recovered')
p.line(x='date', y='recovered_ma', source=source,
       color='green', line_width=2, legend_label='7-day MA')

# legend top-left, y-axis anchored at zero
p.legend.location = "top_left"
p.y_range.start = 0

show(p)

In [407]:
# combined graph: confirmed cases and deaths on one canvas
source = ColumnDataSource(res)

# bar width is a time delta (half a day) because the x-axis is a datetime axis
bar_w = pd.Timedelta(hours = 12)

p = figure(
    plot_width = 700,
    plot_height = 400,
    title = 'Covid Progress',
    x_axis_type = "datetime",
    x_axis_label = 'Date',
    y_axis_label = 'Persons',
)

# per metric: daily bars (with a legend entry) and the matching `_ma` line (without one)
for metric, colour, label in (('confirmed', 'red', 'confirmed cases'),
                              ('death', 'blue', 'deaths')):
    p.vbar(x='date', top=metric, width=bar_w, source=source,
           color=colour, alpha=0.2, legend_label=label)
    p.line(x='date', y=metric + '_ma', source=source, color=colour, line_width=2)

# legend top-left, y-axis anchored at zero
p.legend.location = "top_left"
p.y_range.start = 0

show(p)

#### 4 Scale to population
***
Scraped data from: https://www.worldometers.info/world-population/population-by-country/

In [19]:
# ten countries with the highest confirmed count relative to `scaled_pop`
# on the latest date in `stats`; all three metrics are divided by `scaled_pop`
# NOTE(review): SQLite's `/` truncates when both operands are INTEGER --
# confirm that `populations.scaled_pop` is stored as a REAL
query = """
SELECT stats.country AS country,
       date,
       ROUND(confirmed/scaled_pop,2) AS conf_scaled,
       ROUND(death/scaled_pop,2) AS death_scaled,
       ROUND(recovered/scaled_pop,2) AS rec_scaled
  FROM stats
  JOIN (SELECT country, scaled_pop 
          FROM populations) AS pop
    ON stats.country = pop.country
 WHERE date = (SELECT MAX(date) 
                 FROM stats)
 ORDER BY conf_scaled DESC
 LIMIT 10;
"""

top10_scaled = qdb.execute_query(query)

10 rows affected


In [20]:
# bar chart of the population-scaled confirmed counts (`conf_scaled`) fetched above
plotTop10(top10_scaled, vals = 'conf_scaled', title = 'most cases per 1M inhabitants')

#### for daily stats

In [21]:
# same population scaling as above, applied to the latest date in `daily_stats`
# NOTE(review): SQLite's `/` truncates when both operands are INTEGER --
# confirm that `populations.scaled_pop` is stored as a REAL
query = """
SELECT daily_stats.country AS country,
       date,
       ROUND(confirmed/scaled_pop,2) AS conf_scaled,
       ROUND(death/scaled_pop,2) AS death_scaled,
       ROUND(recovered/scaled_pop,2) AS rec_scaled
  FROM daily_stats
  JOIN (SELECT country, 
               scaled_pop 
          FROM populations) AS pop
    ON daily_stats.country = pop.country
 WHERE date = (SELECT MAX(date) 
                 FROM daily_stats)
 ORDER BY conf_scaled DESC
 LIMIT 10;  
"""

top10_scaled_today = qdb.execute_query(query)

10 rows affected


In [22]:
# daily figures from `daily_stats`: the title says so explicitly, so this chart is
# not confused with the chart of totals per 1M inhabitants
plotTop10(top10_scaled_today, vals = 'conf_scaled', title = 'most new cases today per 1M inhabitants')

#### 5. MinutePhysics Metric
***
Source: https://www.youtube.com/watch?v=54XLXg4fYsc

Metric:
* y = new confirmed cases in the last week (rolling weekly sum)
* x = total confirmed cases

In [122]:
def setAxes(exp):
    """
    Build power-of-ten tick labels for both axes of a log-log plot.

    Parameters
    ----------
    exp : DataFrame (or any object) exposing `total_confirmed` (x values)
          and `new_last_week` (y values); only their `.max()` is read.

    Returns
    -------
    (x_set, y_set) : two dicts mapping each tick location (1, 10, 100, ...)
        to its thousands-separated label ('1', '10', ..., '1,000'). The ticks
        run up to the first power of ten that is >= the column maximum.
        A dict is empty when the column maximum is zero, negative or NaN.
    """
    def _decade_labels(values):
        # one axis: {10**k: formatted label} for k = 0 .. ceil(log10(max))
        peak = values.max()

        # log10 is undefined for zero / negative / NaN maxima (e.g. an empty
        # selection); return no overrides instead of failing in the range step
        if not peak > 0:
            return {}

        top_exponent = int(np.ceil(np.log10(peak)))

        # exact integer powers: int() on a float power could truncate a value
        # such as 999.99... down to 999
        ticks = [10 ** k for k in range(top_exponent + 1)]
        return {tick: '{:,.0f}'.format(tick) for tick in ticks}

    return _decade_labels(exp.total_confirmed), _decade_labels(exp.new_last_week)

#### sample plot (mult. countries)

In [234]:
# candidate countries: netherlands, belgium, italy, united_states, Malaysia
# (the query currently selects only the Netherlands and Italy, and keeps
# rows with more than 25 total confirmed cases)
query = """
SELECT *
  FROM exp_stats
 WHERE country IN ('Netherlands','Italy')
   AND total_confirmed > 25
"""

sample_exp = qdb.execute_query(query)

155 rows affected


In [235]:
#set source
source = ColumnDataSource(sample_exp)

# create the plot: both axes logarithmic (total cases vs. new cases last week)
p = figure(title = 'Covid Progress', 
           x_axis_type = 'log',
           y_axis_type = 'log',
           x_axis_label = 'Total confirmed cases',
           y_axis_label = 'New cases last week',
           plot_height = 400,
           plot_width = 700)

# one colour per country; the palette lists exactly two colours, matching
# the two countries selected in the query above
color_mapper = CategoricalColorMapper(factors=sample_exp['country'].drop_duplicates().to_list(), 
                                      palette=[RdBu3[2], RdBu3[0]])

# `legend_field` builds one legend entry per distinct `country` value; it
# replaces the deprecated `legend=` keyword
p.circle(x='total_confirmed', y='new_last_week', line_width=2, source=source,
    color={'field': 'country', 'transform': color_mapper}, legend_field = 'country')

# set axes: tick labels come from the frame that is plotted
# (was `exp`, a name that is not defined anywhere in this notebook)
x_set, y_set = setAxes(sample_exp)
p.xaxis.major_label_overrides = x_set
p.yaxis.major_label_overrides = y_set

p.legend.location = "top_left"
show(p)



#### 6. KPI Metric: XmR chart
***
Source: https://www.staceybarr.com/measure-up/an-insight-from-the-covid-19-metric-growth-factor/

The concept is to map a KPI, its central line (the average of the first x measurements) and its 'natural process limits' (normal statistical variance). Plotting these gives you insight into how your KPI is moving.

In [392]:
%%sql
-- daily new confirmed cases (aliased to `conf`) per date for the Netherlands
SELECT
    date,
    confirmed AS conf
FROM daily_stats
WHERE country = 'Netherlands';

 * sqlite:///../data/processed/covid.sqlite
Done.


date,conf
2020-01-23,0
2020-01-24,0
2020-01-25,0
2020-01-26,0
2020-01-27,0
2020-01-28,0
2020-01-29,0
2020-01-30,0
2020-01-31,0
2020-02-01,0


In [410]:
# Fetch the daily new cases explicitly instead of reading IPython's `_`
# (last cell output): `_` only holds the right result when the %%sql cell was
# the most recent cell to produce output, so it breaks on out-of-order runs.
# NOTE(review): assumes `qdb.execute_query` returns a DataFrame with the same
# `date` / `conf` columns as the %%sql result -- confirm the `date` dtype.
res = qdb.execute_query("""
SELECT
    date,
    confirmed AS conf
FROM daily_stats
WHERE country = 'Netherlands';
""")

In [411]:
# control parameters for the XmR chart computed in the cells below
central_line_size = 15 # window for base-line; user defined; chosen 15 (0.5 month)
upper_range_limit = 3.27 # fixed; multiplies the average moving range to give the upper range limit
range_factor = 2.66 # fixed; multiplies the average moving range to give the natural process limits around the central line


In [412]:
# growth factor: a day's new cases divided by the previous day's new cases
# (the first row has no previous day and stays NaN)
res['growth'] = res['conf'] / res['conf'].shift()

# a day that follows zero new cases divides by zero and yields +/-inf; store
# NaN instead so that later means skip the value rather than becoming inf
res['growth'] = res['growth'].replace([np.inf, -np.inf], np.nan)

# keep the rows dated after 2020-04-01 and parse the dates for the datetime axis
xmr = res[res['date']>'2020-04-01'].reset_index(drop=True)
xmr['date'] = pd.to_datetime(xmr['date'])


In [415]:
# central line: mean growth over the first `central_line_size` rows, stored as a constant column
xmr['central_line'] = xmr['growth'].head(central_line_size).mean()

In [416]:
# moving ranges: absolute difference between each growth value and the previous one
xmr['moving_ranges'] = xmr['growth'].diff().abs()

In [417]:
# average moving range, stored as a constant column
# NOTE(review): this averages over every row of `xmr`, whereas `central_line`
# uses only the first `central_line_size` rows -- confirm this is intended
xmr['avg_mr'] = xmr['moving_ranges'].mean()

In [418]:
# upper range limit: average moving range times the fixed `upper_range_limit` factor (3.27)
xmr['upper_range_limit'] = xmr['avg_mr'] * upper_range_limit

In [419]:
# natural process limits: central line minus / plus the average moving range
# times `range_factor` (2.66) -- `lnpl` is the lower limit, `unpl` the upper limit
xmr['lnpl'] = xmr['central_line'] - xmr['avg_mr'] * range_factor
xmr['unpl'] = xmr['central_line'] + xmr['avg_mr'] * range_factor

In [420]:
#set source
source = ColumnDataSource(xmr)

# create the plot; the title reports the central-line value
# (the column is constant, so its mean is that value)
p = figure(title = 'central_line: {:.04f}'.format(xmr['central_line'].mean()), 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Growth',
           plot_height = 400,
           plot_width = 900)

p.line(x='date', y='growth', line_width=2, source=source, color='red', legend_label='growth-rate')
# the central line gets its own legend entry; it previously reused the
# 'Lower Natural Process Limit' label, which mislabelled it in the legend
p.line(x='date', y='central_line', line_width=2, source=source, color='black', legend_label='Central Line')
p.line(x='date', y='lnpl', line_width=2, source=source, color='blue', legend_label='Lower Natural Process Limit')
p.line(x='date', y='unpl', line_width=2, source=source, color='blue', legend_label='Upper Natural Process Limit')

show(p)

In [421]:
# moving ranges with their average and the upper range limit
source = ColumnDataSource(xmr)

p = figure(
    plot_width = 900,
    plot_height = 400,
    title = 'Moving Ranges plot',
    x_axis_type = "datetime",
    x_axis_label = 'Date',
    y_axis_label = 'Growth',
)

# (column, colour, legend label) for each line, drawn in this order
for column, colour, label in (('moving_ranges', 'red', 'mr'),
                              ('avg_mr', 'black', 'avg_mr'),
                              ('upper_range_limit', 'blue', 'upper range limit')):
    p.line(x='date', y=column, source=source, line_width=2, color=colour, legend_label=label)

show(p)

#### 7. Mortality Rate
***
* Use: start with a correlation. Shift death by x-days and check correlation.
* Auto-regressive Model