# EDA Covid 19
***

In [28]:
# add index

#### Quality Notes
* Recovered not always reported consistently by countries (i.e. lower than reality for NL/BE - two countries we can observe)
* 7-Day Moving Average on daily reported numbers show a cleaner trend
* Overall comparison between countries does not make much sense given their population; best to scale to numbers per million inhavitants (as is common for these datasets)


## Introduction
***
Analyse the data provided using exploratory visuals. Key aspects:
* What is the overall status today (totals)?
* How are countries performing? (and how has covid progressed)?
* What is the total daily increase? (with moving average)
* What is the effect scaled to the population?
* What is the mortality rate?
* Metric A: per country scaling over log-scale (minutephysics plot)
* Metrics B: growth KPI assessment (Stacey Barr)

Interesting countries:
* Netherlands
* Belgium
* Italy
* Spain
* China
* Malaysia
* United Kingdom
* United States


In [59]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

from sqlalchemy import create_engine
%load_ext sql

%load_ext autoreload
%autoreload 2

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# load bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show 
from bokeh.models import Range1d, ColumnDataSource, CategoricalColorMapper, MultiLine, NumeralTickFormatter
from bokeh.palettes import Category10

output_notebook()

In [61]:
# function for querying db
from src.data.query_db import queryDB
qdb = queryDB('sqlite','../data/processed/covid.sqlite')

sqlite:///../data/processed/covid.sqlite


In [62]:
# quick check
qdb.execute_query('SELECT * FROM stats LIMIT 5;')

5 rows affected


Unnamed: 0,country,date,confirmed,death,recovered
0,Afghanistan,2020-01-22,0,0,0
1,Albania,2020-01-22,0,0,0
2,Algeria,2020-01-22,0,0,0
3,Andorra,2020-01-22,0,0,0
4,Angola,2020-01-22,0,0,0


In [63]:
qdb.engine.table_names()

['country', 'daily_stats', 'exp_stats', 'populations', 'stats']

### 1. Total Numbers

In [64]:
# get overall numbers
query = """
SELECT 
    DATE(date) AS date,
    SUM(confirmed) AS confirmed,
    SUM(death) AS death,
    SUM(recovered) AS recovered
FROM stats
GROUP BY date
ORDER BY date DESC;
"""

overall = qdb.execute_query(query)

152 rows affected


In [65]:
overall

Unnamed: 0,date,confirmed,death,recovered
0,2020-06-21,8950221,468283,4433024
1,2020-06-20,8789587,464417,4364328
2,2020-06-19,8662546,459969,4244173
3,2020-06-18,8486769,453931,4153495
4,2020-06-17,8347743,448911,4072351
...,...,...,...,...
147,2020-01-26,2118,56,52
148,2020-01-25,1434,42,39
149,2020-01-24,941,26,36
150,2020-01-23,654,18,30


In [66]:
query = """
    SELECT *
    FROM stats
    WHERE confirmed > 0 AND death > 0"""

df = qdb.execute_query(query)

14798 rows affected


In [67]:
df.groupby('date')[['confirmed','death','recovered']].sum()

Unnamed: 0_level_0,confirmed,death,recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22,548,17,28
2020-01-23,643,18,30
2020-01-24,920,26,36
2020-01-25,1406,42,39
2020-01-26,2075,56,49
...,...,...,...
2020-06-17,8345913,448911,4071081
2020-06-18,8484905,453931,4152149
2020-06-19,8660654,459969,4242810
2020-06-20,8787683,464417,4362962


In [68]:
#set source
source = ColumnDataSource(overall)

# create the plot
p = figure(title = 'Covid Progress', 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Persons',
           plot_height = 400,
           plot_width = 700)

p.line(x='date', y='confirmed', line_width=2, source=source, color='red', legend_label='confirmed')
p.line(x='date', y='death', line_width=2, source=source, color='blue', legend_label = 'death')
#p.line(x='date', y='recovered', line_width=2, source=source, color='green', legend_label = 'recovered')

#
#p.yaxis.major_label_overrides = y_set
p.yaxis.formatter=NumeralTickFormatter(format=",00")


p.legend.location = "top_left"
p.legend.click_policy="hide"

show(p)

help(p.yaxis)

#### 2. Top countries
***

In [69]:
# barplot with top-10 countries
def plotTop10(data, vals = 'confirmed', title = 'Top 10 countries with Covid'):
    #set source
    source = ColumnDataSource(data)

    p = figure(y_range=data['country'].iloc[::-1], plot_height=350, title=title,
               toolbar_location=None, tools="")

    p.hbar(y='country', right=vals, source = source, height=0.8)

    p.ygrid.grid_line_color = None
    p.x_range.start = 0

    show(p)

In [70]:
# get overall numbers
query = """
SELECT
    country, 
    confirmed,
    death,
    recovered
FROM stats
WHERE date = (SELECT MAX(date) FROM stats)
ORDER BY confirmed DESC
LIMIT 10;
"""

top10 = qdb.execute_query(query)

10 rows affected


In [71]:
plotTop10(top10)

In [72]:
# potential visual: show how the top 10 progates per day (popular barplot)

#### 3. Daily Increment
***

In [73]:
# get overall numbers
query = """
SELECT
    country, 
    confirmed,
    death,
    recovered
FROM daily_stats
WHERE date = (SELECT MAX(date) FROM daily_stats)
ORDER BY confirmed DESC
LIMIT 10;
"""

top10_daily = qdb.execute_query(query)

10 rows affected


In [74]:
plotTop10(top10_daily, vals = 'confirmed', title = 'most cases today')

#### barplot with actual cases + line for MA
source: https://medium.com/y-data-stories/the-beginners-guide-to-creating-interactive-dashboards-with-python-and-bokeh-part-i-3826ea197a1b

In [75]:
daily_s = qdb.get_daily_stats_country('Netherlands')

151 rows affected


In [76]:
query = """
    SELECT 
        date,
        SUM(confirmed) AS confirmed,
        SUM(death) AS death,
        SUM(confirmed_ma) AS confirmed_ma,
        SUM(death_ma) AS death_ma
    FROM daily_stats
    GROUP BY date;"""

qdb.execute_query(query)

151 rows affected


Unnamed: 0,date,confirmed,death,confirmed_ma,death_ma
0,2020-01-23,99,1,99.000000,1.000000
1,2020-01-24,287,8,193.000000,4.500000
2,2020-01-25,493,16,293.000000,8.333333
3,2020-01-26,684,14,390.750000,9.750000
4,2020-01-27,809,26,474.400000,13.000000
...,...,...,...,...,...
146,2020-06-17,176010,5274,139088.142857,4612.714286
147,2020-06-18,139026,5020,139178.857143,4645.428571
148,2020-06-19,175777,6038,145784.714286,4891.000000
149,2020-06-20,127041,4448,144701.857143,4916.857143


In [77]:
#set source
source = ColumnDataSource(daily_s)

# use a time-delata for the width
bar_w = pd.Timedelta(hours = 12)


# create the plot
p = figure(title = 'Covid Progress', 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Persons',
           plot_height = 400,
           plot_width = 700)

p.vbar(x='date', width = bar_w, top='confirmed', source=source, color = 'red', alpha = 0.1, legend_label = 'confirmed cases')
p.line(x='date', y='confirmed_ma', line_width=2 ,source=source, color='red', legend_label='7-day MA')

p.y_range.start = 0
p.legend.location = "top_left"

show(p)

In [78]:
# deaths
#set source
source = ColumnDataSource(daily_s)

# use a time-delata for the width
bar_w = pd.Timedelta(hours = 12)


# create the plot
p = figure(title = 'Covid Progress', 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Persons',
           plot_height = 400,
           plot_width = 700)

p.vbar(x='date', width = bar_w, top='death', source=source, color = 'blue', alpha = 0.1, legend_label = 'deaths')
p.line(x='date', y='death_ma', line_width=2 ,source=source, color='blue', legend_label='7-day MA')

p.y_range.start = 0
p.legend.location = "top_left"

show(p)

In [79]:
# recovered
#set source
source = ColumnDataSource(daily_s)

# use a time-delata for the width
bar_w = pd.Timedelta(hours = 12)


# create the plot
p = figure(title = 'Covid Progress', 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Persons',
           plot_height = 400,
           plot_width = 700)

p.vbar(x='date', width = bar_w, top='recovered', source=source, color = 'green', alpha = 0.1, legend_label = 'recovered')
p.line(x='date', y='recovered_ma', line_width=2 ,source=source, color='green', legend_label='7-day MA')

p.y_range.start = 0
p.legend.location = "top_left"

show(p)

In [80]:
# combined graph
#set source
source = ColumnDataSource(daily_s)

# use a time-delata for the width
bar_w = pd.Timedelta(hours = 12)


# create the plot
p = figure(title = 'Covid Progress', 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Persons',
           plot_height = 400,
           plot_width = 700)

# confirmed cases
p.vbar(x='date', width = bar_w, top='confirmed', source=source, color = 'red', alpha = 0.2, legend_label = 'confirmed cases')
p.line(x='date', y='confirmed_ma', line_width=2 ,source=source, color='red')

# deaths
p.vbar(x='date', width = bar_w, top='death', source=source, color = 'blue', alpha = 0.2, legend_label = 'deaths')
p.line(x='date', y='death_ma', line_width=2 ,source=source, color='blue')


p.y_range.start = 0
p.legend.location = "top_left"

show(p)

#### 4 Scale to population
***
Scraped data from: https://www.worldometers.info/world-population/population-by-country/

In [81]:
# get overall numbers
query = """
SELECT stats.country AS country,
       date,
       ROUND(confirmed/scaled_pop,2) AS conf_scaled,
       ROUND(death/scaled_pop,2) AS death_scaled,
       ROUND(recovered/scaled_pop,2) AS rec_scaled
  FROM stats
  JOIN (SELECT country, scaled_pop 
          FROM populations) AS pop
    ON stats.country = pop.country
 WHERE date = (SELECT MAX(date) 
                 FROM stats)
 ORDER BY conf_scaled DESC
 LIMIT 10;
"""

top10_scaled = qdb.execute_query(query)

10 rows affected


In [82]:
plotTop10(top10_scaled, vals = 'conf_scaled', title = 'most cases per 1M inhabitants')

#### for daily stats

In [83]:
# get overall numbers
query = """
SELECT daily_stats.country AS country,
       date,
       ROUND(confirmed/scaled_pop,2) AS conf_scaled,
       ROUND(death/scaled_pop,2) AS death_scaled,
       ROUND(recovered/scaled_pop,2) AS rec_scaled
  FROM daily_stats
  JOIN (SELECT country, 
               scaled_pop 
          FROM populations) AS pop
    ON daily_stats.country = pop.country
 WHERE date = (SELECT MAX(date) 
                 FROM daily_stats)
 ORDER BY conf_scaled DESC
 LIMIT 10;  
"""

top10_scaled_today = qdb.execute_query(query)

10 rows affected


In [84]:
plotTop10(top10_scaled_today, vals = 'conf_scaled', title = 'most cases per 1M inhabitants')

#### 5. MinutePhysics Metric
***
Source: https://www.youtube.com/watch?v=54XLXg4fYsc

Metric:
* y = new confirmed cass last week (rolling sum weeks)
* x = total confirmed cases

In [85]:
def setAxes(exp):
    """
    """
    # find the range required
    x_max = np.ceil(np.log10(exp.total_confirmed.max()))
    y_max = np.ceil(np.log10(exp.new_last_week.max()))
    
    # generate locations
    x_locs_raw = 10**np.arange(0,x_max+1,1)
    x_locs = [int(i) for i in x_locs_raw]
    y_locs_raw = 10**np.arange(0,y_max+1,1)
    y_locs = [int(i) for i in y_locs_raw]

    # generate labels
    x_labels = ['{:,.0f}'.format(v) for v in x_locs]
    y_labels = ['{:,.0f}'.format(v) for v in y_locs]

    # get required format
    x_set = dict(zip(x_locs, x_labels))
    y_set = dict(zip(y_locs, y_labels))

    return x_set, y_set, x_max, y_max

#### sample plot (mult. countries)

In [86]:
# netherlands, belgium, italy, united_states, Malaysia
query = """
SELECT *
  FROM exp_stats
 WHERE country IN ('Netherlands', 'Italy', 'Malaysia', 'China', 'United States', 'Brazil')
   AND total_confirmed > 25
"""

sample_exp = qdb.execute_query(query)

712 rows affected


In [87]:
sample_exp.head()

Unnamed: 0,date,country,total_confirmed,new_last_week
0,2020-01-23,China,643,95
1,2020-01-24,China,920,372
2,2020-01-25,China,1406,858
3,2020-01-26,China,2075,1527
4,2020-01-27,China,2877,2329


In [88]:
def getPlotDataSource(sample_exp, dt = '2020-02-01'):
    # get individual countries
    countries = sample_exp.country.unique().tolist()

    total_conf_set, last_week_set = [], []
    for country in countries:
        single_country = sample_exp[(sample_exp['country']==country) & (sample_exp['date'] > dt)]
        total_conf_set.append(np.array(single_country.total_confirmed))
        last_week_set.append(np.array(single_country.new_last_week))

    # create dict for ColumnDataSource
    data = {'xs' : total_conf_set,
            'ys' : last_week_set,
            'color' : [(Category10[n_countries])[i] for i in range(len(countries))],
            'country' : countries}

    return data

In [89]:
# setup
countries = sample_exp.country.unique()
n_countries = len(countries)


# create the plot
p = figure(title = 'Covid Progress', 
           x_axis_type = 'log',
           y_axis_type = 'log',
           x_axis_label = 'Total confirmed cases',
           y_axis_label = 'New cases last week',
           plot_height = 400,
           plot_width = 700)

# create the source
source = ColumnDataSource(data = getPlotDataSource(sample_exp, dt = '2020-02-01'))

# plot
p.multi_line(xs="xs", ys="ys", line_color="color", legend_field = 'country', line_width=2, source=source)
    
# get parameters for axes and baselines
x_set, y_set, x_max, y_max = setAxes(sample_exp)
    
# get grey shaded & dashed baseline
n = int(min(x_max,y_max))
ln = np.logspace(0, int(n), int(n))
p.line(ln, ln, line_dash="4 4", line_width=1, color='gray')

# set axes
p.xaxis.major_label_overrides = x_set
p.yaxis.major_label_overrides = y_set

p.legend.location = "top_left"
p.legend.click_policy="hide"

show(p)

#### 6. KPI Metric: XmR chart
***
Source: https://www.staceybarr.com/measure-up/an-insight-from-the-covid-19-metric-growth-factor/

Concept is to map a KPI, it's central line (avg first x measurements) and it's 'natural process limits' (normal statistical variance). Plotting these factors gives you insights in how your KPI is moving.

In [90]:
%%sql
-- calculate the daily new cases
SELECT
    date,
    confirmed AS conf
FROM daily_stats
WHERE country = 'Netherlands';

Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


In [57]:
res = _.DataFrame()

AttributeError: 'DataFrame' object has no attribute 'DataFrame'

In [411]:
# control parameters
central_line_size = 15 # window for base-line; user defined; chosen 15 (0.5 month)
upper_range_limit = 3.27 #fixed
range_factor = 2.66 #fixed


In [56]:
# add our growth
res['growth'] = res['conf'] / res['conf'].shift(fill_value = 0)
xmr = res[res['date']>'2020-04-01'].reset_index(drop=True)
xmr['date'] = pd.to_datetime(xmr['date'])


NameError: name 'res' is not defined

In [415]:
# calculate the central line (average of size .. )
xmr['central_line'] = xmr['growth'].iloc[:central_line_size].mean()

In [416]:
# calculate the moving ranges (1-lag diff - abs)
xmr['moving_ranges'] = np.abs(xmr['growth'] - xmr['growth'].shift())

In [417]:
# get the average moving ranges
xmr['avg_mr'] = xmr['moving_ranges'].mean()

In [418]:
# get the upper range limit
xmr['upper_range_limit'] = xmr['avg_mr'] * upper_range_limit

In [419]:
# calculate natural process limits
xmr['lnpl'] = xmr['central_line'] - xmr['avg_mr'] * range_factor
xmr['unpl'] = xmr['central_line'] + xmr['avg_mr'] * range_factor

In [420]:
#set source
source = ColumnDataSource(xmr)

# create the plot
p = figure(title = 'central_line: {:.04f}'.format(xmr['central_line'].mean()), 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Growth',
           plot_height = 400,
           plot_width = 900)

p.line(x='date', y='growth', line_width=2, source=source, color='red', legend_label='growth-rate')
p.line(x='date', y='central_line', line_width=2, source=source, color='black', legend_label='Lower Natural Process Limit')
p.line(x='date', y='lnpl', line_width=2, source=source, color='blue', legend_label='Lower Natural Process Limit')
p.line(x='date', y='unpl', line_width=2, source=source, color='blue', legend_label='Upper Natural Process Limit')

show(p)

In [421]:
#set source
source = ColumnDataSource(xmr)

# create the plot
p = figure(title = 'Moving Ranges plot', 
           x_axis_type="datetime",
           x_axis_label = 'Date',
           y_axis_label = 'Growth',
           plot_height = 400,
           plot_width = 900)

p.line(x='date', y='moving_ranges', line_width=2, source=source, color='red', legend_label='mr')

p.line(x='date', y='avg_mr', line_width=2, source=source, color='black', legend_label='avg_mr')
p.line(x='date', y='upper_range_limit', line_width=2, source=source, color='blue', legend_label='upper range limit')

show(p)

#### 7. Overall - Barplot
***
* Window function, partition by date, rank countries by cases (DESC), then select top-10 by date, join in population
* Get this dataframe & plot relevant section 

In [111]:
def getBarData(n=10):
    """
    Get top N countries per day by total confirmed cases
    """
    query = """
            SELECT
                *
            FROM (
                SELECT 
                    DATE(date) AS date,
                    country,
                    SUM(confirmed) AS confirmed,
                    ROW_NUMBER() OVER (PARTITION BY date ORDER BY SUM(confirmed) DESC) AS rnk
                FROM stats
                GROUP BY date, country) AS sub
            WHERE rnk <= {}
            """.format(n)
    
    return qdb.execute_query(query)

In [112]:
df_bars = getBarData()

1520 rows affected


In [95]:
# get overall numbers
query = """
SELECT
    *
FROM (
    SELECT 
        DATE(date) AS date,
        country,
        SUM(confirmed) AS confirmed,
        ROW_NUMBER() OVER (PARTITION BY date ORDER BY SUM(confirmed) DESC) AS rnk
    FROM stats
    GROUP BY date, country) AS sub
WHERE rnk <= 10
"""

df_bars = qdb.execute_query(query)

1520 rows affected


In [99]:
df_bars[df_bars['date'] == '2020-06-21'].sort_values('rnk')

Unnamed: 0,date,country,confirmed,rnk
1510,2020-06-21,United States,2279879,1
1511,2020-06-21,Brazil,1083341,2
1512,2020-06-21,Russia,583879,3
1513,2020-06-21,India,425282,4
1514,2020-06-21,United Kingdom,305803,5
1515,2020-06-21,Peru,251338,6
1516,2020-06-21,Spain,246272,7
1517,2020-06-21,Chile,242355,8
1518,2020-06-21,Italy,238499,9
1519,2020-06-21,Iran,204952,10


In [153]:
 #set source
data = df_bars[df_bars['date'] == '2020-03-26'].sort_values('rnk')
source = ColumnDataSource(data)
source.data['color'] = colors

p = figure(y_range=source.data['country'][::-1], plot_height=350, title='title',
           toolbar_location=None, tools="")

p.hbar(y='country', right='confirmed', fill_color = 'color', line_color = None, source = source, height=0.8)

p.ygrid.grid_line_color = None
p.x_range.start = 0

show(p)

#### 8. Continent Barplot
***

In [232]:
query = """
    SELECT
        date,
        continent,
        SUM(confirmed) AS confirmed
    FROM stats
    JOIN populations
        ON stats.country = populations.country
    WHERE continent != 'Seven seas (open ocean)'
    GROUP BY
        date,
        continent
    ORDER BY 
        date, 
        SUM(confirmed) DESC
    """

continent = qdb.execute_query(query)
continent.tail(6)

912 rows affected


Unnamed: 0,date,continent,confirmed
906,2020-06-21,North America,2659309
907,2020-06-21,Europe,2300820
908,2020-06-21,Asia,1902133
909,2020-06-21,South America,1770353
910,2020-06-21,Africa,306042
911,2020-06-21,Oceania,9013


In [236]:
#set source
data = continent[continent['date'] == '2020-06-21']
source = ColumnDataSource(data)

p = figure(y_range=source.data['continent'][::-1], plot_height=350, title='title',
           toolbar_location=None, tools="")

p.hbar(y='continent', right='confirmed', source = source, height=0.8)

p.ygrid.grid_line_color = None
p.x_range.start = 0

show(p)

In [243]:
# goal -- add this to the country-plot (race-chart)


array(['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
       'South America'], dtype=object)