# COVID-19 Deaths, Cases & Recovery Per Capita
> Compare deaths and total cases adjusting for population size. 

- comments: true
- author: Joao B. Duarte, Hamel Husain & Arun Gupta
- categories: [growth, compare, interactive]
- hide: false
- image: images/covid-permillion-trajectories.png
- permalink: /covid-compare-permillion/

In [1]:
#hide
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from load_covid_data import load_individual_timeseries
%config InlineBackend.figure_format = 'retina'

# Size shared by the two trajectory charts below; both pass these values to
# Altair's `.properties(width=..., height=...)`.
chart_width = 550
chart_height= 400

## Deaths Per Million Of Inhabitants

Last 200 days

> Tip: Click (Shift+ for multiple) on countries in the legend to filter the visualization.

In [51]:
#hide
# Build per-country COVID-19 death trajectories, scaled by population, for the
# chart and table below.
# NOTE: `total_cases` holds cumulative *deaths* in this cell; the column name is
# kept because the following cells read it.

N_DAYS = 200  # trailing window, matching the "Last 200 days" caption
ID_COLUMNS = ["Province/State", "Country/Region", "Lat", "Long"]

# `on_bad_lines="skip"` is the replacement for `error_bad_lines=False`, which
# pandas 2.0 removed (the new spelling needs pandas >= 1.3).
data = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                   on_bad_lines="skip")

# Keep the country key plus the last N_DAYS date columns. Selecting by name
# rather than by position drops the text/coordinate columns up front, so the
# per-country sum below only ever adds up counts, and a file with fewer than
# N_DAYS dates simply yields a shorter window.
date_columns = [col for col in data.columns if col not in ID_COLUMNS][-N_DAYS:]
data = (
    data[["Country/Region"] + date_columns]
    .rename(columns={"Country/Region": "location"})
    .melt(id_vars=["location"], var_name="date", value_name="total_cases")
    # Provinces/territories of one country are collapsed into a single row per day.
    .groupby(["location", "date"], as_index=False)["total_cases"]
    .sum()
)
data["date"] = pd.to_datetime(data["date"])
data = data.sort_values(by="date")
# Align the JHU country names with the names used in `countries` below.
data.loc[data.location == "US", "location"] = "United States"
data.loc[data.location == "Korea, South", "location"] = "South Korea"

# Population data from the Penn World Table 9.1; 2017 is the year used here.
data_pwt = pd.read_stata("https://www.rug.nl/ggdc/docs/pwt91.dta")

filter1 = data_pwt["year"] == 2017
# `.copy()` so the renames below edit an independent frame, not a view of data_pwt.
data_pop = data_pwt.loc[filter1, ["country", "pop"]].copy()
data_pop.loc[data_pop.country == "Republic of Korea", "country"] = "South Korea"
data_pop.loc[data_pop.country == "Iran (Islamic Republic of)", "country"] = "Iran"

countries = ["Italy", "Spain", "France", "United Kingdom", "Germany",
             "Portugal", "United States", "Singapore", "South Korea", "Japan",
             "Brazil", "Iran", 'Netherlands', 'Belgium', 'Sweden', "Romania",
             'Switzerland', 'Norway', 'Denmark', 'Austria', 'Slovenia', 'Greece']

# Fail early, and by name, if a selected country has no population figure.
population = data_pop.set_index("country")["pop"]
missing = [name for name in countries if name not in population.index]
if missing:
    raise KeyError(f"No 2017 population found for: {missing}")

# Scale the selected countries by population. The later cells label the result
# "per million", which relies on PWT reporting `pop` in millions.
data_pc = data.copy()
# Cast first so the float quotients are not written into an integer column.
data_pc["total_cases"] = data_pc["total_cases"].astype(float)
for name in countries:
    rows = data_pc.location == name
    data_pc.loc[rows, "total_cases"] = data_pc.loc[rows, "total_cases"] / float(population.loc[name])

# One frame per country (same order as `countries`), keeping only the days on
# which the scaled value exceeds 1.
filter1 = data_pc["total_cases"] > 1
data_countries = []
data_countries_pc = [data_pc[(data_pc["location"] == name) & filter1] for name in countries]

In [52]:
#hide_input
# Stack the per-country frames into the single long table Altair expects.
# `n_days` is each row's position within its own country's frame.
data_countries_pc2 = []
for country_frame in data_countries_pc:
    country_frame = country_frame.reset_index()
    country_frame['n_days'] = country_frame.index
    country_frame['log_cases'] = np.log(country_frame["total_cases"])
    data_countries_pc2.append(country_frame)
# A single concat instead of growing the frame one country at a time.
data_plot = pd.concat(data_countries_pc2, axis=0)

# Reference trends: a quantity doubling every d days grows by log(2)/d per day
# on a log scale.
doubling_periods = (2, 4, 12)
for days in doubling_periods:
    data_plot[f"trend_{days}days"] = np.log(2)/days*data_plot["n_days"]
for days in doubling_periods:
    data_plot[f"trend_{days}days_label"] = f"Doubles every {days} days"


# Plot it using Altair
source = data_plot
# The trend lines are pinned to the range of the real data so they do not
# stretch the y axis.
y_domain = (0, data_plot["log_cases"].max())

# NOTE(review): `selection_multi` / `add_selection` are the Altair 4 spellings;
# Altair 5 deprecates them in favour of `selection_point` / `add_params`.
scales = alt.selection_interval(bind='scales', zoom=False)
selection = alt.selection_multi(fields=['location'], bind='legend')

base = alt.Chart(source, title = "COVID-19 Deaths Per Million of Inhabitants").encode(
    x = alt.X('n_days:Q', title = "Last 200 Days"),
    y = alt.Y("log_cases:Q",title = "Log of deaths per million"),
    color = alt.Color('location:N', legend=alt.Legend(title="Country", labelFontSize=15, titleFontSize=17),
                     scale=alt.Scale(scheme='tableau20')),
    # Countries not picked in the legend are faded rather than removed.
    opacity = alt.condition(selection, alt.value(1), alt.value(0.1))
)

lines = base.mark_line().add_selection(
    scales
).add_selection(
    selection
).properties(
    width=chart_width,
    height=chart_height
)

# One grey dashed line per doubling period.
trend_2d, trend_4d, trend_12d = (
    alt.Chart(source).mark_line(color="grey", strokeDash=[3,3]).encode(
        x = "n_days:Q",
        y = alt.Y(f"trend_{days}days:Q", scale=alt.Scale(domain=y_domain)),
    )
    for days in doubling_periods
)

# Hand-placed captions for the three trend lines (data coordinates).
labels = pd.DataFrame([{'label': 'Doubles every 2 days', 'x_coord': 11.4, 'y_coord': 6.5},
                       {'label': 'Doubles every 4 days', 'x_coord': 30.5, 'y_coord': 6.5},
                       {'label': 'Doubles every 12 days', 'x_coord': 46, 'y_coord': 3},
                      ])
trend_label = (alt.Chart(labels)
                    .mark_text(align='left', dx=-55, dy=-15, fontSize=12, color="grey")
                    .encode(x='x_coord:Q',
                            y='y_coord:Q',
                            text='label:N')
                   )

plot1= (
(trend_2d + trend_4d + trend_12d + trend_label + lines)
.configure_title(fontSize=20)
.configure_axis(labelFontSize=15,titleFontSize=18)
)
# To regenerate the post's preview image, run:
#plot1.save(("../images/covid-permillion-trajectories.png"))
plot1

Last Available Deaths Per Million By Country:

In [4]:
#hide_input
# Latest available value per country, taken from the per-country frames built above.
label = 'Deaths'
metric_name = f'{label} per Million'
log_metric_name = f'Log of {label} per Million'

temp = (
    # Pick and rename columns by name: relabelling by position would silently
    # mislabel (or fail) if the upstream frames ever carried different columns.
    pd.concat(data_countries_pc)[['location', 'date', 'total_cases']]
    .loc[lambda x: x.date >= pd.Timestamp(2020, 3, 1)]  # 1 March 2020 onwards
    .rename(columns={'location': 'Country', 'total_cases': metric_name})
    .assign(**{log_metric_name: lambda x: np.log(x[metric_name])})
)

# The last row of each group is that country's most recent entry in `temp`.
temp.groupby('Country').last()

Unnamed: 0_level_0,date,Deaths per Million,Log of Deaths per Million
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Austria,2021-01-07,751.87861,6.622575
Belgium,2021-01-07,1744.283369,7.464099
Brazil,2021-01-07,957.999157,6.864847
Denmark,2021-01-07,259.52503,5.558853
France,2021-01-07,992.170497,6.899895
Germany,2021-01-07,474.78983,6.162872
Greece,2021-01-07,461.120496,6.133659
Iran,2021-01-07,689.145865,6.535453
Italy,2021-01-07,1302.074265,7.171714
Japan,2021-01-07,28.819201,3.361042


In [53]:
#hide
# Get the confirmed-case data and clean it: per-country trajectories, scaled by
# population, for the "cases" chart and the appendix table.

N_DAYS = 200  # trailing window of dates to keep
ID_COLUMNS = ["Province/State", "Country/Region", "Lat", "Long"]

# `on_bad_lines="skip"` is the replacement for `error_bad_lines=False`, which
# pandas 2.0 removed (the new spelling needs pandas >= 1.3).
data = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", on_bad_lines="skip")

# Keep the country key plus the last N_DAYS date columns. Selecting by name
# rather than by position drops the text/coordinate columns up front, so the
# per-country sum below only ever adds up counts, and a file with fewer than
# N_DAYS dates simply yields a shorter window.
date_columns = [col for col in data.columns if col not in ID_COLUMNS][-N_DAYS:]
data = (
    data[["Country/Region"] + date_columns]
    .rename(columns={"Country/Region": "location"})
    .melt(id_vars=["location"], var_name="date", value_name="total_cases")
    # Provinces/territories of one country are collapsed into a single row per day.
    .groupby(["location", "date"], as_index=False)["total_cases"]
    .sum()
)
data["date"] = pd.to_datetime(data["date"])
data = data.sort_values(by="date")
# Align the JHU country names with the names used in `countries` below.
data.loc[data.location == "US", "location"] = "United States"
data.loc[data.location == "Korea, South", "location"] = "South Korea"

# Population data (last year is 2017 which is what we use)
data_pwt = pd.read_stata("https://www.rug.nl/ggdc/docs/pwt91.dta")

filter1 = data_pwt["year"] == 2017
# `.copy()` so the renames below edit an independent frame, not a view of data_pwt.
data_pop = data_pwt.loc[filter1, ["country", "pop"]].copy()
data_pop.loc[data_pop.country == "Republic of Korea", "country"] = "South Korea"
data_pop.loc[data_pop.country == "Iran (Islamic Republic of)", "country"] = "Iran"

# More countries can be added here if needed; each needs a matching `data_pop` row.
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany",
             "Portugal", "United States", "Singapore","South Korea", "Japan",
             "Brazil", "Iran", "Romania"]

# Fail early, and by name, if a selected country has no population figure.
population = data_pop.set_index("country")["pop"]
missing = [name for name in countries if name not in population.index]
if missing:
    raise KeyError(f"No 2017 population found for: {missing}")

# Scale the selected countries by population. The later cells label the result
# "per million", which relies on PWT reporting `pop` in millions.
data_pc = data.copy()
# Cast first so the float quotients are not written into an integer column.
data_pc["total_cases"] = data_pc["total_cases"].astype(float)
for name in countries:
    rows = data_pc.location == name
    data_pc.loc[rows, "total_cases"] = data_pc.loc[rows, "total_cases"] / float(population.loc[name])

# One frame per country (same order as `countries`), keeping only the days on
# which the scaled value exceeds 1.
filter1 = data_pc["total_cases"] > 1
data_countries = []
data_countries_pc = [data_pc[(data_pc["location"] == name) & filter1] for name in countries]

## Cases Per Million of Inhabitants

Since reaching at least 1 case per million

> Note: The following chart, "Cases Per Million of Inhabitants", is biased by how widely each country administers tests. Please read with caution.

> Tip: Click (Shift+ for multiple) on countries in the legend to filter the visualization.

In [55]:
#hide_input
# Stack the per-country frames into the single long table Altair expects.
# `n_days` is each row's position within its own country's frame.
data_countries_pc2 = []
for country_frame in data_countries_pc:
    country_frame = country_frame.reset_index()
    country_frame['n_days'] = country_frame.index
    country_frame['log_cases'] = np.log(country_frame["total_cases"])
    data_countries_pc2.append(country_frame)
# A single concat instead of growing the frame one country at a time.
data_plot = pd.concat(data_countries_pc2, axis=0)

# Reference trends: a quantity doubling every d days grows by log(2)/d per day
# on a log scale.
doubling_periods = (2, 4, 12)
for days in doubling_periods:
    data_plot[f"trend_{days}days"] = np.log(2)/days*data_plot["n_days"]
for days in doubling_periods:
    data_plot[f"trend_{days}days_label"] = f"Doubles every {days} days"


# Plot it using Altair
source = data_plot
# The trend lines are pinned to the range of the real data so they do not
# stretch the y axis.
y_domain = (0, data_plot["log_cases"].max())

# NOTE(review): `selection_multi` / `add_selection` are the Altair 4 spellings;
# Altair 5 deprecates them in favour of `selection_point` / `add_params`.
scales = alt.selection_interval(bind='scales', zoom=False)
selection = alt.selection_multi(fields=['location'], bind='legend')

base = alt.Chart(source, title = "COVID-19 Confirmed Cases Per Million of Inhabitants").encode(
    x = alt.X('n_days:Q', title = "Last 200 Days"),
    y = alt.Y("log_cases:Q",title = "Log of confirmed cases per million"),
    color = alt.Color('location:N', legend=alt.Legend(title="Country", labelFontSize=15, titleFontSize=17),
                     scale=alt.Scale(scheme='tableau20')),
    # Countries not picked in the legend are faded rather than removed.
    opacity = alt.condition(selection, alt.value(1), alt.value(0.1))
).properties(
    width=chart_width,
    height=chart_height
)

lines = base.mark_line().add_selection(
    scales
).add_selection(
    selection
)

# One grey dashed line per doubling period.
trend_2d, trend_4d, trend_12d = (
    alt.Chart(source).mark_line(color="grey", strokeDash=[3,3]).encode(
        x = "n_days:Q",
        y = alt.Y(f"trend_{days}days:Q", scale=alt.Scale(domain=y_domain)),
    )
    for days in doubling_periods
)

# Hand-placed captions for the three trend lines (data coordinates).
labels = pd.DataFrame([{'label': 'Doubles every 2 days', 'x_coord': 13, 'y_coord': 8.3},
                       {'label': 'Doubles every 4 days', 'x_coord': 37.5, 'y_coord': 8.3},
                       {'label': 'Doubles every 12 days', 'x_coord': 71, 'y_coord': 3},
                      ])
trend_label = (alt.Chart(labels)
                    .mark_text(align='left', dx=-55, dy=-15, fontSize=12, color="grey")
                    .encode(x='x_coord:Q',
                            y='y_coord:Q',
                            text='label:N')
                   )

(
(trend_2d  + trend_4d + trend_12d + trend_label + lines)
.configure_title(fontSize=20)
.configure_axis(labelFontSize=15,titleFontSize=18)
)

In [7]:
#hide_input

# Added by Arun Gupta for Recovery and Death Rates.
# Builds the three tables the charts below read: `df_top_death`,
# `df_top_recovered` and `df_ROW` (every remaining country).

TOP_N = 50  # how many countries each "top" table keeps


def _country_rows(kind):
    """Load one series via `load_individual_timeseries`, minus its ' (total)' rows."""
    raw = load_individual_timeseries(kind)
    # Raw string: `\(` is a regex escape, not a Python string escape.
    is_total_row = raw['country'].str.contains(r' \(total\)')
    return raw[~is_total_row].drop(['state', 'type'], axis=1).reset_index()


def _totals_on(rows, date, value_name):
    """Sum `cases` per country for the rows dated `date`; name the result `value_name`."""
    on_date = rows.loc[rows['date'] == date].drop('date', axis=1)
    # groupby sorts by country itself, so no separate sort is needed.
    return (on_date.groupby(['country'])
                   .sum()
                   .reset_index()
                   .rename(columns={'cases': value_name}))


# load data for infections, deaths, and recovered; all three are read at the
# latest date present in the confirmed series
confirmed_rows = _country_rows('confirmed')
death_rows = _country_rows('deaths')
recovered_rows = _country_rows('recovered')

Latest_Date = confirmed_rows['date'].max()
df_confirmed = _totals_on(confirmed_rows, Latest_Date, 'infections')
df_death = _totals_on(death_rows, Latest_Date, 'deaths')
df_recovered = _totals_on(recovered_rows, Latest_Date, 'recovered')

# (inner)join the three datasets on 'country'
df_Master = (df_confirmed.merge(df_death, how='inner', on='country', sort=True)
                         .merge(df_recovered, how='inner', on='country', sort=True))

# rename the Countries
df_Master['country'] = df_Master['country'].replace({'Bosnia and Herzegovina':'Bosnia Herzegovina',
                                                     'Timor-Leste'           :'East Timor',
                                                     "Cote d'Ivoire"         :'Ivory Coast',
                                                     'Burma'                 :'Myanmar',
                                                     'Korea, South'          :'South Korea',
                                                     'Taiwan*'               :'Taiwan',
                                                     'US'                    :'United States of America',
                                                     'Holy See'              :'Vatican City'})

## keep only the countries with more than 200 confirmed cases
case_threshold = 200
keep_countries = df_Master.loc[(df_Master['infections'] > case_threshold)].country
# `.copy()` so the rate columns below are added to an independent frame.
df_Master = df_Master.loc[df_Master['country'].isin(keep_countries)].copy()

# calculate 'deaths' and 'recovery' per 1000 infections
df_Master['Deaths_per_1000'] = (1000 * (df_Master['deaths']/df_Master['infections'])).round(2)
df_Master['Recovered_per_1000'] = (1000 * (df_Master['recovered']/df_Master['infections'])).round(2)

# get countries with top 50 death rates
df_top_death = df_Master.sort_values(by=['Deaths_per_1000'], ascending=False).head(TOP_N)

# get countries with top 50 recovery rates
df_top_recovered = df_Master.sort_values(by=['Recovered_per_1000'], ascending=False).head(TOP_N)

# get rest of the countries: those in neither top list
in_a_top_list = (df_Master['country'].isin(df_top_death.country)
                 | df_Master['country'].isin(df_top_recovered.country))
df_ROW = df_Master.loc[~in_a_top_list]

This graph shows the 50 countries with the highest rate of deaths per 1000 infections worldwide. Somewhat counterintuitively, developed countries such as Belgium, France and the Netherlands are among the countries with the highest death rates.

In [8]:
#hide_input
# Bar chart of "deaths per 1000 infections" for the countries in `df_top_death`.
# `sort='-y'` orders the bars by descending y value.
alt.Chart(df_top_death).mark_bar(color='Orange').encode(
    # The x channel is built with alt.X (the original used alt.Y here).
    x=alt.X('country:N', sort='-y', title="Countries"),
    y=alt.Y('Deaths_per_1000', title="Deaths per 1000 infected people")
).properties(
    title='Countries with top death rates')

Next, we move on to the countries with the best recovery rates. The graph below shows the 50 countries with the highest rate of recovery.

In [9]:
#hide_input
# Bar chart of "recovered per 1000 infections" for the countries in `df_top_recovered`.
# `sort='-y'` orders the bars by descending y value.
alt.Chart(df_top_recovered).mark_bar(color='#00CC66').encode(
    # The x channel is built with alt.X (the original used alt.Y here).
    x=alt.X('country:N', sort='-y', title="Countries"),
    y=alt.Y('Recovered_per_1000', title="Recovered per 1000 infected people")
).properties(
    title='Countries with top recovery rates')

Finally, we project the data from the graphs above onto a scatter plot. The orange and green bubbles are the countries from the two bar charts above; the grey bubbles are the rest of the countries.

In [10]:
#hide_input
# Scatter plot of recovery rate (x) against death rate (y), one layer per
# country group. Every layer shares the same encoding, size and axis titles;
# only the data frame and the bubble colour differ.
chart_high_recovery, chart_high_death, chart_rest = (
    alt.Chart(frame).mark_circle(size=150, color=bubble_color).encode(
        # The x channel is built with alt.X (the original used alt.Y here).
        x=alt.X('Recovered_per_1000',  title="Recovery per 1000 infections"),
        y=alt.Y('Deaths_per_1000',  title="Deaths per 1000 infections"),
        tooltip=['country', 'Recovered_per_1000', 'Deaths_per_1000']
    ).properties(
        width=700,
        height=450)
    for frame, bubble_color in (
        (df_top_recovered, '#00CC66'),  # green: top recovery rates
        (df_top_death, 'Orange'),       # orange: top death rates
        (df_ROW, '#C3C3C3'),            # grey: all remaining countries
    )
)

# Later layers are drawn on top of earlier ones.
(chart_high_recovery + chart_high_death + chart_rest).interactive()

## Appendix

Last Available Cases Per Million By Country:

In [11]:
#hide_input
# Latest available value per country, taken from the per-country frames built above.
label = 'Cases'
metric_name = f'{label} per Million'
log_metric_name = f'Log of {label} per Million'

temp = (
    # Pick and rename columns by name: relabelling by position would silently
    # mislabel (or fail) if the upstream frames ever carried different columns.
    pd.concat(data_countries_pc)[['location', 'date', 'total_cases']]
    .loc[lambda x: x.date >= pd.Timestamp(2020, 3, 1)]  # 1 March 2020 onwards
    .rename(columns={'location': 'Country', 'total_cases': metric_name})
    .assign(**{log_metric_name: lambda x: np.log(x[metric_name])})
)

# The last row of each group is that country's most recent entry in `temp`.
temp.groupby('Country').last()

Unnamed: 0_level_0,date,Cases per Million,Log of Cases per Million
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brazil,2020-04-27,322.263619,5.77537
China,2020-04-27,59.536687,4.086593
France,2020-04-27,2468.719524,7.811455
Germany,2020-04-27,1933.379942,7.567025
Iran,2020-04-27,1127.018943,7.027331
Italy,2020-04-27,3359.405848,8.119519
Japan,2020-04-27,111.01746,4.709687
Portugal,2020-04-27,2326.055107,7.751929
Singapore,2020-04-27,2526.430838,7.834563
South Korea,2020-04-27,210.897087,5.35137


This analysis was conducted by [Joao B. Duarte](http://jbduarte.com). Assistance with creating visualizations was provided by [Hamel Husain](https://twitter.com/HamelHusain). 
The highest Recovery and Death rates were added by [Arun Gupta](https://www.linkedin.com/in/arungupta21/). 
Relevant sources are listed below: 


1. ["2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE"](https://systems.jhu.edu/research/public-health/ncov/) [GitHub repository](https://github.com/CSSEGISandData/COVID-19). 

2. [Feenstra, Robert C., Robert Inklaar and Marcel P. Timmer (2015), "The Next Generation of the Penn World Table" American Economic Review, 105(10), 3150-3182](https://www.rug.nl/ggdc/productivity/pwt/related-research)