# COVID-19: Comparing Country Trajectories Per Million Inhabitants
> Comparing how similar countries' trajectories are after adjusting for population size.

- comments: true
- author: Joao B. Duarte
- categories: [growth, compare, interactive]
- image: images/covid-permillion-trajectories.png
- permalink: /covid-compare-permillion/

In [27]:
#hide
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
%config InlineBackend.figure_format = 'retina'

In [28]:
#hide
# Configure matplotlib text sizes, then build `data_countries_pc`: a list with one
# DataFrame per entry of `countries` (same order) holding confirmed cases divided by
# population, restricted to the rows where that ratio exceeds 1.

SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 18
chart_size = (11,6)

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# --- Confirmed cases (JHU CSSE): wide table with one column per date ---------
# NOTE(review): JHU CSSE later reorganised its time-series files; confirm this URL
# still resolves before relying on a fresh run.
# `on_bad_lines="skip"` is the replacement for `error_bad_lines=False`, which pandas
# deprecated in 1.3 and removed in 2.0; malformed rows are still skipped.
data = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv", on_bad_lines="skip")
data = data.drop(columns=["Lat", "Long"])
data = data.melt(id_vars= ["Province/State", "Country/Region"])
# Collapse provinces into country totals. Only `value` is summed so the text column
# `Province/State` can never end up in the result (newer pandas would otherwise try
# to aggregate it too).
data = data.groupby(["Country/Region", "variable"], as_index=False)["value"].sum()
data = data.rename(columns={"Country/Region": "location", "variable": "date", "value": "total_cases"})
data['date'] = pd.to_datetime(data.date)
data = data.sort_values(by = "date")
# Align country names with the population table.
data.loc[data.location == "US","location"] = "United States"
data.loc[data.location == "Korea, South","location"] = "South Korea"

# --- Population data (last year is 2017 which is what we use) ----------------
data_pwt = pd.read_stata("https://www.rug.nl/ggdc/docs/pwt91.dta")

filter1 = data_pwt["year"] == 2017
# Explicit copy: the renames below must modify an independent frame, not a view
# of `data_pwt` (avoids SettingWithCopyWarning).
data_pop = data_pwt.loc[filter1, ["country","pop"]].copy()
data_pop.loc[data_pop.country == "Republic of Korea","country"] = "South Korea"
data_pop.loc[data_pop.country == "Iran (Islamic Republic of)","country"] = "Iran"

# --- Per-inhabitant series ---------------------------------------------------
data_pc = data.copy()

# I can add more countries if needed
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany", 
             "Portugal", "United States", "Singapore","South Korea", "Japan", 
             "Brazil","Iran"]

data_countries = []
data_countries_pc = []

# Population lookup keyed by country name.
# NOTE(review): PWT documents `pop` as population in millions, which is what makes the
# ratio below "per million" — confirm against the PWT 9.1 variable list.
population = data_pop.set_index("country")["pop"].astype(float)
missing_population = [name for name in countries if name not in population.index]
if missing_population:
    # Fail with a readable message instead of an opaque float() conversion error.
    raise KeyError(f"No 2017 population found for: {missing_population}")

# Divide only the selected countries; every other location keeps its raw counts.
is_selected = data_pc["location"].isin(countries)
data_pc["total_cases"] = data_pc["total_cases"].astype(float)
data_pc.loc[is_selected, "total_cases"] = (
    data_pc.loc[is_selected, "total_cases"]
    / data_pc.loc[is_selected, "location"].map(population)
)

# Keep, for each country, only the observations above the threshold of 1.
filter1 = data_pc["total_cases"] > 1
data_countries_pc = [
    data_pc[(data_pc["location"] == name) & filter1] for name in countries
]

## Cases per million inhabitants since major outbreak

In [29]:
#hide_input
# Stack the per-country frames into one long table (the format Altair expects).
# For each country: `n_days` is the row position within its filtered series and
# `log_cases` is the natural log of the per-population value.
data_countries_pc2 = []
for country_frame in data_countries_pc:
    country_frame = country_frame.reset_index()
    country_frame['n_days'] = country_frame.index
    country_frame['log_cases'] = np.log(country_frame["total_cases"])
    data_countries_pc2.append(country_frame)
# Single concat instead of growing the frame inside a loop.
data_plot = pd.concat(data_countries_pc2, axis=0)

# Reference trends. `log_cases` is a NATURAL log, so a series that doubles every
# k days rises by ln(2)/k per day (a slope of 1/k would only be right on a log2 axis).
LN_2 = np.log(2)
data_plot["trend_2days"] = data_plot["n_days"] * LN_2 / 2
data_plot["trend_4days"] = data_plot["n_days"] * LN_2 / 4
data_plot["trend_12days"] = data_plot["n_days"] * LN_2 / 12
data_plot["trend_2days_label"] = "Doubles every 2 days"
data_plot["trend_4days_label"] = "Doubles every 4 days"
data_plot["trend_12days_label"] = "Doubles every 12 days"


# Plot it using Altair
source = data_plot

scales = alt.selection_interval(bind='scales')

# Upper bound shared by the trend lines so they do not stretch the y-axis
# beyond the observed data.
log_cases_max = float(data_plot["log_cases"].max())
trend_scale = alt.Scale(domain=(0, log_cases_max))

base = alt.Chart(source, title = "COVID-19 Confirmed Cases Since Outbreak").encode(
    # This chart shows confirmed cases, so the day counter starts at 1 case per million.
    x = alt.X('n_days:Q', title = "Days passed since reaching 1 confirmed case per million of inhabitants"),
    y = alt.Y("log_cases:Q",title = "Log of Confirmed Cases Per Million of Habitants"),
    color = alt.Color('location:N', legend=alt.Legend(title="Country"))
)

lines = base.mark_line().add_selection(
    scales
)

# Only the first trend layer carries the legend title; the others join the same legend.
trend_2d = alt.Chart(source).encode(
    x = "n_days:Q",
    y = alt.Y("trend_2days:Q",  scale=trend_scale),
    stroke = alt.Stroke("trend_2days_label:N", legend = alt.Legend(title = "Time Trends")
                       )
).mark_line( strokeDash=[1,1], opacity= 0.5, color="#D3D3D3")

trend_4d = alt.Chart(source).mark_line(color="#D3D3D3", strokeDash=[1,1], opacity= 0.5).encode(
    x = "n_days:Q",
    y = alt.Y("trend_4days:Q",  scale=trend_scale),
    stroke = "trend_4days_label:N"
)

trend_12d = alt.Chart(source).mark_line(color="#D3D3D3", strokeDash=[1,1], opacity= 0.5).encode(
    x = "n_days:Q",
    y = alt.Y("trend_12days:Q",  scale=trend_scale),
    stroke = "trend_12days_label:N"
)

(trend_2d + trend_4d + trend_12d +  lines )

In [4]:
#hide
# Writes a figure object named `fig` to the PNG path that the post's front matter
# lists as its preview image.
# NOTE(review): `fig` is not assigned in any cell visible in this notebook (the charts
# above are Altair expressions, not a variable called `fig`), so this cell relies on
# state from outside this view — confirm where `fig` is supposed to come from.
# NOTE(review): the recorded output for this cell is a FileNotFoundError for this path;
# presumably `../images` did not exist relative to the working directory — verify
# before re-running.
fig.savefig('../images/covid-permillion-trajectories.png')

FileNotFoundError: [Errno 2] No such file or directory: '../images/covid-permillion-trajectories.png'

Last Available Count on Confirmed Cases By Country

In [30]:
#hide_input
# Latest available value per country (observations from 1 March 2020 onwards),
# shown together with its base-10 logarithm.
label = 'Cases'
metric_name = f'{label} per Million'

temp = pd.concat([x.copy() for x in data_countries_pc])
# Pick and rename columns by NAME: assigning `temp.columns = [...]` positionally
# silently mislabels (or fails) as soon as the upstream frame has a different
# column count or order.
temp = temp.loc[temp["date"] >= pd.Timestamp("2020-03-01"), ["location", "date", "total_cases"]]
temp = temp.rename(columns={"location": "Country", "total_cases": metric_name})
# Vectorised log10 instead of a per-element lambda.
temp[f'Log of {label} per Million'] = np.log10(temp[metric_name])

# Rows are in date order, so `last()` is the most recent observation per country.
temp.groupby('Country').last()

Unnamed: 0_level_0,date,Cases per Million,Log of Cases per Million
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brazil,2020-03-19,2.967199,0.472347
China,2020-03-19,57.577151,1.76025
France,2020-03-19,162.837938,2.211756
Germany,2020-03-19,186.569374,2.27084
Iran,2020-03-19,226.791124,2.355626
Italy,2020-03-19,691.291579,2.839661
Japan,2020-03-19,7.247943,0.860215
Portugal,2020-03-19,75.99589,1.88079
Singapore,2020-03-19,60.432548,1.781271
South Korea,2020-03-19,167.999772,2.225309


## Deaths per million inhabitants since major outbreak

In [31]:
#hide 
# Rebuild `data_countries_pc` for DEATHS: a list with one DataFrame per entry of
# `countries` (same order) holding deaths divided by population, restricted to the
# rows where that ratio exceeds 1. The value column keeps the name `total_cases`
# because the cells below read it under that name.

# --- Deaths (JHU CSSE): wide table with one column per date ------------------
# NOTE(review): JHU CSSE later reorganised its time-series files; confirm this URL
# still resolves before relying on a fresh run.
# `on_bad_lines="skip"` is the replacement for `error_bad_lines=False`, which pandas
# deprecated in 1.3 and removed in 2.0; malformed rows are still skipped.
data = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv", on_bad_lines="skip")
data = data.drop(columns=["Lat", "Long"])
data = data.melt(id_vars= ["Province/State", "Country/Region"])
# Collapse provinces into country totals. Only `value` is summed so the text column
# `Province/State` can never end up in the result (newer pandas would otherwise try
# to aggregate it too).
data = data.groupby(["Country/Region", "variable"], as_index=False)["value"].sum()
data = data.rename(columns={"Country/Region": "location", "variable": "date", "value": "total_cases"})
data['date'] = pd.to_datetime(data.date)
data = data.sort_values(by = "date")
# Align country names with the population table.
data.loc[data.location == "US","location"] = "United States"
data.loc[data.location == "Korea, South","location"] = "South Korea"

# --- Population data (2017 is the year used) ---------------------------------
data_pwt = pd.read_stata("https://www.rug.nl/ggdc/docs/pwt91.dta")

filter1 = data_pwt["year"] == 2017
# Explicit copy: the renames below must modify an independent frame, not a view
# of `data_pwt` (avoids SettingWithCopyWarning).
data_pop = data_pwt.loc[filter1, ["country","pop"]].copy()
data_pop.loc[data_pop.country == "Republic of Korea","country"] = "South Korea"
data_pop.loc[data_pop.country == "Iran (Islamic Republic of)","country"] = "Iran"

# --- Per-inhabitant series ---------------------------------------------------
data_pc = data.copy()
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany", 
             "Portugal", "United States", "Singapore","South Korea", "Japan", 
             "Brazil","Iran"]
data_countries = []
data_countries_pc = []

# Population lookup keyed by country name.
# NOTE(review): PWT documents `pop` as population in millions, which is what makes the
# ratio below "per million" — confirm against the PWT 9.1 variable list.
population = data_pop.set_index("country")["pop"].astype(float)
missing_population = [name for name in countries if name not in population.index]
if missing_population:
    # Fail with a readable message instead of an opaque float() conversion error.
    raise KeyError(f"No 2017 population found for: {missing_population}")

# Divide only the selected countries; every other location keeps its raw counts.
is_selected = data_pc["location"].isin(countries)
data_pc["total_cases"] = data_pc["total_cases"].astype(float)
data_pc.loc[is_selected, "total_cases"] = (
    data_pc.loc[is_selected, "total_cases"]
    / data_pc.loc[is_selected, "location"].map(population)
)

# Keep, for each country, only the observations above the threshold of 1.
filter1 = data_pc["total_cases"] > 1
data_countries_pc = [
    data_pc[(data_pc["location"] == name) & filter1] for name in countries
]

In [32]:
#hide_input
# Stack the per-country frames into one long table (the format Altair expects).
# For each country: `n_days` is the row position within its filtered series and
# `log_cases` is the natural log of the per-population value (deaths in this section).
data_countries_pc2 = []
for country_frame in data_countries_pc:
    country_frame = country_frame.reset_index()
    country_frame['n_days'] = country_frame.index
    country_frame['log_cases'] = np.log(country_frame["total_cases"])
    data_countries_pc2.append(country_frame)
# Single concat instead of growing the frame inside a loop.
data_plot = pd.concat(data_countries_pc2, axis=0)

# Reference trends. `log_cases` is a NATURAL log, so a series that doubles every
# k days rises by ln(2)/k per day (a slope of 1/k would only be right on a log2 axis).
LN_2 = np.log(2)
data_plot["trend_2days"] = data_plot["n_days"] * LN_2 / 2
data_plot["trend_4days"] = data_plot["n_days"] * LN_2 / 4
data_plot["trend_12days"] = data_plot["n_days"] * LN_2 / 12
data_plot["trend_2days_label"] = "Doubles every 2 days"
data_plot["trend_4days_label"] = "Doubles every 4 days"
data_plot["trend_12days_label"] = "Doubles every 12 days"


# Plot it using Altair
source = data_plot

scales = alt.selection_interval(bind='scales')

# Upper bound shared by the trend lines so they do not stretch the y-axis
# beyond the observed data.
log_cases_max = float(data_plot["log_cases"].max())
trend_scale = alt.Scale(domain=(0, log_cases_max))

# The data in this section are deaths, so the chart title and y-axis say so
# (they previously repeated the confirmed-cases wording).
base = alt.Chart(source, title = "COVID-19 Deaths Since Outbreak").encode(
    x = alt.X('n_days:Q', title = "Days passed since reaching 1 death per million of inhabitants"),
    y = alt.Y("log_cases:Q",title = "Log of Deaths Per Million of Habitants"),
    color = alt.Color('location:N', legend=alt.Legend(title="Country"))
)

lines = base.mark_line().add_selection(
    scales
)

# Only the first trend layer carries the legend title; the others join the same legend.
trend_2d = alt.Chart(source).encode(
    x = "n_days:Q",
    y = alt.Y("trend_2days:Q",  scale=trend_scale),
    stroke = alt.Stroke("trend_2days_label:N", legend = alt.Legend(title = "Time Trends")
                       )
).mark_line( strokeDash=[1,1], opacity= 0.5, color="#D3D3D3")

trend_4d = alt.Chart(source).mark_line(color="#D3D3D3", strokeDash=[1,1], opacity= 0.5).encode(
    x = "n_days:Q",
    y = alt.Y("trend_4days:Q",  scale=trend_scale),
    stroke = "trend_4days_label:N"
)

trend_12d = alt.Chart(source).mark_line(color="#D3D3D3", strokeDash=[1,1], opacity= 0.5).encode(
    x = "n_days:Q",
    y = alt.Y("trend_12days:Q",  scale=trend_scale),
    stroke = "trend_12days_label:N"
)

(trend_2d + trend_4d + trend_12d +  lines )

In [33]:
#hide_input
# Latest available value per country (observations from 1 March 2020 onwards),
# shown together with its base-10 logarithm.
label = 'Deaths'
metric_name = f'{label} per Million'

temp = pd.concat([x.copy() for x in data_countries_pc])
# Pick and rename columns by NAME: assigning `temp.columns = [...]` positionally
# silently mislabels (or fails) as soon as the upstream frame has a different
# column count or order.
temp = temp.loc[temp["date"] >= pd.Timestamp("2020-03-01"), ["location", "date", "total_cases"]]
temp = temp.rename(columns={"location": "Country", "total_cases": metric_name})
# Vectorised log10 instead of a per-element lambda.
temp[f'Log of {label} per Million'] = np.log10(temp[metric_name])

# Rows are in date order, so `last()` is the most recent observation per country.
temp.groupby('Country').last()

Unnamed: 0_level_0,date,Deaths per Million,Log of Deaths per Million
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,2020-03-19,2.305044,0.362679
France,2020-03-19,3.614654,0.558067
Iran,2020-03-19,15.820058,1.199208
Italy,2020-03-19,57.361955,1.758624
South Korea,2020-03-19,1.784936,0.251623
Spain,2020-03-19,17.905559,1.252988
United Kingdom,2020-03-19,2.085172,0.319142


This analysis was conducted by [Joao B. Duarte](https://www.jbduarte.com). Relevant sources are listed below: 


1. ["2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE"](https://systems.jhu.edu/research/public-health/ncov/) [GitHub repository](https://github.com/CSSEGISandData/COVID-19). 

2. [Feenstra, Robert C., Robert Inklaar and Marcel P. Timmer (2015), "The Next Generation of the Penn World Table" American Economic Review, 105(10), 3150-3182](https://www.rug.nl/ggdc/productivity/pwt/related-research)