# COVID-19 statistics and trend

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as py
from scipy.optimize import curve_fit
from datetime import datetime
import re
import warnings
warnings.simplefilter('ignore')

In [2]:
# Let pandas print up to 250 rows of a displayed frame before truncating it.
pd.set_option("display.max_rows", 250)

In [3]:
# Make plotly's 'notebook' renderer the default for every figure shown in this notebook.
pio.renderers.default = 'notebook'

In [4]:
# Stamp the notebook with the local wall-clock time of this run (minute resolution).
# `datetime` is already imported in the first cell, so it is not re-imported here.
now = datetime.now()
current_time = now.strftime("%d-%m-%Y %H:%M")
print(f"Updated on {current_time} h")

Updated on 30-08-2020 18:18 h


In [5]:
################################ Loading xls

In [6]:
# Remote Excel workbook that the next cell downloads.
# NOTE(review): the path spells "disbtribution"; the recorded outputs further down
# suggest the download worked with this exact spelling -- verify before "correcting" it.
url = "https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide.xlsx"

In [7]:
# Download the workbook at `url` and load it as a DataFrame (requires network access on every run).
cases = pd.read_excel(url)

In [8]:
#filepath = "C:\\Users\\edidd\\Documents\\Ubiqum\\Data Analytics Course\\covid19\\data\\"

In [9]:
#cases = pd.read_excel(os.path.join(filepath, "COVID-19-geographic-disbtribution-worldwide-2020-04-23.xlsx"))

In [10]:
# filepath2 = "C:\\Users\\edidd\\Documents\\covid19_JHU_CSSE\\COVID-19\\"

In [11]:
# data_jhu= pd.read_csv(os.path.join(filepath2, "who_covid_19_situation_reports\\who_covid_19_sit_rep_time_series\\who_covid_19_sit_rep_time_series.csv"))

In [12]:
############################# Data wrangling

In [13]:
# Convert the reporting-date column to pandas datetimes (year-month-day layout).
cases["dateRep"] = pd.to_datetime(cases["dateRep"], format="%Y-%m-%d")

In [14]:
# Give the columns used throughout the notebook shorter, analysis-friendly names.
cases = cases.rename(
    columns=dict(
        dateRep="date",
        countriesAndTerritories="country",
        cases="newcases",
        deaths="newdeaths",
    )
)

In [15]:
# Order rows chronologically within each country; the per-country running total
# computed in the next cell depends on this order.
cases = cases.sort_values(["country", "date"])

In [16]:
# Running total of reported cases, restarted for every country.
cases["cumcases"] = cases["newcases"].groupby(cases["country"]).cumsum()

In [17]:
######################### Deleting rows with 0 cumulative cases

In [18]:
# Index the rows by country so later cells can select one country with `cases.loc[country]`.
cases= cases.set_index("country")

## Summary

In [19]:
# Grouping by country

In [20]:
# Per-country totals, largest case count first. Only numeric columns are summed:
# the frame also holds text and datetime columns, for which a sum is either
# meaningless (string concatenation) or an error, depending on the pandas version.
cases_total = (
    cases.groupby("country")
    .sum(numeric_only=True)
    .sort_values(by="newcases", ascending=False)
)

In [21]:
# After summing, the daily columns hold per-country totals; name them accordingly.
cases_total = cases_total.rename(
    columns=dict(newcases="total_cases", newdeaths="total_deaths")
)

In [22]:
# Remove columns whose per-country sum carries no meaning (date parts, population,
# running totals and the 14-day incidence figure).
meaningless_sums = [
    "day",
    "month",
    "year",
    "popData2019",
    "cumcases",
    "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000",
]
cases_total = cases_total.drop(columns=meaningless_sums)

In [23]:
# Deaths as a percentage of confirmed cases.
cases_total["death_ratio"] = (
    cases_total["total_deaths"].div(cases_total["total_cases"]).mul(100)
)

In [24]:
# Population per country, taken as the maximum of the population column within each
# country. The column is selected before aggregating so that `max` is never
# evaluated on the unrelated text columns of `cases`.
cases_total["population_2019"] = cases.groupby("country")["popData2019"].max()

In [25]:
# Confirmed cases as a percentage of the country's population.
cases_total["cases_to_population"] = (
    cases_total["total_cases"].div(cases_total["population_2019"]).mul(100)
)

In [26]:
# Replace the numeric percentage with display text such as "1.81%" (two decimals,
# thousands separator). The column holds strings from here on, not numbers.
cases_total["cases_to_population"]= cases_total["cases_to_population"].map('{:,.2f}%'.format)

In [27]:
# Replace the numeric death ratio with display text such as "3.07%" (two decimals,
# thousands separator). The column holds strings from here on, not numbers.
cases_total["death_ratio"]= cases_total["death_ratio"].map('{:,.2f}%'.format)

### Global total of confirmed cases

In [28]:
# Sum of the per-country case totals, shown as the cell output.
cases_total.total_cases.sum()

25029408

### Global total of deaths

In [29]:
# Sum of the per-country death totals, shown as the cell output.
cases_total.total_deaths.sum()

843158

### Totals per country

In [30]:
# Bar chart of the ten countries with the most confirmed cases. `cases_total` was
# sorted by case count (descending) when it was built, so head(10) is the top ten.
# x, y and the bar labels all come from the same slice so their lengths match.
top10 = cases_total.head(10)
fig = go.Figure(
    data=[
        go.Bar(
            x=top10.index,
            y=top10.total_cases,
            # cases_to_population is already formatted text such as "1.81%"
            text="*" + top10.cases_to_population,
            textposition="auto",
        )
    ]
)
fig.update_layout(title= "Total confirmed cases per country - top 10", 
                  xaxis_title= "* Showing percentage of country's population")
fig.show()

In [31]:
# Independent copy of the countries that reported at least 1000 cases.
morethan1000 = cases_total[cases_total["total_cases"] >= 1000].copy()

In [32]:
# Recompute the death ratio as a plain fraction: the column inherited from
# `cases_total` was converted to formatted text and cannot be binned numerically.
morethan1000["death_ratio"] = morethan1000["total_deaths"].div(morethan1000["total_cases"])

In [33]:
# Distribution of the death ratio (a fraction) over the countries in `morethan1000`,
# normalised to probabilities and binned in steps of 0.01.
death_ratio_hist = go.Histogram(
    x=morethan1000.death_ratio,
    histnorm="probability",
    xbins={"start": 0, "end": morethan1000.death_ratio.max(), "size": .01},
)

fig = go.Figure()
fig.add_trace(death_ratio_hist)
fig.update_layout(
    title="Global distribution of death ratio",
    xaxis={
        "tickformat": "%",
        "tickangle": 0,
        "title": '* Only countries with more than 999 reported cases are considered',
    },
)

pio.show(fig)

Complete list

In [34]:
# Show the per-country summary, largest case count first (the result is only displayed, not stored).
cases_total.sort_values("total_cases", ascending= False)

Unnamed: 0_level_0,total_cases,total_deaths,death_ratio,population_2019,cases_to_population
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United_States_of_America,5961582,182779,3.07%,329064900.0,1.81%
Brazil,3846153,120462,3.13%,211049500.0,1.82%
India,3542733,63498,1.79%,1366418000.0,0.26%
Russia,985346,17025,1.73%,145872300.0,0.68%
Peru,639435,28607,4.47%,32510460.0,1.97%
South_Africa,622551,13981,2.25%,58558270.0,1.06%
Colombia,599914,19064,3.18%,50339440.0,1.19%
Mexico,591712,63819,10.79%,127575500.0,0.46%
Spain,439286,29011,6.60%,46937060.0,0.94%
Chile,408009,11181,2.74%,18952040.0,2.15%


## Trends

Trends are shown for countries with more than 999 reported cases. Double-click a country name in the legend next to a figure to show only that country.

In [35]:
# Reorder the summary alphabetically by country (the frame's index), so the plot
# legends built from it are alphabetical.
cases_total = cases_total.sort_index()

In [36]:
# Names of the countries with at least 1000 reported cases; every trend plot iterates over these.
country_list = cases_total[cases_total["total_cases"] >= 1000].index

In [37]:
# One marker trace per selected country: reported new cases against reporting date.
fig = go.Figure()
for country in country_list:
    country_rows = cases.loc[country]
    daily_trace = go.Scatter(
        x=country_rows.date,
        y=country_rows.newcases,
        mode="markers",
        name=country,
    )
    fig.add_trace(daily_trace)
fig.update_layout(title="Daily new cases per country")

pio.show(fig)

In [38]:
# New cases scaled to a population of 100,000.
cases["newcasesx100k"] = cases["newcases"].mul(100000).div(cases["popData2019"])

In [39]:
# Turn the country index back into an ordinary column so the frame can be re-indexed by date.
cases= cases.reset_index()

In [40]:
# Index by date: the weekly resampling below works on the date index.
cases= cases.set_index("date")

In [41]:
# Earliest reporting date across all countries.
start_date= cases.index.min()

In [42]:
# Latest reporting date across all countries.
end_date= cases.index.max()

In [43]:
# Whole days between the first and the last reporting date.
days = (end_date - start_date).days 

In [44]:
# Days left over after splitting the span into whole weeks; used as the resampling
# offset in the next cell (presumably so that the last weekly bin ends on `end_date`).
offset = days % 7

In [45]:
# Weekly sums of new cases per 100,000 inhabitants, per country.
# * The column is selected before summing, so text/datetime columns are never summed.
# * `origin=start_date` anchors the 7-day bins to the global first date for every
#   country. Without it each country's bins start at that country's own first day,
#   and the globally computed `offset` only lines up for countries whose data begin
#   on `start_date`.
# * With `closed="right"`, bins are (edge, edge + 7 days], so the final bin ends
#   exactly on `end_date` (= start_date + offset + a whole number of weeks).
cases_7d = (
    cases.groupby("country")["newcasesx100k"]
    .resample(
        pd.Timedelta(days=7),
        origin=start_date,
        offset=pd.Timedelta(days=offset),
        closed="right",
    )
    .sum()
    .to_frame()
)

In [46]:
# One line per selected country: weekly new cases per 100,000 inhabitants.
fig = go.Figure()

for country in country_list:
    weekly_rows = cases_7d.loc[country]
    fig.add_trace(
        go.Scatter(
            x=weekly_rows.index,
            y=weekly_rows.newcasesx100k,
            mode="lines",
            name=country,
        )
    )

# Horizontal reference line at 50, spanning the full date range of the data.
limit_style = {"color": "Red", "width": 3, "dash": "dash"}
fig.add_shape(
    type="line",
    x0=cases.index.min(),
    y0=50,
    x1=cases.index.max(),
    y1=50,
    line=limit_style,
)

fig.update_layout(
    title="Weekly new cases per 100,000 inhabitants",
    xaxis={"title": "Safety limit of 50 new cases per 100,000 people is marked with a red dashed line"},
)

pio.show(fig)

In [47]:
# Turn the date index back into an ordinary column.
cases= cases.reset_index()

In [48]:
# Back to a country index so the following cells can use `cases.loc[country]`.
cases= cases.set_index("country")

In [49]:
# One line per selected country: running total of reported cases over time.
fig = go.Figure()
for country in country_list:
    country_rows = cases.loc[country]
    fig.add_trace(
        go.Scatter(
            x=country_rows.date,
            y=country_rows.cumcases,
            mode="lines",
            name=country,
        )
    )
# Title spelling corrected ("Cumulative"). Add yaxis_type="log" here for a log scale.
fig.update_layout(title="Cumulative cases per country")
pio.show(fig)

In [50]:
# Growth factor

In [51]:
# cases["growfactor"] = cases.newcases / cases.groupby("country")["newcases"].shift(1).fillna(0)

In [52]:
# cases = cases.dropna()

In [53]:
# cases = cases.replace(np.inf, 0)

### Exponential fitting

In [54]:
# Keep only rows where the country's running total has reached 20 cases.
# `.copy()` makes the result an independent frame, so the columns added in the next
# cells are written to it directly rather than to a slice of the old frame
# (pandas' SettingWithCopy situation, which the global warnings filter would hide).
cases = cases.loc[cases.cumcases >= 20, :].copy()

In [55]:
# First remaining date of each country, repeated on every row of that country.
cases["datemin"] = cases.groupby("country")["date"].transform("min")

In [56]:
# Time elapsed since the country's first remaining date (a timedelta at this point).
cases["days"] = cases["date"].sub(cases["datemin"])

In [57]:
# Reduce the elapsed time to a whole number of days, the x variable of the fit.
# This only works while `days` is still a timedelta column, i.e. once per run.
cases["days"] = cases["days"].dt.days

In [58]:
def exponential_growth(x, a, c_o):
    """Evaluate the exponential model ``c_o * a**x``.

    Parameters
    ----------
    x : scalar or array-like
        Exponent; the notebook passes the number of days elapsed.
    a : scalar
        Growth factor per unit of ``x``.
    c_o : scalar
        Model value at ``x == 0``.

    Returns
    -------
    Same kind of object as ``x`` (scalar in, scalar out; array in, array out).
    """
    growth = a ** x
    return c_o * growth

In [59]:
def country_fit(df, country):
    """Fit ``exponential_growth`` to one country's cumulative cases.

    Parameters
    ----------
    df : DataFrame indexed by country, with ``days`` and ``cumcases`` columns.
    country : index label selecting the rows to fit.

    Returns
    -------
    The optimal parameters found by ``curve_fit``, in the order ``(a, c_o)``.
    The covariance estimate is discarded.
    """
    country_rows = df.loc[country]
    fitted_params, _covariance = curve_fit(
        exponential_growth, country_rows.days, country_rows.cumcases
    )
    return fitted_params

In [60]:
# Containers filled by the fitting loop in the next cell:
# popt_list collects the fitted parameter arrays, one per country;
# popt_df gets one row per country holding the first fitted parameter in column "A".
popt_list= []
popt_df= pd.DataFrame(columns= ["A"])

In [61]:
# Fit every selected country once. Both containers are rebuilt from scratch here, so
# re-running this cell cannot append duplicate entries or mismatch countries and
# parameters. Building the frame in one go also gives column "A" a float dtype.
popt_list = [country_fit(cases, country) for country in country_list]
popt_df = pd.DataFrame(
    {"A": [popt[0] for popt in popt_list]},  # first fitted parameter = growth factor a
    index=list(country_list),
)

#### Factor A of exponential growth (C = Co * A^d)

It can be interpreted as the average daily growth factor of the cumulative number of cases.

In [62]:
popt_df.sort_values("A", ascending= False)

Unnamed: 0,A
Trinidad_and_Tobago,1.084091
Aruba,1.07016
Gambia,1.060931
Bahamas,1.050291
Libya,1.04789
Namibia,1.047102
Syria,1.04093
Lebanon,1.03961
Ethiopia,1.03924
Paraguay,1.036957


#### Exponential fitting

In [63]:
# For every selected country draw two lines over the elapsed days: the observed
# cumulative cases and the fitted exponential curve (parameters from popt_list).
fig = go.Figure()
for i, country in enumerate(country_list):
    country_rows = cases.loc[country]
    observed_trace = go.Scatter(
        x=country_rows.days,
        y=country_rows.cumcases,
        mode="lines",
        name=country,
    )
    fitted_trace = go.Scatter(
        x=country_rows.days,
        y=exponential_growth(country_rows.days, *popt_list[i]),
        mode="lines",
        name="exponential fit",
    )
    fig.add_trace(observed_trace)
    fig.add_trace(fitted_trace)
fig.update_layout(title="Exponential fit per country")

pio.show(fig)

###  Change in the number of reported new cases

A positive change means that more new cases were reported than in the previous period. A negative change means that fewer new cases were reported, which is good news!

In [64]:
# Same selection as the earlier `country_list` cell (countries with at least 1000 cases).
# NOTE(review): `cases_total` does not appear to change between the two cells, so this
# looks redundant when the notebook is run top to bottom -- confirm before removing.
country_list= cases_total.loc[cases_total.total_cases >= 1000, :].index

In [65]:
# Change in new cases versus the previous row of the same country. A country's first
# row has no predecessor; the missing value is replaced by 0, so its change equals
# its own new-case count.
cases["growspeed"] = cases["newcases"].sub(
    cases.groupby("country")["newcases"].shift(1).fillna(0)
)

In [66]:
# One marker trace per selected country: row-to-row change in reported new cases.
fig = go.Figure()
for country in country_list:
    country_rows = cases.loc[country]
    change_trace = go.Scatter(
        x=country_rows.date,
        y=country_rows.growspeed,
        mode="markers",
        name=country,
    )
    fig.add_trace(change_trace)
fig.update_layout(title="Daily change in number of new cases")

pio.show(fig)

In [67]:
#### Grow speed with different aggregations

In [68]:
# Turn the country index back into an ordinary column so the frame can be re-indexed by date.
cases= cases.reset_index()

In [69]:
# Index by date again: the weekly resampling below works on the date index.
cases= cases.set_index("date")

In [70]:
#cases_2d= cases.groupby("country").resample(pd.Timedelta(days= 2)).sum().loc[:,["newcases"]]

In [71]:
#cases_3d= cases.groupby("country").resample(pd.Timedelta(days= 3)).sum().loc[:,["newcases"]]

In [72]:
#cases_4d= cases.groupby("country").resample(pd.Timedelta(days= 4)).sum().loc[:,["newcases"]]

In [73]:
# Weekly sums of new cases per country. The column is selected before summing, so
# the text and datetime columns of `cases` (e.g. `datemin`) are never summed.
# NOTE(review): unlike the earlier per-100k weekly cell, no offset/closed settings
# are used here, so the last 7-day bin of a country may cover fewer than 7 days.
cases_7d = (
    cases.groupby("country")["newcases"]
    .resample(pd.Timedelta(days=7))
    .sum()
    .to_frame()
)

In [74]:
# Grow speed

In [75]:
#cases_2d["growspeed"]= cases_2d.newcases - cases_2d.groupby("country").newcases.shift(1).fillna(0)

In [76]:
#fig= go.Figure()
#for i, country in enumerate(country_list):
#    fig.add_trace(go.Scatter(x= cases_2d.loc[country].index, 
#                             y= cases_2d.loc[country].growspeed, 
#                             mode= "markers",
#                             name= country))
#fig.update_layout(title= "Change in the number of new cases (2 days period)")
#pio.show(fig)

In [77]:
#cases_3d["growspeed"]= cases_3d.newcases - cases_3d.groupby("country").newcases.shift(1).fillna(0)

In [78]:
#fig= go.Figure()
#for i, country in enumerate(country_list):
#    fig.add_trace(go.Scatter(x= cases_3d.loc[country].index, 
#                             y= cases_3d.loc[country].growspeed, 
#                             mode= "markers",
#                             name= country))
#fig.update_layout(title= "Change in the number of new cases (3 days period)")

#pio.show(fig)

In [79]:
#cases_4d["growspeed"]= cases_4d.newcases - cases_4d.groupby("country").newcases.shift(1).fillna(0)

In [80]:
#fig= go.Figure()
#for i, country in enumerate(country_list):
#    fig.add_trace(go.Scatter(x= cases_4d.loc[country].index, 
#                             y= cases_4d.loc[country].growspeed, 
#                             mode= "markers",
#                             name= country))
#fig.update_layout(title= "Change in the number of new cases (4 days period)")
#pio.show(fig)

In [81]:
# Week-over-week change in new cases per country. The first week of a country has no
# predecessor; the missing value is replaced by 0, so its change equals its own total.
cases_7d["growspeed"] = cases_7d["newcases"].sub(
    cases_7d.groupby("country")["newcases"].shift(1).fillna(0)
)

In [82]:
# One marker trace per selected country: week-over-week change in new cases.
fig = go.Figure()
for country in country_list:
    weekly_rows = cases_7d.loc[country]
    weekly_trace = go.Scatter(
        x=weekly_rows.index,
        y=weekly_rows.growspeed,
        mode="markers",
        name=country,
    )
    fig.add_trace(weekly_trace)
fig.update_layout(title="Change in the number of new cases (1 week period)")
pio.show(fig)

In [83]:
#!jupyter nbconvert --to html --template toc2 EAP.ipynb