# Coronavirus Disease (COVID-19)

Data Source: https://github.com/owid/covid-19-data/tree/master/public/data/

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
import warnings

# Options and Settings

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.autolayout'] = True
plt.rcParams['figure.figsize'] = [12, 9]
plt.rcParams['font.size'] = 12
path = os.getcwd()                                         # get current working directory
warnings.simplefilter('ignore')

# Import Data

In [3]:
filepath = os.path.join('datasets', 'owid-covid-data.csv')
df = pd.read_csv(filepath)

# Head and Tail

In [4]:
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165175,ZWE,Africa,Zimbabwe,2022-02-24,234967.0,378.0,338.429,5390.0,2.0,1.286,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
165176,ZWE,Africa,Zimbabwe,2022-02-25,235467.0,500.0,348.143,5392.0,2.0,1.000,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
165177,ZWE,Africa,Zimbabwe,2022-02-26,235803.0,336.0,368.429,5393.0,1.0,1.000,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
165178,ZWE,Africa,Zimbabwe,2022-02-27,235803.0,0.0,350.143,5393.0,0.0,1.000,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,


In [5]:
df.shape

(165180, 67)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165180 entries, 0 to 165179
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    165180 non-null  object 
 1   continent                                   155289 non-null  object 
 2   location                                    165180 non-null  object 
 3   date                                        165180 non-null  object 
 4   total_cases                                 162150 non-null  float64
 5   new_cases                                   162099 non-null  float64
 6   new_cases_smoothed                          160948 non-null  float64
 7   total_deaths                                144355 non-null  float64
 8   new_deaths                                  144509 non-null  float64
 9   new_deaths_smoothed                         144379 non-null  float64
 

In [7]:
df['date'] = pd.to_datetime(df['date'])                   # convert date to pandas datetime

In [8]:
df['day_name'] = df['date'].dt.day_name()                 # add name of the day
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million,day_name
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,37.746,0.5,64.83,0.511,,,,,Monday
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,,,,,Tuesday
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,,,,,Wednesday
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,,,,,Thursday
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,,,,,Friday


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165180 entries, 0 to 165179
Data columns (total 68 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   iso_code                                    165180 non-null  object        
 1   continent                                   155289 non-null  object        
 2   location                                    165180 non-null  object        
 3   date                                        165180 non-null  datetime64[ns]
 4   total_cases                                 162150 non-null  float64       
 5   new_cases                                   162099 non-null  float64       
 6   new_cases_smoothed                          160948 non-null  float64       
 7   total_deaths                                144355 non-null  float64       
 8   new_deaths                                  144509 non-null  float64      

# Missing Data Exploration

In [10]:
df.isna().sum()                                           # over 50 % missing data in some columns

iso_code                                        0
continent                                    9891
location                                        0
date                                            0
total_cases                                  3030
                                            ...  
excess_mortality_cumulative_absolute       159484
excess_mortality_cumulative                159484
excess_mortality                           159484
excess_mortality_cumulative_per_million    159484
day_name                                        0
Length: 68, dtype: int64

Strategies for handling missing values: Using ffill()-forwardfill: according to the next observed value  OR bfill()-backfill: according to the last observed value 

In [11]:
df.bfill().isna().sum()                                   # 6126 rows still missing with the backward fill strategy

iso_code                                      0
continent                                     0
location                                      0
date                                          0
total_cases                                   0
                                           ... 
excess_mortality_cumulative_absolute       6126
excess_mortality_cumulative                6126
excess_mortality                           6126
excess_mortality_cumulative_per_million    6126
day_name                                      0
Length: 68, dtype: int64

In [12]:
df.ffill().isna().sum()                                   # 1487 rows missing with the forward fill strategy

iso_code                                      0
continent                                     0
location                                      0
date                                          0
total_cases                                   0
                                           ... 
excess_mortality_cumulative_absolute       1487
excess_mortality_cumulative                1487
excess_mortality                           1487
excess_mortality_cumulative_per_million    1487
day_name                                      0
Length: 68, dtype: int64

In [13]:
df_ffill = df.ffill()
df_ffill.isna().sum()

iso_code                                      0
continent                                     0
location                                      0
date                                          0
total_cases                                   0
                                           ... 
excess_mortality_cumulative_absolute       1487
excess_mortality_cumulative                1487
excess_mortality                           1487
excess_mortality_cumulative_per_million    1487
day_name                                      0
Length: 68, dtype: int64

In [14]:
df_ffill.describe()

Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
count,165180.0,165180.0,165175.0,165152.0,165152.0,165152.0,165180.0,165180.0,165175.0,165152.0,...,163697.0,163697.0,165180.0,165180.0,165180.0,165180.0,163693.0,163693.0,163693.0,163693.0
mean,2488950.0,11425.54,11580.35,56439.36,158.192368,158.827925,29656.548117,167.852918,172.368592,566.947183,...,11.470611,31.689481,48.405496,3.005906,73.482784,0.722305,68630.73,13.355385,24.859429,1661.102086
std,15025400.0,82945.93,81687.98,295786.6,789.548818,772.358465,51016.130209,680.470069,545.757959,828.871268,...,11.306356,13.894799,32.476283,2.483058,7.415484,0.150566,206411.4,14.499356,35.772052,2118.958575
min,1.0,-74347.0,-6223.0,1.0,-1918.0,-232.143,0.001,-13876.282,-1936.106,0.0,...,0.1,7.7,1.188,0.1,53.28,0.394,-37726.1,-28.45,-95.92,-1826.595723
25%,2073.0,1.0,7.0,85.0,0.0,0.143,651.363,0.059,1.637,23.1915,...,1.9,21.0,19.275,1.2,68.37,0.594,142.0,2.53,1.12,170.863057
50%,27632.0,80.0,107.571,852.0,2.0,2.0,4949.36,11.942,19.525,163.6195,...,6.4,30.4,44.6,2.32,75.0,0.74,5249.6,11.36,15.03,1017.459621
75%,304654.5,1068.0,1186.143,7800.0,19.0,20.0,39538.21225,101.313,128.3395,835.02,...,20.1,40.8,80.635,4.0,78.88,0.845,38756.3,23.47,45.8,2574.351715
max,436981900.0,4205408.0,3444963.0,5956509.0,18058.0,14705.857,697959.35,51427.491,16052.608,6315.219,...,44.0,78.1,100.0,13.8,86.75,0.957,1080748.0,111.01,374.93,9153.060433


# Time Series Resampling

In [15]:
df_halfhr = df_ffill.set_index('date').resample('30T').agg(np.sum)                              # half an hour
ts_hr = df_ffill.set_index('date').resample('H').agg(np.sum)                                    # hourly
ts_daily = df_ffill.set_index('date')                                                           # daily
ts_weekly = df_ffill.set_index('date').resample('W').agg([np.min, np.mean, np.max, np.sum])     # weekly
ts_mon15th = df_ffill.set_index('date').resample('SMS').agg([np.min, np.mean, np.max, np.sum])  # 15th (or other day_of_month) and calendar month begin      
ts_monthly = df_ffill.set_index('date').resample('M').agg([np.min, np.mean, np.max, np.sum])    # calendar month end
ts_quarterly = df_ffill.set_index('date').resample('Q').agg([np.min, np.mean, np.max, np.sum])  # calendar quarter end

# Data Visualization

In [16]:
# create a helper function for y-axis formatter
from matplotlib.ticker import FuncFormatter

def million(x, pos):
    'the two arguments are the value and tick position' 
    return '%1.1fM' % (x * 1e-6)

fomat_mill = FuncFormatter(million)

# create a line chart function
def plot_line(x, y, df, ax, hue=None, xlabel=None, ylabel=None, main_title=None, leg_title=None, loc=None, leg_labels=None):
    # initiate matplot figure
    fig, ax = plt.subplots()
    fig.autofmt_xdate()

    # create line chart
    sns.lineplot(x=x, y=y, hue=hue, ax=ax, data=df, ci=None)

    # label plot axis
    ax.set(xlabel=xlabel, ylabel=ylabel, title=main_title)

    # y-axis formatter
    ax.yaxis.set_major_formatter(fomat_mill)

    # label plot legend
    plt.legend(title=leg_title, loc=loc, labels=leg_labels)

    return plot_line

In [17]:
plot_line(x='date', y='total_cases', 
          df=df_ffill, ax=ax, hue='continent',   
          xlabel='year', ylabel='total cases', main_title='Covid-19 total cases by Continent')

NameError: name 'ax' is not defined

In [None]:
plot_line(x='date', y='total_deaths', 
          df=df_ffill, ax=ax, hue='continent',   
          xlabel='year', ylabel='total_deaths', main_title='Covid-19 total deaths by Continent')