<b>Data by Countries and Territories</b>

In [1]:
# Scaling factor used to express rates "per million".
# Despite its name this is a multiplier, not a population count.
population = 1000000

# Total cases and deaths per country/territory, plus the largest reported
# popData2019 value for it. After the aggregation every group is a single
# row, so a further per-group cumulative sum would not change anything.
ct_cumulative_cases_deaths = (
    data.groupby("countriesAndTerritories")
    .agg({'cases': 'sum', 'deaths': 'sum', 'popData2019': 'max'})
    .reset_index()
)

# function to compute the features of:
#   fatality_rate              deaths / cases * 100
#   fatality_rate_per_million  deaths / popData2019 * per-million factor
#   case_rate_per_million      cases / popData2019 * per-million factor
def get_features(x, per=None):
    """Compute fatality and case rates for a single row.

    Parameters
    ----------
    x : row-like object (for example a pandas Series or a dict) that can be
        indexed with "deaths", "cases" and "popData2019".
    per : number, optional
        Scaling factor applied to the two per-capita rates. When omitted the
        module-level ``population`` value is used, which is what happens when
        this function is passed to ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        Indexed by 'fatality_rate', 'fatality_rate_per_million' and
        'case_rate_per_million'. A rate whose denominator is exactly zero is
        reported as NaN instead of inf / ZeroDivisionError.
    """
    if per is None:
        per = population

    def ratio(numerator, denominator):
        # Missing denominators fall through to the division (and propagate);
        # only a genuine zero is turned into NaN.
        if not pd.isna(denominator) and denominator == 0:
            return float("nan")
        return numerator / denominator

    d = {
        'fatality_rate': ratio(x["deaths"], x["cases"]) * 100,
        'fatality_rate_per_million': ratio(x["deaths"], x["popData2019"]) * per,
        'case_rate_per_million': ratio(x["cases"], x["popData2019"]) * per,
    }

    return pd.Series(d, index=['fatality_rate', 'fatality_rate_per_million', 'case_rate_per_million'])

# join the cumulative cases and deaths with the computed features
ct_cumulative_cases_deaths_add_features = ct_cumulative_cases_deaths.join(
    ct_cumulative_cases_deaths.apply(get_features, axis=1)
)

# get the cases and deaths for each country/territory in each year+month
ct_cases_deaths_byYearMonth_pvt = pd.pivot_table(
    data,
    values=['cases', 'deaths'],
    index=['countriesAndTerritories'],
    columns=['year', 'month'],
    aggfunc='sum',  # string alias; passing np.sum is deprecated in recent pandas
    fill_value=0,
)

# flatten the (value, year, month) column keys into "<value>_<year>_<month>" names
ct_cases_deaths_byYearMonth_pvt.columns = [
    '_'.join(str(part) for part in column_key)
    for column_key in ct_cases_deaths_byYearMonth_pvt.columns.values
]
# Reset the index of the pivot built just above so the country becomes a
# regular column (the previous code referenced a name that is never defined).
ct_cases_deaths_byYearMonth = ct_cases_deaths_byYearMonth_pvt.reset_index()

# join the cumulative cases and deaths, computed features, and the cases and deaths in each year+month
ct_cumulative_cases_deaths_add_features = pd.merge(
    ct_cumulative_cases_deaths_add_features,
    ct_cases_deaths_byYearMonth,
    on='countriesAndTerritories',
)
# Remove a duplicated "index" column if one was carried along. The frames
# built above do not produce one, so a missing column must not be an error.
ct_cumulative_cases_deaths_add_features = ct_cumulative_cases_deaths_add_features.drop(
    columns='index', errors='ignore'
)

# get the columns where cases_ and deaths_ exist (the per-month columns)
cols = [col for col in ct_cumulative_cases_deaths_add_features.columns if 'cases_' in col or 'deaths_' in col]

# save the columns for the apply function
cols_for_feature = np.array(cols)
cols_for_feature_names = [col + "_incidence_rate" for col in cols_for_feature]

# add the countriesAndTerritories and popData2019 columns to the list
cols.append('countriesAndTerritories')
cols.append('popData2019')

# input frame for the incidence-rate computation
ct_data_for_incidence_rate = ct_cumulative_cases_deaths_add_features[cols]

# calculate the incidence features in each month
def get_features_incidence(x):
    """Return the incidence rate of every monthly column for one row.

    For each name in the module-level ``cols_for_feature`` the row value is
    divided by the row's ``popData2019`` and multiplied by the module-level
    ``population`` factor. The result is a pandas Series indexed by
    ``cols_for_feature_names`` (the same names with "_incidence_rate" appended).
    """
    inhabitants = x['popData2019']
    rates = {
        column + "_incidence_rate": x[column] / inhabitants * population
        for column in cols_for_feature
    }
    return pd.Series(rates, index=cols_for_feature_names)

# Extend the combined table (cumulative cases and deaths, computed features,
# monthly cases and deaths) with the incidence rate of every monthly column.
monthly_incidence_rates = ct_data_for_incidence_rate.apply(get_features_incidence, axis=1)
ct_cumulative_cases_deaths_add_features = ct_cumulative_cases_deaths_add_features.join(
    monthly_incidence_rates
)

NameError: name 'data' is not defined

In [None]:
# Inspect a window of rows (positions 190 up to, but excluding, 220) of the combined table.
# To look at a single country instead, filter on the 'countriesAndTerritories' column, e.g. 'Germany'.
inspect_rows = slice(190, 220)
ct_cumulative_cases_deaths_add_features.iloc[inspect_rows]

In [None]:
import numpy as np

import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import cm



# create the dataset for scikit and analysis
# The description is assembled line by line and joined once, so every entry
# sits on its own line without stray blank lines in between.
descr_lines = [
    'COVID-19 dataset',
    '--------------------------------------------------------',
    # number of rows (countries/territories), not rows * columns
    'number of instances: ' + str(len(ct_cumulative_cases_deaths_add_features)),
    '--------------------------------------------------------',
    'countriesAndTerritories – the name of the country or territory',
    'cases – the number of COVID-19 cases',
    'deaths – the number of COVID-19 deaths',
    'popData2019 – the population of the country or territory',
    'fatality_rate – the fatality rate (deaths/cases)*100',
    # the two per-million rates are relative to popData2019, matching get_features
    'fatality_rate_per_million – the fatality rate per million (deaths/popData2019)*1000000',
    'case_rate_per_million – the case rate per million (cases/popData2019)*1000000',
]
descr_lines += [c + ' – the number of cases in given month' for c in cols_for_feature if 'cases_' in c]
descr_lines += [c + ' – the number of deaths in given month' for c in cols_for_feature if 'deaths_' in c]
descr_lines += [c + ' – the incidence rate of cases in given month' for c in cols_for_feature_names if 'cases_' in c]
descr_lines += [c + ' – the incidence rate of deaths in given month' for c in cols_for_feature_names if 'deaths_' in c]

dataset = {
    'data': ct_cumulative_cases_deaths_add_features.to_numpy(),
    'feature_names': ct_cumulative_cases_deaths_add_features.columns.tolist(),
    'DESCR': '\n'.join(descr_lines),
}

In [None]:
# Scatter plot for the first entry of `distributions`.
# NOTE(review): `distributions` is not defined in the visible cells; presumably
# column 0 holds the population and column 1 the case rate per million — confirm.
points = distributions[0][1:2][0]
pop, case_rate_per_million = points[:, 0].tolist(), points[:, 1].tolist()

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes rectangle spans the whole figure
ax.scatter(pop, case_rate_per_million)
ax.set_xlabel('Population')
ax.set_ylabel('Case Rate Per Million')



In [None]:
# Scatter plot for the second entry of `distributions`.
# NOTE(review): `distributions` is not defined in the visible cells; presumably
# column 0 holds the population and column 1 the case rate per million — confirm.
points = distributions[1][1:2][0]
pop, case_rate_per_million = points[:, 0].tolist(), points[:, 1].tolist()

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes rectangle spans the whole figure
ax.scatter(pop, case_rate_per_million)
ax.set_xlabel('Population')
ax.set_ylabel('Case Rate Per Million')

In [None]:
# Scatter plot for the entry at position 3 of `distributions`.
# NOTE(review): `distributions` is not defined in the visible cells; presumably
# column 0 holds the population and column 1 the case rate per million — confirm.
points = distributions[3][1:2][0]
pop, case_rate_per_million = points[:, 0].tolist(), points[:, 1].tolist()

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes rectangle spans the whole figure
ax.scatter(pop, case_rate_per_million)
ax.set_xlabel('Population')
ax.set_ylabel('Case Rate Per Million')

In [None]:

# Monthly cases and deaths for the United States.
# The boolean mask is taken from the frame being filtered; the previous mask
# used a name that is never defined in the visible cells.
ct_cases_deaths_byYearMonth[
    ct_cases_deaths_byYearMonth['countriesAndTerritories'] == 'United_States_of_America'
]

In [None]:
# Unstack the year+month pivot and view the values as a NumPy array.
# NOTE(review): the original referenced `ct_cumulative_cases_deaths_pvt`, which
# is not defined in the visible cells; the pivot built earlier is assumed to be
# the intended input — confirm.
a = np.array(ct_cases_deaths_byYearMonth_pvt.unstack())
a

In [None]:
def get_rate_features(x):
    """Scale the daily and cumulative counts of one row to a per-million rate.

    ``x`` must expose ``daily_cases``, ``daily_deaths``, ``cumulative_cases``,
    ``cumulative_deaths`` and ``popData2019`` as attributes. Each count is
    divided by ``popData2019`` and multiplied by 1000000. The four results are
    returned as a pandas Series whose labels end in ``_rate``.
    """
    inhabitants = x.popData2019
    counts = {
        'daily_cases_rate': x.daily_cases,
        'daily_deaths_rate': x.daily_deaths,
        'cumulative_cases_rate': x.cumulative_cases,
        'cumulative_deaths_rate': x.cumulative_deaths,
    }
    rates = {label: count / inhabitants * 1000000 for label, count in counts.items()}

    return pd.Series(rates, index=list(counts))


def get_features_by_variable(v):
    """Build daily and cumulative case/death features grouped by column ``v``.

    Reads the module-level ``data`` frame, groups it by ``v`` and "dateRep"
    and returns one row per (``v``, dateRep) with the columns:
    ``v``, dateRep, daily_cases, daily_deaths, popData2019 (summed over the
    group), cumulative_cases, cumulative_deaths, followed by the four
    per-million rate columns produced by ``get_rate_features``.

    Previously the rate columns were computed and then thrown away because
    the frame without them was returned; the extra columns are additive, so
    code that only reads the count columns keeps working.
    """
    # per-day totals for every value of v
    daily = data.groupby([v, "dateRep"]).\
        agg({'cases': 'sum', 'deaths': 'sum', 'popData2019': 'sum'})

    # running totals per value of v, derived from the per-day totals instead
    # of grouping the raw data a second time.
    # NOTE(review): the running sum follows the sort order of dateRep inside
    # each group — confirm dateRep is a datetime (not a day-first string) so
    # that this order is chronological.
    cumulative = daily[['cases', 'deaths']].groupby(level=0).cumsum()

    # combine per-day and cumulative figures under explicit column names
    features = (
        daily.rename(columns={'cases': 'daily_cases', 'deaths': 'daily_deaths'})
        .join(cumulative.rename(columns={'cases': 'cumulative_cases',
                                         'deaths': 'cumulative_deaths'}))
        .reset_index()
    )

    # append the per-million rates and return the complete frame
    features_with_rates = features.join(features.apply(get_rate_features, axis=1))

    return features_with_rates

In [None]:
# by continent
continent_features = get_features_by_variable('continentExp')

# by countries and territories - limited to only the top 10,
# ranked by their popData2019 value in descending order
limit = 10
countriesAndTerritories_features = get_features_by_variable('countriesAndTerritories')
country_populations = (
    countriesAndTerritories_features[['countriesAndTerritories', 'popData2019']]
    .drop_duplicates()
    .sort_values(by='popData2019', ascending=False)
)
top = country_populations.head(limit).countriesAndTerritories.values
is_top_country = countriesAndTerritories_features.countriesAndTerritories.isin(top)
countriesAndTerritories_features = countriesAndTerritories_features[is_top_country]

In [None]:
# Preview the first ten rows of the per-continent feature table.
continent_features.head(10)

In [None]:
# Preview the first ten rows of the per-country feature table (already limited to the top countries).
countriesAndTerritories_features.head(10)

<b>Spread over time - by continent</b>

In [None]:
params = {'font.size': 12,
          "figure.figsize": (20,30)
         }
plt.rcParams.update(params)

# One subplot per continent. The number of rows follows the data instead of a
# hard-coded 5, so an extra or missing category cannot cause an IndexError.
# The frame used is `continent_features`; the previously referenced
# `continent_features_with_rates` is never defined in the visible cells and
# only the count columns are needed here.
continents = continent_features['continentExp'].unique()
fig, axs = plt.subplots(len(continents), 1, squeeze=False)  # always a 2-D array of axes

for ax, c in zip(axs.flat, continents):
    plt_data = continent_features[continent_features['continentExp'] == c]
    ax.plot(plt_data['dateRep'], plt_data['cumulative_cases'], label=c)
    ax.set_title("COVID-19 Cases: Spread over time - " + c)  # Add a title to the axes.
    ax.set(xlabel='Time', ylabel='Cumulative Cases')

<b>New cases - by continent</b>

In [None]:
params = {'font.size': 12,
          "figure.figsize": (20,30)
         }
plt.rcParams.update(params)

# One bar chart per continent. The number of rows follows the data instead of
# a hard-coded 5, so an extra or missing category cannot cause an IndexError.
# The frame used is `continent_features`; the previously referenced
# `continent_features_with_rates` is never defined in the visible cells and
# only the count columns are needed here.
continents = continent_features['continentExp'].unique()
fig, axs = plt.subplots(len(continents), 1, squeeze=False)  # always a 2-D array of axes

for ax, c in zip(axs.flat, continents):
    plt_data = continent_features[continent_features['continentExp'] == c]
    ax.bar(plt_data['dateRep'], plt_data['daily_cases'], label=c)
    ax.set_title("COVID-19 New Cases: " + c)  # Add a title to the axes.
    ax.set(xlabel='Time', ylabel='New Cases')

<b>Distribution of cumulative cases - by continent</b>

In [None]:
# Largest cumulative_cases value per continent, expressed as a fraction of
# the sum of those values across all continents.
peak_cases_by_continent = continent_features.groupby('continentExp').agg({'cumulative_cases': 'max'})
continent_cumulative_frequency = peak_cases_by_continent.apply(lambda x: x / x.sum()).reset_index()

params = {'font.size': 12, "figure.figsize": (15,10)}
plt.rcParams.update(params)

fig, axs = plt.subplots()
axs.pie(
    continent_cumulative_frequency['cumulative_cases'].values,
    labels=continent_cumulative_frequency['continentExp'].values,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
# keep the aspect ratio equal so the pie is drawn as a circle
axs.axis('equal')

plt.show()

<b>Spread over time - by countries and territories</b>

In [2]:
params = {'font.size': 12,
          "figure.figsize": (20,50)
         }
plt.rcParams.update(params)

# Two-column grid with enough rows for every country that is actually present
# (rounded up), so an odd number of countries cannot overflow the grid.
countries = countriesAndTerritories_features['countriesAndTerritories'].unique()
n_rows = (len(countries) + 1) // 2
fig, axs = plt.subplots(n_rows, 2, squeeze=False)  # always a 2-D array of axes

# axs.flat walks the grid row by row: left cell first, then right cell.
for ax, c in zip(axs.flat, countries):
    plt_data = countriesAndTerritories_features[countriesAndTerritories_features['countriesAndTerritories'] == c]
    # draw on the subplot itself; a bare plot(...) call is not defined here
    ax.plot(plt_data['dateRep'], plt_data['cumulative_cases'], label=c)
    ax.set_title("COVID-19 Cases: Spread over time - " + c)  # Add a title to the axes.

for ax in axs.flat:
    ax.set(xlabel='Time', ylabel='Cumulative Cases')

NameError: name 'plt' is not defined

<b>New cases - by countries and territories</b>

In [None]:
params = {'font.size': 12,
          "figure.figsize": (20,30)
         }
plt.rcParams.update(params)

# Two-column grid with enough rows for every country that is actually present
# (rounded up), so an odd number of countries cannot overflow the grid.
countries = countriesAndTerritories_features['countriesAndTerritories'].unique()
n_rows = (len(countries) + 1) // 2
fig, axs = plt.subplots(n_rows, 2, squeeze=False)  # always a 2-D array of axes

# axs.flat walks the grid row by row: left cell first, then right cell.
for ax, c in zip(axs.flat, countries):
    plt_data = countriesAndTerritories_features[countriesAndTerritories_features['countriesAndTerritories'] == c]
    ax.bar(plt_data['dateRep'], plt_data['daily_cases'], label=c)
    ax.set_title("COVID-19 New Cases: " + c)  # Add a title to the axes.

for ax in axs.flat:
    ax.set(xlabel='Time', ylabel='New Cases')

<b>Distribution of cumulative cases - by countries and territories</b>

In [None]:
# Largest cumulative_cases value per country/territory, expressed as a
# fraction of the sum of those values across the selected countries.
peak_cases_by_country = countriesAndTerritories_features.groupby('countriesAndTerritories').agg({'cumulative_cases': 'max'})
countriesAndTerritories_cumulative_frequency = peak_cases_by_country.apply(lambda x: x / x.sum()).reset_index()

params = {'font.size': 12, "figure.figsize": (15,10)}
plt.rcParams.update(params)

fig, axs = plt.subplots()
axs.pie(
    countriesAndTerritories_cumulative_frequency['cumulative_cases'].values,
    labels=countriesAndTerritories_cumulative_frequency['countriesAndTerritories'].values,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
# keep the aspect ratio equal so the pie is drawn as a circle
axs.axis('equal')

plt.show()