First, let's import all the packages we will need for this project

In [2]:
import requests
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

Configure pandas

In [99]:
# Display options for DataFrame output. Fully qualified option names are used
# because pandas resolves abbreviated keys by pattern matching, which can stop
# working if a later release adds another option with a similar name.
pd.set_option('display.max_colwidth', 100)  # max characters shown per cell
pd.set_option('display.max_rows', 30)       # longer frames are shown truncated
pd.set_option('display.min_rows', None)     # None: truncated views follow max_rows

Filters

In [None]:
# NOTE(review): this cell reads `data`, but no earlier cell in the visible
# notebook defines it, so it fails on a fresh kernel (Restart & Run All).
# The masks below presumably expect a DataFrame with 'Date', 'Country' and
# 'Province' columns — TODO confirm which frame is meant and move this cell
# after the cell that creates it.

# Calendar date of the previous day, taken from the local clock.
yesterday = datetime.date(datetime.today() - timedelta(1))
# Element-wise comparisons; 'Date' is compared against the ISO string of `yesterday`.
filter_date = data['Date'] == str(yesterday)
filter_country = data['Country'] == 'United States of America'
# Matches entries whose 'Province' is the empty string.
filter_province = data['Province'] == ''

To see how the disease has developed over time, let's access the free [COVID-19 API](https://covid19api.com/), which kindly serves data collected by Johns Hopkins CSSE.

[Postman COVID-19 API Resource Center](https://covid-19-apis.postman.com/)

In [68]:
def getInstructions(timeout = 30):
    """Fetch the overview published at the root of the COVID-19 API.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The JSON payload of ``https://api.covid19api.com/``, transposed so
        that every top-level key of the payload becomes one row.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # Without an explicit timeout, requests may wait forever on a dead host.
    response = requests.get('https://api.covid19api.com/', timeout = timeout)
    # Fail loudly on 4xx/5xx instead of trying to tabulate an error payload.
    response.raise_for_status()

    return pd.DataFrame.from_dict(response.json()).transpose()

In [55]:
def timeSeriesData(list_countries = ['estonia', 'latvia', 'lithuania'], timeout = 30):
    """Download per-country COVID-19 time series and group them by country.

    For every entry of ``list_countries`` the endpoint
    ``https://api.covid19api.com/country/<entry>`` is queried. Only rows with
    an empty ``Province`` are kept, dates are parsed, the counters are
    converted to numbers, and rows before 2020-03-01 are discarded.

    Parameters
    ----------
    list_countries : iterable of str, optional
        Country identifiers inserted into the request URL. The default list
        is only read, never modified.
    timeout : float, optional
        Seconds to wait for each request before giving up (default 30).

    Returns
    -------
    pandas.core.groupby.DataFrameGroupBy
        Rows with the columns Country, Date, Confirmed, Deaths and Recovered,
        grouped by ``Country``. Row labels are the ones of the raw per-country
        payloads, so they are not unique across groups.

    Raises
    ------
    ValueError
        If ``list_countries`` is empty.
    KeyError
        If a payload lacks one of the required columns.
    requests.exceptions.RequestException
        If a request fails, times out, or returns an HTTP error status.
    """
    column_names = ['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered']
    numeric_columns = ['Confirmed', 'Deaths', 'Recovered']

    country_frames = []

    for country in list_countries:
        response = requests.get(f'https://api.covid19api.com/country/{country}', timeout = timeout)
        # An error payload is not a list of records; stop here with a clear
        # HTTP error rather than a confusing DataFrame construction failure.
        response.raise_for_status()
        raw_data = pd.DataFrame.from_dict(response.json())

        # Keep only rows with an empty Province (presumably the country-level
        # totals) and only the columns used downstream. Selecting the needed
        # columns directly means extra or removed auxiliary columns in the
        # payload cannot break the function.
        country_data = raw_data.loc[raw_data['Province'] == '', column_names].copy()

        country_data['Date'] = pd.to_datetime(country_data['Date'], format = '%Y-%m-%dT%H:%M:%SZ')
        country_data[numeric_columns] = country_data[numeric_columns].apply(pd.to_numeric)

        country_frames.append(country_data)

    if not country_frames:
        raise ValueError('list_countries must contain at least one country')

    combined = pd.concat(country_frames)

    # Discard everything before March 2020.
    combined = combined[combined['Date'] >= pd.Timestamp('2020-03-01')]

    return combined.groupby('Country')

In [71]:
# Fetch the time series for the default countries. timeSeriesData returns a
# DataFrameGroupBy, so displaying it shows only the object's repr, not a table.
data = timeSeriesData()
data


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015B7B26B948>

To gain more insight into the current situation, let's extract cross-sectional data from the [NovelCOVID API](https://github.com/javieraviles/covidAPI).

In [187]:
def crossSectionalData(timeout = 30):
    """Download the current per-country snapshot and estimate populations.

    The population (in millions) is estimated as
    ``cases / casesPerOneMillion``, rounded to two decimals. Rows for which
    this estimate is infinite, zero or missing are removed.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The payload columns that remain after dropping the daily, active,
        critical and per-million columns, plus ``Population (M)``; the index
        is renumbered from 0.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    def _to_numeric_if_possible(column):
        # Same effect as the deprecated pd.to_numeric(..., errors = 'ignore'):
        # a column that cannot be parsed is returned unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    response = requests.get('https://coronavirus-19-api.herokuapp.com/countries', timeout = timeout)
    response.raise_for_status()
    cs_data = pd.DataFrame.from_dict(response.json())

    # Label-based slice: keeps rows 8 through 218 inclusive.
    # NOTE(review): these bounds are hard-coded; presumably they skip aggregate
    # rows at the top of the payload — confirm against the current API output.
    cs_data = cs_data.loc[8:218]

    cs_data = cs_data.apply(_to_numeric_if_possible)

    # Build the column in one assignment. Calling replace(..., inplace = True)
    # on a selected column is a chained mutation that does not reach the frame
    # when pandas Copy-on-Write is active.
    population = (cs_data['cases'] / cs_data['casesPerOneMillion']).round(2)
    cs_data['Population (M)'] = population.replace([np.inf, 0], np.nan)
    cs_data = cs_data.dropna(axis = 0, subset = ['Population (M)'])

    filter_columns = ['todayCases', 'todayDeaths', 'active', 'critical', 'casesPerOneMillion', 'deathsPerOneMillion', 'testsPerOneMillion']
    cs_data = cs_data.drop(columns = filter_columns)

    return cs_data.reset_index(drop = True)

In [188]:
# Display the cleaned cross-sectional table returned by crossSectionalData.
crossSectionalData()

Unnamed: 0,country,cases,deaths,recovered,totalTests,Population (M)
0,USA,925758,52217,110432.0,5037473,330.98
1,Spain,219764,22524,92355.0,930230,46.76
2,Italy,192994,25969,60498.0,1642356,60.46
3,France,159828,22245,43493.0,463662,65.26
4,Germany,155054,5767,109800.0,2072669,83.77
5,UK,143464,19506,,612031,67.90
6,Turkey,104912,2600,21737.0,830257,84.33
7,Iran,89328,5650,68193.0,410075,83.95
8,Russia,74588,681,6250.0,2721500,145.96
9,Brazil,54043,3704,27655.0,291922,212.77


For some reason China is missing. Some countries report 0 cases per million of population, hence their population is estimated to be infinite. As these countries do not impact the analysis much, I simply remove these observations from the dataset.

In [199]:
def aggregateOtherCountries(number_of_countries, include_other = True):
    """Keep the first rows of the cross-sectional table and sum up the rest.

    Parameters
    ----------
    number_of_countries : int
        Number of leading rows of ``crossSectionalData()`` to keep as they are.
    include_other : bool, optional
        If True (default), append one extra row labelled 'Other countries*'
        holding the column-wise sums of all remaining rows.

    Returns
    -------
    pandas.DataFrame
        The first ``number_of_countries`` rows, optionally followed by the
        aggregated row.
    """
    cs_data = crossSectionalData()

    non_agg_part = cs_data.iloc[:number_of_countries]
    if not include_other:
        return non_agg_part

    # The aggregate starts exactly where the kept part ends. Starting one row
    # later would leave row `number_of_countries` out of both parts.
    agg_part = cs_data.iloc[number_of_countries:]

    other_row = agg_part.sum(numeric_only = True).to_dict()
    other_row['country'] = 'Other countries*'

    # pd.concat is used because DataFrame.append no longer exists in pandas 2.
    return pd.concat([non_agg_part, pd.DataFrame([other_row])], ignore_index = True)


In [204]:
# Keep only the first two rows of the cross-sectional table, without the
# aggregated 'Other countries*' row (include_other = False).
# NOTE(review): this rebinds `data`, which an earlier cell bound to the
# GroupBy object returned by timeSeriesData().
data = aggregateOtherCountries(2, False)
data

Unnamed: 0,country,cases,deaths,recovered,totalTests,Population (M)
0,USA,925758,52217,110432.0,5037473,330.98
1,Spain,219764,22524,92355.0,930230,46.76
