First, let's import all the packages we will need for this project

In [3]:
import requests
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

Configure pandas

In [4]:
# Allow up to 100 characters per cell before pandas truncates the text.
pd.set_option('max_colwidth', 100)
# Print up to 100 rows before pandas switches to a truncated view.
pd.set_option('display.max_rows', 100)
# None makes the truncated view follow display.max_rows (see the pandas
# options documentation for display.min_rows).
pd.set_option('display.min_rows', None)

To see the disease development over time, let's access the Covid-19 data API kindly provided by the [Johns Hopkins CSSE](https://covid19api.com/).

[Postman COVID-19 API Resource Center](https://covid-19-apis.postman.com/)

In [5]:
def timeSeriesInstructions():
    """Fetch the document served at the root of api.covid19api.com as a table.

    Returns:
        pandas.DataFrame: the JSON payload turned into a frame and
        transposed, so every top-level key of the payload becomes one row.

    Raises:
        requests.exceptions.RequestException: if the request times out,
            the connection fails, or the server answers with an HTTP
            error status.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get('https://api.covid19api.com/', timeout = 30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()

    instructions_dic = r.json()
    instructions = pd.DataFrame.from_dict(instructions_dic).transpose()

    return instructions

In [23]:
def testTimeSeriesInstructions():
    """Smoke test: the table built by timeSeriesInstructions() must not be empty."""
    fetched = timeSeriesInstructions()
    assert not fetched.empty

testTimeSeriesInstructions()

In [7]:
def filterDate(data, date = '2020-03-01'):
    """Return the rows of `data` whose 'Date' value is on or after `date`.

    Args:
        data: frame with a 'Date' column comparable to `date`.
        date: inclusive lower bound; defaults to '2020-03-01'.

    Returns:
        The matching rows, with their original index labels kept.
    """
    return data.loc[data['Date'] >= date]

In [8]:
def _fetchCountrySeries(country):
    """Download one country's series and reduce it to tidy country-level rows."""
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(f'https://api.covid19api.com/country/{country}', timeout = 30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    raw_data = pd.DataFrame.from_dict(r.json())

    # Keep only the rows with an empty 'Province'; every other row is discarded.
    filter_columns = ['CountryCode', 'Province', 'City', 'CityCode', 'Lat', 'Lon', 'Active']
    tidy_data = raw_data.loc[raw_data['Province'] == ''].drop(columns = filter_columns)

    tidy_data['Date'] = pd.to_datetime(tidy_data['Date'], format = '%Y-%m-%dT%H:%M:%SZ')

    numeric_columns = ['Confirmed', 'Deaths', 'Recovered']
    tidy_data[numeric_columns] = tidy_data[numeric_columns].apply(pd.to_numeric)

    return tidy_data


def timeSeriesData(list_countries = ['estonia', 'latvia', 'lithuania']):
    """Download COVID-19 time series for several countries, grouped by country.

    Args:
        list_countries: iterable of country slugs inserted into the
            api.covid19api.com/country/<slug> URL. The default list is
            only iterated, never mutated.

    Returns:
        A pandas GroupBy over 'Country' whose rows carry the columns
        'Country', 'Date', 'Confirmed', 'Deaths' and 'Recovered',
        restricted by filterDate() to its default start date.

    Raises:
        requests.exceptions.RequestException: if any request times out,
            fails to connect, or returns an HTTP error status.
    """
    frames = [_fetchCountrySeries(country) for country in list_countries]
    data = pd.concat(frames)

    column_names = ['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered']
    data = data.reindex(columns = column_names)

    data = filterDate(data)
    country_group = data.groupby('Country')

    return country_group

In [10]:
def pickCountry(country_group, country_name):
    """Extract one country's rows from a GroupBy keyed by country name.

    Args:
        country_group: GroupBy object whose group keys are country names.
        country_name: name of the wanted country; it is title-cased
            before the lookup, so 'latvia' finds the 'Latvia' group.

    Returns:
        The rows of that group, renumbered with a fresh 0..n-1 index.
    """
    selected = country_group.get_group(country_name.title())
    return selected.reset_index(drop = True)

In [57]:
def testTimeSeriesData():
    """Smoke test for timeSeriesData() and pickCountry() on the Latvian series.

    Checks that the first date is 2020-03-01, that the columns come in the
    expected order, and that 'Confirmed' holds 64-bit integers. Problems
    are printed rather than raised, so running the cell never aborts.
    """
    try:
        test = timeSeriesData()
        test = pickCountry(test, 'Latvia')

        assert test['Date'].iloc[0] == datetime.strptime('2020-03-01', '%Y-%m-%d')
        assert list(test.columns) == ['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered']
        assert isinstance(test['Confirmed'].iloc[0], np.int64)
    except AssertionError:
        print('Assertion error detected. Please, inspect testTimeSeriesData() function.')
    except Exception as error:
        # Download or parsing problems are not assertion failures, so report
        # them separately. Unlike a bare `except:`, this leaves
        # KeyboardInterrupt and SystemExit alone.
        print(f'testTimeSeriesData() could not be completed: {error!r}')

testTimeSeriesData()

In [11]:
def dailyChanges(ts_data, prc = False, periods = 1):
    """Compute period-over-period changes of the case counters.

    Args:
        ts_data: frame with the columns 'Country', 'Date', 'Confirmed',
            'Deaths' and 'Recovered', ordered in time.
        prc: when True, return percentage changes rounded to two decimals;
            otherwise return absolute differences as integers.
        periods: number of rows to look back for each change (expected to
            be a positive integer).

    Returns:
        A new frame with the same five columns, without the first
        `periods` rows (they have no earlier row to compare against),
        indexed 0..n-1. Undefined or infinite changes are reported as 0.
    """
    numeric_columns = ['Confirmed', 'Deaths', 'Recovered']

    if prc:
        change_data = (100 * ts_data[numeric_columns].pct_change(periods = periods)).round(2)
        # Division by a zero base yields inf (x/0) or NaN (0/0); show both as 0.
        change_data = change_data.replace([np.inf, np.nan], 0)
    else:
        change_data = ts_data[numeric_columns].diff(periods = periods)
        change_data = change_data.replace([np.inf, np.nan], 0).astype(int)

    # Trim the leading rows by position, not by label: dropping by label
    # removes every row sharing those labels when the index has repeats
    # (for example a frame that comes straight out of pd.concat).
    legend_data = ts_data[['Country', 'Date']].iloc[periods:].reset_index(drop = True)
    change_data = change_data.iloc[periods:].reset_index(drop = True)

    return pd.concat([legend_data, change_data], axis = 1)

To gain more insight into the current situation, let's extract cross-sectional data from the [NovelCOVID API](https://github.com/javieraviles/covidAPI)

In [13]:
def _toNumericIfPossible(column):
    """Return `column` parsed by pd.to_numeric, or unchanged if parsing fails."""
    # Replaces pd.to_numeric(errors = 'ignore'), which is deprecated in pandas.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def crossSectionalData():
    """Download the latest per-country snapshot and estimate populations.

    Returns:
        pandas.DataFrame: one row per country with the columns left after
        dropping the daily, active, critical and per-million fields, plus
        'Population (M)' = cases / casesPerOneMillion rounded to two
        decimals. Rows whose estimate is infinite, zero or missing are
        removed, and the index is renumbered 0..n-1.

    Raises:
        requests.exceptions.RequestException: if the request times out,
            fails to connect, or returns an HTTP error status.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get('https://coronavirus-19-api.herokuapp.com/countries', timeout = 30)
    r.raise_for_status()
    cs_data = pd.DataFrame.from_dict(r.json())

    # NOTE(review): keeps the rows labelled 8 through 218 (inclusive);
    # presumably this skips aggregate rows at the top of the payload -
    # confirm against the API. The copy makes the column assignment below
    # act on an independent frame rather than on a slice of the original.
    cs_data = cs_data.loc[8:218].copy()

    cs_data = cs_data.apply(_toNumericIfPossible)
    population = (cs_data['cases'] / cs_data['casesPerOneMillion']).round(2)

    # Assign the cleaned column back explicitly: an in-place replace on
    # cs_data[...] only changes a temporary under pandas Copy-on-Write.
    cs_data['Population (M)'] = population.replace([np.inf, 0], np.nan)
    cs_data = cs_data.dropna(axis = 0, subset = ['Population (M)'])

    filter_columns = ['todayCases', 'todayDeaths', 'active', 'critical', 'casesPerOneMillion', 'deathsPerOneMillion', 'testsPerOneMillion']
    cs_data = cs_data.drop(columns = filter_columns)

    return cs_data.reset_index(drop = True)

For some reason China is missing. Some countries report 0 cases per million of population, hence their population is estimated to be infinite. As these countries do not affect the analysis much, I simply remove these observations from the dataset.

In [14]:
def aggregateOtherCountries(number_of_countries, include_other = True):
    """Return the first rows of the snapshot, optionally with an 'other' total.

    Args:
        number_of_countries: how many leading rows of crossSectionalData()
            to keep as individual countries.
        include_other: when True, append one extra row labelled
            'Other countries*' holding the column sums of all remaining rows.

    Returns:
        pandas.DataFrame: the kept rows, followed by the aggregated row
        when `include_other` is True.
    """
    cs_data = crossSectionalData()
    non_agg_part = cs_data.iloc[:number_of_countries]

    if not include_other:
        return non_agg_part

    # The remainder starts right where the kept part ends. Starting one row
    # later would leave that country out of both parts.
    agg_part = cs_data.iloc[number_of_countries:]

    other_row = agg_part.sum(numeric_only=True).to_dict()
    other_row['country'] = 'Other countries*'

    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    return pd.concat([non_agg_part, pd.DataFrame([other_row])], ignore_index=True)


In [15]:
# Display the first ten rows of the cross-sectional data, with the remaining rows requested as one aggregated row.
aggregateOtherCountries(number_of_countries = 10, include_other = True)

Unnamed: 0,country,cases,deaths,recovered,totalTests,Population (M)
0,USA,960896.0,54265.0,118162.0,5279237.0,331.0
1,Spain,223759.0,22902.0,95708.0,930230.0,46.75
2,Italy,195351.0,26384.0,63120.0,1707743.0,60.46
3,France,161488.0,22614.0,44594.0,463662.0,65.27
4,Germany,156513.0,5877.0,109800.0,2072669.0,83.79
5,UK,148377.0,20319.0,,640792.0,67.88
6,Turkey,107773.0,2706.0,25582.0,868565.0,84.33
7,Iran,90481.0,5710.0,69657.0,421313.0,84.01
8,Russia,80949.0,747.0,6767.0,2877699.0,145.85
9,Brazil,59324.0,4057.0,29160.0,291922.0,212.63


timeSeriesInstructions()

filterDate(data, date = '2020-03-01')

timeSeriesData(list_countries = \['estonia', 'latvia', 'lithuania'\])

pickCountry(country_group, country_name)

dailyChanges(ts_data, prc = False, periods = 1)


crossSectionalData()

aggregateOtherCountries(number_of_countries, include_other = True)
