# Gather data on Covid-19
***
Requirements:
* Country level
* Daily frequency
* Infected, Death & Recovered (levels)

In [2]:
! pip install numpy pandas matplotlib seaborn

[33mYou are using pip version 19.0.3, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# Allow web access for downloading: https://stackoverflow.com/a/60671292
# NOTE(review): this swaps the default HTTPS context factory for the
# "unverified" one for the whole kernel session, so HTTPS requests that rely on
# the default context (presumably including the pd.read_csv downloads in this
# notebook) skip certificate verification. Acceptable as a local venv
# workaround; confirm it can be dropped once the certificate store is fixed.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

%load_ext autoreload
%autoreload 2


## Source
***
We can get data from the [GitHub page](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) of the Johns Hopkins University Center for Systems Science and Engineering (CSSE).

The data is updated daily and is reported per country (often at sub-country level as well), so it meets our requirements. The data is split across 5 files:
* 3x global: confirmed, deaths, recovered
* 2x US: confirmed, deaths

In [4]:
# Root of the raw CSSE time-series folder on GitHub; file names are appended to it
base_url = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
)

# Short dataset label -> csv file name below base_url
files = dict(
    global_confirmed='time_series_covid19_confirmed_global.csv',
    global_deaths='time_series_covid19_deaths_global.csv',
    global_recovered='time_series_covid19_recovered_global.csv',
    us_confirmed='time_series_covid19_confirmed_US.csv',
    us_deaths='time_series_covid19_deaths_US.csv',
)

## Download the files
***
Workaround for SSL issue in venv: https://stackoverflow.com/a/60671292


In [5]:
# Global confirmed cases: download the csv and preview the first two rows
global_confirmed = pd.read_csv(f"{base_url}{files['global_confirmed']}")
global_confirmed.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,1828,1939,2171,2335,2469,2704,2894,3224,3392,3563
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,750,766,773,782,789,795,803,820,832,842


In [6]:
# Global deaths: download the csv and preview the first two rows
global_deaths = pd.read_csv(f"{base_url}{files['global_deaths']}")
global_deaths.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,58,60,64,68,72,85,90,95,104,106
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,30,30,31,31,31,31,31,31,31,31


In [7]:
# Global recovered cases: download the csv and preview the first two rows
global_recovered = pd.read_csv(f"{base_url}{files['global_recovered']}")
global_recovered.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,228,252,260,310,331,345,397,421,458,468
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,431,455,470,488,519,531,543,570,595,605


In [8]:
# US confirmed cases: download the csv and preview the first two rows
us_confirmed = pd.read_csv(f"{base_url}{files['us_confirmed']}")
us_confirmed.head(2)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,141,141,145,145,145,145,145,145,149,149


In [9]:
# US deaths: download the csv and preview the first two rows
us_deaths = pd.read_csv(f"{base_url}{files['us_deaths']}")
us_deaths.head(2)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,5,5,5,5,5,5,5,5,5,5


## Store downloaded data
***
Write the files to the `data/raw` folder (`../data/raw/` relative to this notebook).

In [10]:
# Today's date as YYYY-MM-DD, available as a suffix for the stored file names
current_date = date.today().isoformat()

# Output folder and the base name (without extension) of each stored file
output_base = '../data/raw/'
output_filenames = [
    'global_confirmed',
    'global_deaths',
    'global_recovered',
    'us_confirmed',
    'us_deaths',
]

# Downloaded dataframes, listed in the same order as output_filenames
dfs = [
    global_confirmed,
    global_deaths,
    global_recovered,
    us_confirmed,
    us_deaths,
]

In [14]:
# Store every downloaded dataframe as <output_base><name>.csv.
# Create the target folder first so a fresh checkout without data/raw does not
# fail on the first write.
Path(output_base).mkdir(parents=True, exist_ok=True)

# dfs and output_filenames are parallel lists; fail loudly if they drift apart,
# because zip would otherwise silently drop the unmatched entries.
assert len(dfs) == len(output_filenames), 'dfs and output_filenames must have the same length'

for frame, name in zip(dfs, output_filenames):
    out_path = output_base + name + '.csv'
    # index=False: do not write the dataframe's row index as an extra column.
    # For dated snapshots, use output_base + name + '_' + current_date + '.csv'.
    frame.to_csv(out_path, index=False)

    print('stored:  ', out_path)

stored:   ../data/raw/global_confirmed.csv
stored:   ../data/raw/global_deaths.csv
stored:   ../data/raw/global_recovered.csv
stored:   ../data/raw/us_confirmed.csv
stored:   ../data/raw/us_deaths.csv
