# Main Covid-19 data
***
Objectives:
* Country Level
* Daily Frequency


In [1]:
import numpy as np
import pandas as pd

# Allow web access for downloading: https://stackoverflow.com/a/60671292
# WARNING: this swaps the default HTTPS context factory for the *unverified* one,
# so TLS certificates are not validated for HTTPS requests that use the default
# context in this kernel (including the downloads below).
# NOTE(review): prefer fixing the environment's CA certificates over disabling verification.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from sqlalchemy import create_engine
%load_ext sql

%load_ext autoreload
%autoreload 2

In [2]:
from src.data.process_data import cleanMainDataset

sqlite:///../../data/processed/covid_db.sqlite


> Workaround for SSL issue in venv: https://stackoverflow.com/a/60671292


## 1. Gather Data
***
We can get the data from the [GitHub repository](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) of the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University.

The data is updated daily and is reported per country (often at sub-country level as well), which suits our requirements. The data is split into 5 files:
* 3x global: confirmed, deaths, recovered
* 2x US: confirmed, deaths

#### 1.1 Download the files

In [3]:
# Root of the raw CSV time-series folder in the CSSE GitHub repository
base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'

# Map each dataset key to its file name; all three global files share one naming pattern
files = {
    f'global_{metric}': f'time_series_covid19_{metric}_global.csv'
    for metric in ('confirmed', 'deaths', 'recovered')
}

In [4]:
# Download the global confirmed-cases time series (one column per reporting date)
global_confirmed = pd.read_csv(f"{base_url}{files['global_confirmed']}")
global_confirmed.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,32324,32672,32951,33190,33384,33594,33908,34194,34366,34451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2752,2819,2893,2964,3038,3106,3188,3278,3371,3454


In [5]:
# Download the global deaths time series (one column per reporting date)
global_deaths = pd.read_csv(f"{base_url}{files['global_deaths']}")
global_deaths.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,819,826,864,898,920,936,957,971,994,1010
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,72,74,76,79,81,83,83,85,89,93


In [6]:
# Download the global recovered time series (one column per reporting date)
global_recovered = pd.read_csv(f"{base_url}{files['global_recovered']}")
global_recovered.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,17331,19164,19366,20103,20179,20700,20847,20882,21135,21216
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,1592,1637,1657,1702,1744,1791,1832,1875,1881,1946


#### 1.2 Temporarily store the raw files

In [7]:
# Folder (relative to this notebook) that receives the untouched downloads
raw_folder = '../../data/raw/'

# Write each downloaded frame to <raw_folder>/<name>.csv without the pandas index
for raw_name, raw_frame in (
    ('global_confirmed', global_confirmed),
    ('global_deaths', global_deaths),
    ('global_recovered', global_recovered),
):
    raw_frame.to_csv(f'{raw_folder}{raw_name}.csv', index=False)

## 2. Assess Data
***

In [8]:
# Preview the first rows of the raw confirmed-cases frame to assess its structure
global_confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,32324,32672,32951,33190,33384,33594,33908,34194,34366,34451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2752,2819,2893,2964,3038,3106,3188,3278,3371,3454
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,15070,15500,15941,16404,16879,17348,17808,18242,18712,19195
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,855,855,855,855,855,855,855,855,855,855
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,328,346,346,346,386,386,396,458,462,506


### issues to address
1. Rename columns (lowercase, single words)
2. Move meta-data (`Lat`/`Long`) to separate dataframe
3. Aggregate to country level (No `Province/State` needed)
4. Move dates to a single column (unpivot - tidy data requirements)
5. Change date structure to ISO (`yyyy-mm-dd`)

#### 2.1 Rename Columns

In [9]:
# Normalise the identifier columns to lowercase, single-word names.
# The renamed frame is re-bound to the same variable instead of using
# `inplace=True`, so the cell reads as a plain assignment and can be re-run safely.
global_confirmed = global_confirmed.rename(columns={
    'Country/Region': 'country',
    'Province/State': 'state',
    'Lat': 'lat',
    'Long': 'long',
})

# check
global_confirmed.head(2)

Unnamed: 0,state,country,lat,long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,32324,32672,32951,33190,33384,33594,33908,34194,34366,34451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2752,2819,2893,2964,3038,3106,3188,3278,3371,3454


#### 2.2 Move meta-data

In [10]:
# Keep the location meta-data in its own frame before it is removed from the counts
meta_columns = ['country', 'state', 'lat', 'long']
country_data = global_confirmed[meta_columns]
country_data.head()

Unnamed: 0,country,state,lat,long
0,Afghanistan,,33.0,65.0
1,Albania,,41.1533,20.1683
2,Algeria,,28.0339,1.6596
3,Andorra,,42.5063,1.5218
4,Angola,,-11.2027,17.8739


In [11]:
# Drop the coordinate columns from the counts frame.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
global_confirmed = global_confirmed.drop(columns=['lat', 'long'], errors='ignore')
global_confirmed.head()

Unnamed: 0,state,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,,Afghanistan,0,0,0,0,0,0,0,0,...,32324,32672,32951,33190,33384,33594,33908,34194,34366,34451
1,,Albania,0,0,0,0,0,0,0,0,...,2752,2819,2893,2964,3038,3106,3188,3278,3371,3454
2,,Algeria,0,0,0,0,0,0,0,0,...,15070,15500,15941,16404,16879,17348,17808,18242,18712,19195
3,,Andorra,0,0,0,0,0,0,0,0,...,855,855,855,855,855,855,855,855,855,855
4,,Angola,0,0,0,0,0,0,0,0,...,328,346,346,346,386,386,396,458,462,506


#### 2.3 Aggregate to country level

In [12]:
# Group by country and store the totals in a new frame (one row per country).
# numeric_only=True sums only the numeric count columns and explicitly leaves out
# the text `state` column; older pandas dropped it silently, but pandas >= 2.0
# defaults to numeric_only=False and would otherwise try to aggregate it.
confirmed_country = (
    global_confirmed
    .groupby('country')
    .sum(numeric_only=True)
    .reset_index()
)

# check: the country-level value equals the sum over that country's state rows
australia_by_state = global_confirmed.loc[global_confirmed['country'] == 'Australia', '5/7/20'].sum()
australia_total = confirmed_country.loc[confirmed_country['country'] == 'Australia', '5/7/20'].iloc[0]
assert australia_by_state == australia_total

In [13]:
# Preview the country-level totals (still wide: one column per date)
confirmed_country.head()

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,32324,32672,32951,33190,33384,33594,33908,34194,34366,34451
1,Albania,0,0,0,0,0,0,0,0,0,...,2752,2819,2893,2964,3038,3106,3188,3278,3371,3454
2,Algeria,0,0,0,0,0,0,0,0,0,...,15070,15500,15941,16404,16879,17348,17808,18242,18712,19195
3,Andorra,0,0,0,0,0,0,0,0,0,...,855,855,855,855,855,855,855,855,855,855
4,Angola,0,0,0,0,0,0,0,0,0,...,328,346,346,346,386,386,396,458,462,506


#### 2.4 Unpivot table

In [14]:
# Unpivot: turn the per-date columns into (country, date, confirmed) rows
confirmed = pd.melt(
    confirmed_country,
    id_vars=["country"],
    var_name="date",
    value_name="confirmed",
)

# check: a single (country, date) cell survives the reshape unchanged
assert (
    confirmed.loc[(confirmed['country'] == 'Australia') & (confirmed['date'] == '5/7/20'), 'confirmed'].iloc[0]
    == confirmed_country.loc[confirmed_country['country'] == 'Australia', '5/7/20'].iloc[0]
)

# view
confirmed.head()

Unnamed: 0,country,date,confirmed
0,Afghanistan,1/22/20,0
1,Albania,1/22/20,0
2,Algeria,1/22/20,0
3,Andorra,1/22/20,0
4,Angola,1/22/20,0


#### 2.5 Change date

In [15]:
# Source headers are month/day/two-digit-year strings; parse them into datetimes
source_date_format = '%m/%d/%y'
confirmed['date'] = pd.to_datetime(confirmed['date'], format=source_date_format)

In [16]:
# Inspect the last rows to confirm the dates now display as yyyy-mm-dd
confirmed.tail()

Unnamed: 0,country,date,confirmed
32519,West Bank and Gaza,2020-07-12,6230
32520,Western Sahara,2020-07-12,10
32521,Yemen,2020-07-12,1465
32522,Zambia,2020-07-12,1895
32523,Zimbabwe,2020-07-12,985


#### 2.6 Remove inconsistent countries
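A manual check of the country list revealed that some entries should not be treated as countries in this analysis. Examples are Kosovo (only partially recognised as a sovereign state) and two cruise ships (Diamond Princess and MS Zaandam) that were hit early on in the Covid crisis.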

In [17]:
# Entries in the `country` column that are excluded from the analysis
non_countries = ['Diamond Princess', 'MS Zaandam', 'Kosovo']

# Keep only the rows whose country is not on the exclusion list, then verify
is_non_country = confirmed['country'].isin(non_countries)
confirmed = confirmed[~is_non_country]
assert not confirmed['country'].isin(non_countries).any()

#### 2.7 Update names (ITERATION 2)
Whilst working with the population data (see relevant notebook), we took a closer look at the exact names as these are used to merge with this dataset. We want to clean the following names (dictionary has the current name as key and the target name as value).

In [18]:
# Country-name translations: key = name used in this dataset, value = target name
trans_stats = {
    "Cote d'Ivoire": "Ivory Coast",
    "Burma": "Myanmar",
    "Congo (Brazzaville)": "Congo",
    "Congo (Kinshasa)": "DR Congo",
    "West Bank and Gaza": "State of Palestine",
    "Taiwan*": "Taiwan",
    "Czechia": "Czech Republic",
    "Korea, South": "South Korea",
    "US": "United States",
}

In [19]:
# Translate country names using `trans_stats` (current name -> target name).
# The nested-dict form limits the replacement to the `country` column, so the
# date and count columns are never scanned, and the result is re-bound rather
# than mutating the previously filtered frame in place.
confirmed = confirmed.replace({'country': trans_stats})

# check: none of the old names is left in the country column
assert not confirmed['country'].isin(list(trans_stats)).any()

## 3. Automate Processing & Apply it to other sets
***
The process in **section 2** has been automated in the `process_data` module (function `cleanMainDataset`) and is applied here to the `confirmed`, `deaths` and `recovered` datasets.

#### 3.1 clean all three datasets

In [20]:
# Section 2 renamed and dropped columns on `global_confirmed`, so fetch a pristine copy
global_confirmed = pd.read_csv(f"{base_url}{files['global_confirmed']}")

In [21]:
# Run the packaged cleaning routine on the freshly downloaded confirmed-cases frame.
# The second argument appears to name the value column (the output column is `confirmed`).
confirmed = cleanMainDataset(global_confirmed, 'confirmed')
confirmed.head()

Unnamed: 0,country,date,confirmed
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0
2,Algeria,2020-01-22,0
3,Andorra,2020-01-22,0
4,Angola,2020-01-22,0


In [22]:
# Run the packaged cleaning routine on the raw deaths frame.
# The second argument appears to name the value column (the output column is `deaths`).
death = cleanMainDataset(global_deaths, 'deaths')
death.head()

Unnamed: 0,country,date,deaths
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0
2,Algeria,2020-01-22,0
3,Andorra,2020-01-22,0
4,Angola,2020-01-22,0


In [23]:
# Run the packaged cleaning routine on the raw recovered frame.
# The second argument appears to name the value column (the output column is `recovered`).
recovered = cleanMainDataset(global_recovered, 'recovered')
recovered.head()

Unnamed: 0,country,date,recovered
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0
2,Algeria,2020-01-22,0
3,Andorra,2020-01-22,0
4,Angola,2020-01-22,0


#### 3.2 check datasets
Start, end-date and countries

In [24]:
# Date range, row count and distinct dates, plus the number of distinct countries
confirmed.agg({
    'date': ['min', 'max', 'count', 'nunique'],
    'country': 'nunique',
})

Unnamed: 0,date,country
count,32005,
max,2020-07-12,
min,2020-01-22,
nunique,173,185.0


In [25]:
# Date range, row count and distinct dates, plus the number of distinct countries
death.agg({
    'date': ['min', 'max', 'count', 'nunique'],
    'country': 'nunique',
})

Unnamed: 0,date,country
count,32005,
max,2020-07-12,
min,2020-01-22,
nunique,173,185.0


In [26]:
# Date range, row count and distinct dates, plus the number of distinct countries
recovered.agg({
    'date': ['min', 'max', 'count', 'nunique'],
    'country': 'nunique',
})

Unnamed: 0,date,country
count,32005,
max,2020-07-12,
min,2020-01-22,
nunique,173,185.0


#### 3.3 combine to represent target table

In [27]:
# Combine the three cleaned datasets into one row per (country, date).
# validate='one_to_one' makes pandas raise a MergeError if either side has
# duplicate (country, date) keys, which would otherwise silently multiply rows.
merge_keys = ['country', 'date']
stats = (
    confirmed
    .merge(death, on=merge_keys, validate='one_to_one')
    .merge(recovered, on=merge_keys, validate='one_to_one')
)

# The default inner join drops keys missing from any input; fail loudly if that happens.
assert len(stats) == len(confirmed), 'inner merge dropped (country, date) rows'

stats.tail()

Unnamed: 0,country,date,confirmed,deaths,recovered
32000,State of Palestine,2020-07-12,6230,36,942
32001,Western Sahara,2020-07-12,10,1,8
32002,Yemen,2020-07-12,1465,417,659
32003,Zambia,2020-07-12,1895,42,1412
32004,Zimbabwe,2020-07-12,985,18,328


In [28]:
# (rows, columns) of the combined table; compare the row count with the checks in 3.2
stats.shape

(32005, 5)

In [29]:
# Spot-check the most recent Malaysian rows against an external source
stats.loc[stats['country'] == 'Malaysia'].tail()

Unnamed: 0,country,date,confirmed,deaths,recovered
31184,Malaysia,2020-07-08,8677,121,8486
31369,Malaysia,2020-07-09,8683,121,8499
31554,Malaysia,2020-07-10,8696,121,8511
31739,Malaysia,2020-07-11,8704,122,8515
31924,Malaysia,2020-07-12,8718,122,8519


## 4. Store Data in a DB
***
Sqlite:
* 2 tables from here: stats & population

#### 4.1 Create sqlite DB

In [30]:
# Pieces of the database URL: dialect name and path to the SQLite file
driver = 'sqlite'
filename = '../../data/processed/covid_db.sqlite'

# Show the assembled URL
f"{driver}:///{filename}"

'sqlite:///../../data/processed/covid_db.sqlite'

In [31]:
# Build the SQLAlchemy engine from the URL parts and open a connection on it
engine = create_engine(f"{driver}:///{filename}")
connection = engine.connect()

In [32]:
# Point the `sql` notebook extension (loaded in the first cell) at the same SQLite file
%sql sqlite:///../../data/processed/covid_db.sqlite

#### 4.2 Create stats table

In [33]:
%%sql sqlite:///../../data/processed/covid_db.sqlite
-- create the table to store our stats df
-- the table is dropped first, so re-running this cell starts from an empty table
DROP TABLE IF EXISTS stats;

-- one row per (country, date); the three count columns are nullable
-- NOTE(review): "date(1)" carries a length argument with no obvious meaning for a
-- date column - confirm the intended column type
CREATE TABLE stats (
    country varchar NOT NULL,
    date date(1) NOT NULL,
    confirmed int,
    deaths int,
    recovered int,
    PRIMARY KEY (country, date));

Done.
Done.


[]

In [34]:
# Append the combined frame to the existing `stats` table, 1000 rows per batch
stats.to_sql(
    'stats',
    con=engine,
    if_exists='append',
    index=False,
    chunksize=1000,
)

In [35]:
%%sql sqlite:///../../data/processed/covid_db.sqlite
-- sanity check of the upload: the five most recent rows for the United States
SELECT *
  FROM stats
 WHERE country = 'United States'
 ORDER BY date DESC
 LIMIT 5;

Done.


country,date,confirmed,deaths,recovered
United States,2020-07-12,3304942,135205,1006326
United States,2020-07-11,3245925,134777,995576
United States,2020-07-10,3184573,134092,983185
United States,2020-07-09,3117946,133290,969111
United States,2020-07-08,3054699,132300,953462
