# Assess Covid-19 data
***

## Table of Contents
<ul>
<li><a href="#manual_processing">Manual Processing (only confirmed)</a></li>
<li><a href="#automated_processing">Automated Processingg (all data)</a></li>
<li><a href="#sqlite">Store clean data in SQLite DB</a></li>
</ul>

In [30]:
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# function for cleaning
from src.data.clean_dataset import cleanData
cd = cleanData()

from sqlalchemy import create_engine
%load_ext sql

%load_ext autoreload
%autoreload 2

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<a id='manual_processing'></a>
## 1. Manual Processing
***

In [31]:
df = pd.read_csv('../data/raw/global_confirmed.csv')
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,2335,2469,2704,2894,3224,3392,3563,3778,4033,4402
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,782,789,795,803,820,832,842,850,856,868
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,4154,4295,4474,4648,4838,4997,5182,5369,5558,5723
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,745,747,748,750,751,751,752,752,754,755
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,30,35,35,35,36,36,36,43,43,45


#### Rename Columns
Change to all lower-case and single terms.

In [32]:
df.rename(columns = {'Country/Region' : 'country',
                     'Province/State' : 'state',
                     'Lat' : 'lat',
                     'Long' : 'long'}, inplace=True)

# check
df.head(2)

Unnamed: 0,state,country,lat,long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,2335,2469,2704,2894,3224,3392,3563,3778,4033,4402
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,782,789,795,803,820,832,842,850,856,868


#### Get separate df for state, country, lat, long
***
Capture country, state (included since lat/long data is on state level), lat, long


In [33]:
country_data = df[['country','state','lat','long']]
country_data.head()

Unnamed: 0,country,state,lat,long
0,Afghanistan,,33.0,65.0
1,Albania,,41.1533,20.1683
2,Algeria,,28.0339,1.6596
3,Andorra,,42.5063,1.5218
4,Angola,,-11.2027,17.8739


#### Aggregate data per country
***
* drop lat, long columns (no aggregation needed here)
* Sum daily values over all provinces for a country

In [34]:
# drop lat/long
df.drop(['lat','long'], axis=1, inplace=True)
df.head()

Unnamed: 0,state,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20
0,,Afghanistan,0,0,0,0,0,0,0,0,...,2335,2469,2704,2894,3224,3392,3563,3778,4033,4402
1,,Albania,0,0,0,0,0,0,0,0,...,782,789,795,803,820,832,842,850,856,868
2,,Algeria,0,0,0,0,0,0,0,0,...,4154,4295,4474,4648,4838,4997,5182,5369,5558,5723
3,,Andorra,0,0,0,0,0,0,0,0,...,745,747,748,750,751,751,752,752,754,755
4,,Angola,0,0,0,0,0,0,0,0,...,30,35,35,35,36,36,36,43,43,45


In [35]:
# group by coutry and store in new df
df_country = df.groupby('country').sum().reset_index()

# check
assert df[df['country']=='Australia']['5/7/20'].sum() == df_country[df_country['country']=='Australia']['5/7/20'].iloc[0]

In [36]:
df_country.head()

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,2335,2469,2704,2894,3224,3392,3563,3778,4033,4402
1,Albania,0,0,0,0,0,0,0,0,0,...,782,789,795,803,820,832,842,850,856,868
2,Algeria,0,0,0,0,0,0,0,0,0,...,4154,4295,4474,4648,4838,4997,5182,5369,5558,5723
3,Andorra,0,0,0,0,0,0,0,0,0,...,745,747,748,750,751,751,752,752,754,755
4,Angola,0,0,0,0,0,0,0,0,0,...,30,35,35,35,36,36,36,43,43,45


#### Move dates to a single row
Transpose table - using 'melt' function

In [37]:
df2 = df_country.melt(id_vars=["country"], 
                      var_name="date", 
                      value_name="confirmed")

# check
assert df2[(df2['country']=='Australia') & (df2['date']=='5/7/20')]['confirmed'].iloc[0] ==df_country[df_country['country']=='Australia']['5/7/20'].iloc[0]

# view
df2.head()

Unnamed: 0,country,date,confirmed
0,Afghanistan,1/22/20,0
1,Albania,1/22/20,0
2,Algeria,1/22/20,0
3,Andorra,1/22/20,0
4,Angola,1/22/20,0


#### Change date format
* Use string processing to change format to `YYYY-MM-DD`
* change data-format to datetime

In [38]:
# extract individual date elements
date_parts = df2.date.str.split('/', expand=True) # month
date_parts.columns = ['month','day','year']

# pad month and day to 2 digits
date_parts.month = date_parts.month.str.pad(width=2, side='left',fillchar='0')
date_parts.day = date_parts.day.str.pad(width=2, side='left',fillchar='0')

# add '20' to year
date_parts.year = np.repeat('20',len(date_parts)) + date_parts.year

# get the full date
full_date = date_parts['year'] + '-' + date_parts['month'] + '-' + date_parts['day']

In [39]:
# replace date column with clean date
df2['date'] = full_date
df2['date'] = pd.to_datetime(df2['date'], format="%Y-%m-%d")

In [40]:
df2.head()

Unnamed: 0,country,date,confirmed
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0
2,Algeria,2020-01-22,0
3,Andorra,2020-01-22,0
4,Angola,2020-01-22,0


In [41]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20570 entries, 0 to 20569
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   country    20570 non-null  object        
 1   date       20570 non-null  datetime64[ns]
 2   confirmed  20570 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 482.2+ KB


#### Change country 'US' to 'United States'
replace US with united states

In [42]:
df2.replace('US','United States', inplace=True)

In [43]:
# check - should
assert len(df2[df2['country']=='US']) == 0

# visual check
df2[df2['country']=='United States']

Unnamed: 0,country,date,confirmed
173,United States,2020-01-22,1
360,United States,2020-01-23,1
547,United States,2020-01-24,2
734,United States,2020-01-25,2
921,United States,2020-01-26,5
...,...,...,...
19808,United States,2020-05-06,1229331
19995,United States,2020-05-07,1257023
20182,United States,2020-05-08,1283929
20369,United States,2020-05-09,1309550


<a id='automated_processing'></a>
## 2. Automated Processing
***
* Automate the steps above
* Apply to `confirmed` (above), `deaths` and `recovered` (all have same structure)
* Combine dataframes to get 2 outputs: country (country, state, lat, long) and covid (country, date, confirmed, death, recovered)

#### confirmed

In [44]:
# collect raw data
confirmed_raw = pd.read_csv('../data/raw/global_confirmed.csv')

# clean the data
confirmed_cleaned = cd.cleanData(confirmed_raw,'confirmed')
confirmed_cleaned.head(2)

Unnamed: 0,country,date,confirmed
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0


#### deaths

In [45]:
# collect raw data
death_raw = pd.read_csv('../data/raw/global_deaths.csv')

# clean the data
death_cleaned = cd.cleanData(death_raw,'death')
death_cleaned.head(2)

Unnamed: 0,country,date,death
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0


#### recovered

In [46]:
# collect raw data
recovered_raw = pd.read_csv('../data/raw/global_recovered.csv')

# clean the data
recovered_cleaned = cd.cleanData(recovered_raw,'recovered')
recovered_cleaned.head(2)

Unnamed: 0,country,date,recovered
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0


#### combine dataframes

In [47]:
confirmed_cleaned.shape

(20570, 3)

In [48]:
death_cleaned.shape

(20570, 3)

In [49]:
recovered_cleaned.shape

(20570, 3)

In [50]:
# combine the dataframes
combined = confirmed_cleaned.merge(death_cleaned, on = ['country','date'], how='inner').merge(recovered_cleaned, on = ['country','date'], how='inner')
combined.tail()

Unnamed: 0,country,date,confirmed,death,recovered
20565,West Bank and Gaza,2020-05-10,375,2,263
20566,Western Sahara,2020-05-10,6,0,5
20567,Yemen,2020-05-10,51,8,1
20568,Zambia,2020-05-10,267,7,117
20569,Zimbabwe,2020-05-10,36,4,9


In [51]:
# create the country dataframe
country_data = confirmed_raw[['Province/State','Country/Region','Lat','Long']]
country_data.columns = ['country','state','lat','long']
country_data.head()

Unnamed: 0,country,state,lat,long
0,,Afghanistan,33.0,65.0
1,,Albania,41.1533,20.1683
2,,Algeria,28.0339,1.6596
3,,Andorra,42.5063,1.5218
4,,Angola,-11.2027,17.8739


<a id='sqlite'></a>
## 3. Store dataframes in SQLite DB
***
From above, we need to store combined and country data

#### create DB

In [52]:
# database parameters
driver = 'sqlite'
filename = '../data/processed/covid.sqlite'
driver+":///"+filename

'sqlite:///../data/processed/covid.sqlite'

In [24]:
# make the connection
engine = create_engine(driver+":///"+filename)
connection = engine.connect()

In [25]:
%sql sqlite:///../data/processed/covid.sqlite

In [26]:
%%sql
-- if there are tables - we need to drop them first otherwise we'll duplicate.
DROP TABLE IF EXISTS stats;
DROP TABLE IF EXISTS country;

 * sqlite:///../data/processed/covid.sqlite
Done.
Done.


[]

In [27]:
# check for a proper connection - we expect no tables to be present yet!
print(engine.table_names())

[]


#### load tables

In [28]:
# load tables in DB
combined.to_sql('stats', con = engine, if_exists = 'append', index=False, chunksize = 1000)
country_data.to_sql('country', con = engine, if_exists = 'append', index=False, chunksize = 1000)

In [29]:
# check stats table
stats_len = %sql SELECT COUNT(*) FROM stats
assert len(combined) == stats_len[0][0]

# check country table
country_len = %sql SELECT COUNT(*) FROM country
assert len(country_data) == country_len[0][0]

 * sqlite:///../data/processed/covid.sqlite
Done.
 * sqlite:///../data/processed/covid.sqlite
Done.
