# Demographic data
***
Objective: get background on the countries, including at least a country's population (to scale Covid cases).

In [68]:
import numpy as np
import pandas as pd

from sqlalchemy import create_engine
%load_ext sql

from src.data.quick_queries import queryDB
qdb = queryDB('sqlite','../../data/processed/covid_db.sqlite')


%load_ext autoreload
%autoreload 2

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
sqlite:///../../data/processed/covid_db.sqlite
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Gather Data
***
Population data for 2020, and other demographic data can be found here: https://www.worldometers.info/world-population/population-by-country/

Whilst we mostly want the population data, we would include the other parameters as well whilst we have them for potential use in EDA.

#### 1.1. Download files & load data
***
Data is manually copied and stored in `data/raw/global_pop.csv`

In [4]:
df = pd.read_csv('../../data/raw/global_pop.csv')
df.head()

Unnamed: 0,rank,country,populations,yearly_change,net_change,density,land_area,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct
0,1,China,1439323776,0.39%,5540090,153,9388211,-348399,1.7,38,61%,18.47%
1,2,India,1380004385,0.99%,13586631,464,2973190,-532687,2.2,28,35%,17.70%
2,3,United States,331002651,0.59%,1937734,36,9147420,954806,1.8,38,83%,4.25%
3,4,Indonesia,273523615,1.07%,2898047,151,1811570,-98955,2.3,30,56%,3.51%
4,5,Pakistan,220892340,2.00%,4327022,287,770880,-233379,3.6,23,35%,2.83%


#### 1.2 Store the (raw) data in our SQL DB
***
For illustrative purposes, we will clean this data in SQL.

In [7]:
# database parameters
driver = 'sqlite'
filename = '../../data/processed/covid_db.sqlite'
driver+":///"+filename

'sqlite:///../../data/processed/covid_db.sqlite'

In [8]:
# make the connection
engine = create_engine(driver+":///"+filename)
connection = engine.connect()

In [9]:
%sql sqlite:///../../data/processed/covid_db.sqlite

In [10]:
%%sql
/*
create the table to store our raw population data
most numbers are formatted and stored as varchar
ignore length of varchar as this raw table will be dropped at end
*/

DROP TABLE IF EXISTS population_raw;

CREATE TABLE population_raw (
    rank int,
    country varchar,
    populations varchar,
    yearly_change varchar,
    net_change varchar,
    density int,
    land_area varchar,
    migrants varchar,
    fert_rate numeric,
    med_age int,
    urban_pop_pct varchar,
    world_share_pct varchar,
    PRIMARY KEY (rank, country));

 * sqlite:///../../data/processed/covid_db.sqlite
Done.
Done.


[]

In [11]:
# upload df to our table
df.to_sql('population_raw', con = engine, if_exists = 'append', index=False, chunksize = 1000)

## 2. Assess the Data
***
We need to re-format this data on it's own and ensure we can join this to the `stats` table on `country`.

#### 2.1 Assess Data

In [24]:
%%sql
/*first rows*/

SELECT *
  FROM population_raw
 LIMIT 5;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


rank,country,populations,yearly_change,net_change,density,land_area,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct
1,China,1439323776,0.39%,5540090,153,9388211,-348399,1.7,38,61%,18.47%
2,India,1380004385,0.99%,13586631,464,2973190,-532687,2.2,28,35%,17.70%
3,United States,331002651,0.59%,1937734,36,9147420,954806,1.8,38,83%,4.25%
4,Indonesia,273523615,1.07%,2898047,151,1811570,-98955,2.3,30,56%,3.51%
5,Pakistan,220892340,2.00%,4327022,287,770880,-233379,3.6,23,35%,2.83%


#### 2.2 Assess joining keys
***
Note: no FULL joins in SQLite

In [64]:
%%sql
SELECT DISTINCT stats.country AS not_in_population_raw
  FROM stats 
       LEFT JOIN population_raw
         ON population_raw.country = stats.country
 WHERE population_raw.country IS NULL

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


not_in_population_raw
Czech Republic
Ivory Coast
Saint Kitts and Nevis
Saint Vincent and the Grenadines
Sao Tome and Principe


In [65]:
%%sql
SELECT COUNT(DISTINCT stats.country) AS all_stats_countries,
       COUNT(DISTINCT population_raw.country) AS countries_found_in_population
  FROM stats 
       LEFT JOIN population_raw
         ON population_raw.country = stats.country;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


all_stats_countries,countries_found_in_population
185,180


In [66]:
%%sql
SELECT DISTINCT population_raw.country AS not_in_stats,
       populations
  FROM population_raw 
       LEFT JOIN stats
         ON population_raw.country = stats.country
 WHERE stats.country IS NULL
 ORDER BY rank
 LIMIT 20;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


not_in_stats,populations
Côte d'Ivoire,26378274
North Korea,25778816
Czech Republic (Czechia),10708981
Hong Kong,7496981
Turkmenistan,6031200
Puerto Rico,2860853
Réunion,895312
Solomon Islands,686884
Macao,649335
Guadeloupe,400124


#### 2.3 DATA ASSESSMENT NOTES
**Required changes per column:**
* populations: remove ‘,’ and convert to integer
* yearly_change: remove ‘%’, convert to numeric, change name to ‘yearly_change_pct’
* net_change: remove ‘,’ and convert to integer
* Density: convert to integer
* land_area:  remove ‘,’ and convert to integer
* migrants:  remove ‘,’ and convert to integer
* fert_rate: convert to numeric
* med_age convert to integer
* urban_pop_pct: remove ‘%’ and convert to integer
* world_share_pct: remove ‘%’ and convert to numeric

**With respect to the joining keys:**
* 180 out of 185 are correct!
* For the remaining 5, we can trace back the correct names (i.e. Czech Republic (Czechia) -> Czech Republic)
* No data in stats for the following countries:
    * Most notable exceptions are North Korea and Turkmenistan (most likely due to the lack of transpartent data)
    * Other countries not covered in stats are often subsidiaries of kingdoms (i.e. Caribbean Netherlands, British Virgin Islands, New Caledonia, Puerto Rico) - we will not pursue further data gathering for these cases.


## 3. Clean Data
***

#### 3.1 Update formats

In [43]:
%%sql
-- create clean populations table
DROP TABLE IF EXISTS populations;

CREATE TABLE populations AS
SELECT
    rank,
    country,
    CAST(REPLACE(populations,',','') AS integer) AS population,
    CAST(REPLACE(yearly_change,'%','') AS numeric) AS yearly_change_pct,
    CAST(REPLACE(net_change,',','') AS integer) AS net_change,
    CAST(density AS integer) AS density,
    CAST(REPLACE(land_area,',','') AS integer) AS land_are,
    CAST(REPLACE(migrants,',','') AS integer) AS migrants,
    CAST(fert_rate AS numeric) AS fert_rate,
    CAST(med_age AS integer) AS med_age,
    CAST(REPLACE(urban_pop_pct,'%','') AS integer) AS urban_pop_pct,
    CAST(REPLACE(world_share_pct,'%','') AS numeric) AS world_share_pct
FROM population_raw;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.
Done.


[]

In [44]:
%%sql
SELECT * FROM populations LIMIT 5;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


rank,country,population,yearly_change_pct,net_change,density,land_are,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct
1,China,1439323776,0.39,5540090,153,9388211,-348399,1.7,38,61,18.47
2,India,1380004385,0.99,13586631,464,2973190,-532687,2.2,28,35,17.7
3,United States,331002651,0.59,1937734,36,9147420,954806,1.8,38,83,4.25
4,Indonesia,273523615,1.07,2898047,151,1811570,-98955,2.3,30,56,3.51
5,Pakistan,220892340,2.0,4327022,287,770880,-233379,3.6,23,35,2.83


#### 3.2 Align merging keys
Based on the observed misalignment between the joining keys above, we wish to adjust both the stats table and the populations table. 

To run sql directly from python (i.e. in a loop), we use our `quick_queries.py` package (based on sqlalchemy).

In [48]:
%%sql
/*remove apostrophe*/

UPDATE populations
   SET country = replace(country, char(39), '')
 WHERE country LIKE 'C_te%';

 * sqlite:///../../data/processed/covid_db.sqlite
Done.
0 rows affected.


[]

In [73]:
def updateCountry(original, replacement):
    query = """
        UPDATE populations
        SET country = '{}'
        WHERE country = '{}';
        """.format(replacement, original)

    qdb.admin_query(query)

In [74]:
# translation table: current country --> target country
pop_stats = {
    'Czech Republic (Czechia)' : 'Czech Republic',
    'Côte dIvoire' : 'Ivory Coast',
    'Saint Kitts & Nevis' : 'Saint Kitts and Nevis',
    'St. Vincent & Grenadines' : 'Saint Vincent and the Grenadines',
    'Sao Tome & Principe' : 'Sao Tome and Principe'}

# run the update
for key in pop_stats.keys():
    updateCountry(key, pop_stats[key])

#### 3.3 Update Stats table
As mentioned above, we also need to update the stats table. This is done in the main notebook in section 2.7 (interatively added). 

#### 3.4 Finalise populations table

In [83]:
%%sql
--check formats
PRAGMA table_info(populations);

 * sqlite:///../../data/processed/covid_db.sqlite
0 rows affected.


[]

In [78]:
%%sql
SELECT * FROM populations
LIMIT 5;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


rank,country,population,yearly_change_pct,net_change,density,land_are,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct
1,China,1439323776,0.39,5540090,153,9388211,-348399,1.7,38,61,18.47
2,India,1380004385,0.99,13586631,464,2973190,-532687,2.2,28,35,17.7
3,United States,331002651,0.59,1937734,36,9147420,954806,1.8,38,83,4.25
4,Indonesia,273523615,1.07,2898047,151,1811570,-98955,2.3,30,56,3.51
5,Pakistan,220892340,2.0,4327022,287,770880,-233379,3.6,23,35,2.83


In [86]:
%%sql
--there should be no countries in stats that we cannot match in populations
SELECT DISTINCT stats.country AS not_in_population_raw
  FROM stats 
       LEFT JOIN populations
         ON populations.country = stats.country
 WHERE populations.country IS NULL

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


not_in_population_raw


#### 3.5 Remove `populations_raw`

In [99]:
%%sql
DROP TABLE population_raw;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


[]

In [100]:
qdb.engine.table_names()

['populations', 'stats']

## 4. Usage Example
***
We can use this new table to calculate the covid cases per 1 Million inhabitants. Let's do so for `2020-07-03` as example!

In [101]:
%%sql
SELECT stats.country,
       confirmed AS total_cases,
       ROUND(confirmed / (population*1.0/1000000),0) AS confirmed_per_M
  FROM stats
       JOIN populations
         ON stats.country = populations.country
 WHERE date = '2020-07-03' AND population > 5000000
 ORDER BY confirmed_per_M DESC
 LIMIT 10;

 * sqlite:///../../data/processed/covid_db.sqlite
Done.


country,total_cases,confirmed_per_M
Chile,288089,15070.0
Peru,295599,8965.0
Oman,43929,8602.0
United States,2794153,8441.0
Singapore,44479,7603.0
Brazil,1539081,7241.0
Sweden,71419,7072.0
Belarus,62997,6667.0
Saudi Arabia,201801,5797.0
Spain,250545,5359.0
