In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

In [4]:
# function for querying db
from src.data.query_db import queryDB
qdb = queryDB('sqlite','../data/processed/covid.sqlite')

sqlite:///../data/processed/covid.sqlite


## 1. Load Data
***
We've already collected country data on three levels in `data/raw`. We'll now take the most granular set (where we have the most countries) and captured the country-name and continent.

#### country - continent mapping

In [33]:
path = '../data/raw'
countries = gpd.read_file(path + '/' + 'ne_10m_admin_0_countries.shp')[['ADMIN','CONTINENT']]
countries.head(5)

Unnamed: 0,ADMIN,CONTINENT
0,Indonesia,Asia
1,Malaysia,Asia
2,Chile,South America
3,Bolivia,South America
4,Peru,South America


In [34]:
countries.groupby('CONTINENT').size()

CONTINENT
Africa                     54
Antarctica                  1
Asia                       59
Europe                     51
North America              42
Oceania                    26
Seven seas (open ocean)     9
South America              13
dtype: int64

#### our current countries
As this data is static, we'll add it to the populations table

In [35]:
qdb.engine.table_names()

['country', 'daily_stats', 'exp_stats', 'populations', 'stats']

In [36]:
query = """
    SELECT *
    FROM populations"""

df = qdb.execute_query(query)
df.head()

235 rows affected


Unnamed: 0,rank,country,population,yearly_change_pct,net_change,density,land_are,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct,scaled_pop
0,1,China,1439323776,0.39,5540090,153,9388211,-348399.0,1.7,38,61,18.47,1439.32
1,2,India,1380004385,0.99,13586631,464,2973190,-532687.0,2.2,28,35,17.7,1380.0
2,3,United States,331002651,0.59,1937734,36,9147420,954806.0,1.8,38,83,4.25,331.0
3,4,Indonesia,273523615,1.07,2898047,151,1811570,-98955.0,2.3,30,56,3.51,273.52
4,5,Pakistan,220892340,2.0,4327022,287,770880,-233379.0,3.6,23,35,2.83,220.89


## 2. Clean
***
Ensure our countries match

#### 2.1 assessment

In [37]:
merged = df.merge(countries, left_on = 'country', right_on = 'ADMIN', how = 'outer')

In [38]:
# missing in population-table

In [40]:
merged[merged['ADMIN'].isnull()][['country']]

Unnamed: 0,country
2,United States
15,DR Congo
23,Tanzania
85,Czech Republic
98,Serbia
103,Hong Kong
116,Congo
120,State of Palestine
147,North Macedonia
155,Timor-Leste


In [41]:
# missing from shapefile (match to the above)
merged[merged['country'].isnull()][['ADMIN','CONTINENT']].sort_values('ADMIN')

Unnamed: 0,ADMIN,CONTINENT
255,Akrotiri Sovereign Base Area,Asia
268,Aland,Europe
256,Antarctica,Antarctica
280,Ashmore and Cartier Islands,Oceania
281,Bajo Nuevo Bank (Petrel Is.),North America
254,Baykonur Cosmodrome,Asia
271,British Indian Ocean Territory,Seven seas (open ocean)
278,Clipperton Island,Seven seas (open ocean)
276,Coral Sea Islands,Oceania
252,Cyprus No Mans Area,Asia


#### 2.2 clean data
Create a translation dict, mapping the current value from the shapfile, to the correct value from our population table.

Note that 'overseas departments' are often not included as a separate country in the shapefile. Examples of this would be Guadeloupe (department of France). We're adding these to their gouvernemental continent, in this case: France.

Arguments could be made to count this with South America as well.

In [47]:
# translation table - target: current
country_trans = {
     'United States of America' : 'United States',
     'Democratic Republic of the Congo' : 'DR Congo',
     'United Republic of Tanzania' : 'Tanzania',
     'Czechia' : 'Czech Republic',
     'Republic of Serbia' : 'Serbia',
     'Hong Kong S.A.R.' : 'Hong Kong',
     'Republic of the Congo' : 'Congo',
     'Palestine' : 'State of Palestine',
     'Macedonia' : 'North Macedonia',
     'East Timor' : 'Timor-Leste',
     'eSwatini' : 'Eswatini',
     'Macao S.A.R' : 'Macao',
     'The Bahamas' : 'Bahamas',
     'São Tomé and Principe' : 'Sao Tome and Principe',
     'Federated States of Micronesia' : 'Micronesia',
     'United States Virgin Islands' : 'U.S. Virgin Islands',
     'Faroe Islands' : 'Faeroe Islands',
     'Turks and Caicos Islands' : 'Turks and Caicos',
     'Wallis and Futuna' : 'Wallis & Futuna',
     'Saint Pierre and Miquelon' : 'Saint Pierre & Miquelon'}


continent_trans = {
    'Réunion' : 'Europe',
    'Guadeloupe' : 'Europe',
    'Martinique' : 'Europe',
    'French Guiana' : 'Europe',
    'Channel Islands' : 'Europe',
    'Gibraltar' : 'Europe',
    'Caribbean Netherlands' : 'Europe',
    'Mayotte' : 'Europe',
    'Tuvalu' : 'Oceania',
    'Tokelau' : 'Oceania',
    'Holy See' : 'Europe'}

In [49]:
# map countries
countries['ADMIN'].replace(country_trans, inplace = True)

In [50]:
merged = df.merge(countries, left_on = 'country', right_on = 'ADMIN', how = 'outer')
merged[merged['ADMIN'].isnull()][['country']]

Unnamed: 0,country
161,Réunion
175,Guadeloupe
178,Martinique
181,French Guiana
185,Mayotte
189,Channel Islands
220,Caribbean Netherlands
233,Tokelau
234,Holy See


In [54]:
# dataframe with the final country - continent mapping
continent_map = pd.DataFrame.from_dict(continent_trans, orient = 'index').reset_index()
continent_map.columns = ['ADMIN', 'CONTINENT']
continent_map

Unnamed: 0,ADMIN,CONTINENT
0,Réunion,Europe
1,Guadeloupe,Europe
2,Martinique,Europe
3,French Guiana,Europe
4,Channel Islands,Europe
5,Gibraltar,Europe
6,Caribbean Netherlands,Europe
7,Mayotte,Europe
8,Tuvalu,Oceania
9,Tokelau,Oceania


In [56]:
# update our countries
countries = countries.append(continent_map)
countries.columns = ['ADMIN','continent']

# check
merged = df.merge(countries, left_on = 'country', right_on = 'ADMIN', how = 'outer')
merged[merged['ADMIN'].isnull()][['country']]

Unnamed: 0,country


In [58]:
# final merge
df2 = df.merge(countries, left_on = 'country', right_on = 'ADMIN', how = 'left')
df2.head()

Unnamed: 0,rank,country,population,yearly_change_pct,net_change,density,land_are,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct,scaled_pop,ADMIN,continent
0,1,China,1439323776,0.39,5540090,153,9388211,-348399.0,1.7,38,61,18.47,1439.32,China,Asia
1,2,India,1380004385,0.99,13586631,464,2973190,-532687.0,2.2,28,35,17.7,1380.0,India,Asia
2,3,United States,331002651,0.59,1937734,36,9147420,954806.0,1.8,38,83,4.25,331.0,United States,North America
3,4,Indonesia,273523615,1.07,2898047,151,1811570,-98955.0,2.3,30,56,3.51,273.52,Indonesia,Asia
4,5,Pakistan,220892340,2.0,4327022,287,770880,-233379.0,3.6,23,35,2.83,220.89,Pakistan,Asia


In [59]:
df2[df2['continent'].isnull()]

Unnamed: 0,rank,country,population,yearly_change_pct,net_change,density,land_are,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct,scaled_pop,ADMIN,continent


## 3. Update Table
***

In [62]:
# update table with new column
query = """
    ALTER TABLE populations
    ADD continent varchar"""

qdb.execute_query(query)

unable to execute query
---
(sqlite3.OperationalError) duplicate column name: continent
[SQL: 
    ALTER TABLE populations
    ADD continent varchar]
(Background on this error at: http://sqlalche.me/e/e3q8)
---


In [73]:
# insert into this column
for i in range(len(df2)):
    query = """
        UPDATE populations
        SET continent = '{}'
        WHERE country = '{}'
        """.format(df2['continent'][i], df2['country'][i])

    qdb.execute_query(query)

unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This result object does not return rows. It has been closed automatically.
---
unable to execute query
---
This resu

In [75]:
q = """SELECT * 
       FROM populations 
       WHERE continent IS NULL LIMIT 5"""
qdb.execute_query(q)

0 rows affected


Unnamed: 0,rank,country,population,yearly_change_pct,net_change,density,land_are,migrants,fert_rate,med_age,urban_pop_pct,world_share_pct,scaled_pop,continent


In [76]:
# DONE!