# EDA GPW Data
---
Glenn Abastillas | April 9, 2020

In [88]:
import pandas as pd
import geopandas as gpd
from glob import glob as G
import numpy as np

### Load Data

In [97]:
# Load both inputs into one dict keyed by dataset name.
# Each glob pattern is resolved to its first match; the regions frame is
# narrowed to the two administrative-name columns plus the geometry.
data = dict(
    regions=gpd.read_file(G('../data/maps/regions/*.shp')[0])[
        ['ADM1_NAME', 'ADM2_NAME', 'geometry']
    ],
    demographics=pd.read_csv(G('../data/datasets/demographics/*.csv')[0]),
)

---
### Inspect Regions Data for Completeness

Check that the regions shapefile and the demographics table contain the same number of regions and sectors, and that the region and sector names match between the two.

In [98]:
# Name columns of the regions frame (first- and second-level admin names).
admin1 = data['regions']['ADM1_NAME']
admin2 = data['regions']['ADM2_NAME']

# Count the distinct values in each column.
number_of_regions = len(admin1.unique())
number_of_sectors = len(admin2.unique())

print(f'No. Regions: {number_of_regions}\nNo. Sectors: {number_of_sectors}')

No. Regions: 9
No. Sectors: 39


In [99]:
# Name columns of the demographics table, counted the same way as the
# regions columns so the two printouts can be compared directly.
name1 = data['demographics']['NAME1']
name2 = data['demographics']['NAME2']

# Count the distinct values in each column.
number_of_regions = len(name1.unique())
number_of_sectors = len(name2.unique())

print(f'No. Regions: {number_of_regions}\nNo. Sectors: {number_of_sectors}')

No. Regions: 9
No. Sectors: 39


In [100]:
# Entries of the regions name columns that do not occur in the matching
# demographics column.
missing_from_admin1 = admin1.loc[~admin1.isin(name1)]
missing_from_admin2 = admin2.loc[~admin2.isin(name2)]

# The reverse direction: demographics names that do not occur in the
# matching regions column.
missing_from_name1 = name1.loc[~name1.isin(admin1)]
missing_from_name2 = name2.loc[~name2.isin(admin2)]

# Flatten the level-1 and level-2 mismatches into one array per dataset.
admin_names = np.concatenate(
    [missing_from_admin1.values, missing_from_admin2.values]
)
name_names = np.concatenate(
    [missing_from_name1.values, missing_from_name2.values]
)

# Two empty arrays mean every name was found on the other side.
admin_names, name_names

(array([], dtype=object), array([], dtype=object))

Create a mapping between the names that are spelled differently in the two datasets.

In [101]:
# Pairs of alternative spellings for the same place.
# NOTE(review): presumably (regions-shapefile name, demographics name),
# judging by the `to_admin` / `to_demog` lookups built below — confirm.
name_map = [
    ('Bissau', 'Aut. Bissau'),
    ('Sector Autonomo de Bissau', 'Sector 1'),
    ('Cacheu/Calequisse', 'Cacheu'),
    ('Mansaba', 'Masaba'),
    ('Gamamudo/Ganadu', 'Ganado'),
    ('Galomaro/Cosse', 'Cosse'),
]

# Forward lookup: first element of each pair -> second element.
to_demog = dict(name_map)
# Reverse lookup: second element of each pair -> first element.
to_admin = {second: first for first, second in name_map}

Rename data values according to `regions`

In [102]:
# Apply the `to_demog` spelling map to the name columns of both tables.
#
# The replaced columns are assigned back onto each frame instead of calling
# `frame[column].replace(..., inplace=True)`. That form mutates a temporary
# Series: pandas warns about it as chained assignment, and with Copy-on-Write
# enabled the parent frame is left unchanged, so the rename would silently
# do nothing.
#
# NOTE(review): both tables are mapped with `to_demog`, and `to_admin` is not
# used in this cell — confirm that this is the intended direction.
for table, columns in (
    ('demographics', ('NAME1', 'NAME2')),
    ('regions', ('ADM1_NAME', 'ADM2_NAME')),
):
    data[table] = data[table].assign(
        **{name: data[table][name].replace(to_demog) for name in columns}
    )

Save updated data.

In [96]:
# Write the regions frame to the first shapefile matching the same glob
# pattern the load cell reads from, i.e. the input file is overwritten.
regions_shapefile = G('../data/maps/regions/*.shp')[0]
data['regions'].to_file(regions_shapefile)

---
END