# Communities Data Exploration


Explore, filter and work with Settlement Extents data from GRID3, UN OCHA COD-PP data for place names, HOT-OSM populated places, etc.

In [1]:
import pandas as pd
import geopandas as gpd
import plotly.express as px

from tqdm.notebook import tqdm
from geopy.distance import distance, lonlat

## Load and Explore Settlement Extents Communities (GRID3)

In [3]:
# Path to the GRID3 Settlement Extents GeoJSON (relative to the notebook)
se_file = "./data/Ghana_Settlement_Extents_Version_01.01.geojson"

In [4]:
# Load the full settlement-extents layer into a GeoDataFrame.
# NB: `iterator`, `chunksize` and `index` are pandas-reader style options, not
# `geopandas.read_file` parameters; they had no effect here (the recorded output
# shows the whole file loaded, and OBJECTID is only made the index in a later
# cell), so they are dropped. Use `rows=` if a partial read is ever needed.
se_df = gpd.read_file(se_file)
se_df.shape

(374257, 16)

In [5]:
# Settlement count per region (adm1_name), most frequent first
se_region_names = se_df["adm1_name"]
se_region_names.value_counts()

adm1_name
Ashanti             53094
Northern            39687
Western North       38713
Western             32932
Eastern             27371
Bono East           25294
Upper West          25062
Upper East          19162
Central             18170
Savannah            17711
Ahafo               17138
Bono                15409
Oti                 14136
Volta               13377
Northern East       12398
Greater Accra        4140
NA                    298
crosses boundary      165
Name: count, dtype: int64

In [6]:
# Regions of interest: region name -> admin-1 pcode.
# Only the keys (region names) are used for filtering in the cells below.
region_pcodes = dict(
    Ashanti="GH02",
    Eastern="GH06",
    Volta="GH14",
    Bono="GH03",
    Ahafo="GH01",
)

In [7]:
# Keep only settlements whose region is one of the selected regions.
# NB: This also drops the settlements labelled 'crosses boundary' (165) or 'NA' (298)
# in the value counts above, along with every region not listed in region_pcodes.
in_selected_region = se_df["adm1_name"].isin(list(region_pcodes))
regs_df = se_df.loc[in_selected_region].set_index("OBJECTID")
regs_df.shape

(126389, 15)

In [8]:
# Keep settlements whose UN-adjusted population lies in [1000, 200000] (both ends inclusive)
population = regs_df["pop_un_adj"]
in_pop_range = (population >= 1000) & (population <= 200000)
filt_df = regs_df.loc[in_pop_range].copy()
filt_df.shape

(1039, 15)

In [49]:
# Approximate centre point of each settlement polygon.
# Ignore warnings - visually analyzing centers shows they are fine.
# NB: Centroids are not always inside settlement shapes. Alternatively can use
# 'GeoSeries.representative_point'.
# The centroids are computed once and reused for both coordinate columns.
centroids = filt_df["geometry"].centroid
filt_df["approx_center_lat"] = centroids.y
filt_df["approx_center_lon"] = centroids.x


  filt_df["approx_center_lat"] = filt_df["geometry"].centroid.y

  filt_df["approx_center_lon"] = filt_df["geometry"].centroid.x


In [9]:
# Examine populations; determine population bounds
# f = px.histogram(filt_df, x="pop_un_adj")
# f.show("browser")

In [13]:
# How the population-filtered settlements are spread across the selected regions
filt_region_names = filt_df["adm1_name"]
filt_region_names.value_counts()

adm1_name
Eastern    332
Ashanti    322
Volta      225
Bono       119
Ahafo       41
Name: count, dtype: int64

In [14]:
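# Export (NB: the filename says 10k-200k, but the population filter above keeps 1,000-200,000 - reconcile before re-enabling)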
# filt_df.to_file("./data/AFPW-Communities-AllRegions.10k-200k.geojson")

In [50]:
# Peek at the first filtered settlement record
filt_df.iloc[:1]

Unnamed: 0_level_0,mgrs_code,country,iso,type,population,pop_un_adj,adm0_pcode,adm1_name,adm1_pcode,adm2_name,adm2_pcode,settl_pcode,Shape__Area,Shape__Length,geometry,Communities,NumCommunities,CommunityNames,approx_center_lat,approx_center_lon
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
47262,30NXM4069_02,Ghana,GHA,Small Settlement Area,1741.494917,1637.835636,GH,Ashanti,GH02,Adansi Akrofuom,GH0201,GH0201000022,4.5e-05,0.031784,"POLYGON ((-1.73039 6.05879, -1.73003 6.05862, ...","[{'community': 'Amponyase', 'source': 'HOT-OSM...",1.0,Amponyase,6.055197,-1.733869


In [98]:
# Explore (Tiles: ["OpenStreetMap", "CartoDB positron", "CartoDB dark_matter"])
# fig = filt_df.explore(column="pop_un_adj", cmap="Reds_r")
# fig

## Open and explore COD-PP UN OCHA data

In [17]:
# Path to the UN OCHA COD-PP populated places archive (zipped, relative to the notebook)
pp_file = "./data/UN OCHA - COD-PP - gha_ppl_1m_nga.zip"

In [18]:
# Read every populated place, then keep only those in the selected regions (ADM1_EN)
pp_all_df = gpd.read_file(pp_file)
pp_in_selected_region = pp_all_df["ADM1_EN"].isin(list(region_pcodes))
pp_df = pp_all_df.loc[pp_in_selected_region].copy()
pp_df.shape

(4654, 18)

In [19]:
# Peek at the first two populated-place records
pp_df.iloc[:2]

Unnamed: 0,LAT,LONG,CNTRY_CODE,NAME,REFNAME,Pcod,popPlaceCI,popPlace1,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn,validTo,geometry
2310,5.716667,-0.75,GHA,Ahyiam,Eshiem,GH06070001,0.0,Other,Asene Akroso Manso,GH0607,Eastern,GH06,Ghana,GH,2018-10-26,2021-03-08,,POINT (-0.75000 5.71667)
2311,5.716667,-0.75,GHA,Bantama,,GH06070002,0.0,Other,Asene Akroso Manso,GH0607,Eastern,GH06,Ghana,GH,2018-10-26,2021-03-08,,POINT (-0.75000 5.71667)


In [97]:
# Explore
# pp_df.explore(m=fig)

## Load and Explore HOT-OSM Ghana populated places data

In [21]:
# Path to the HOT-OSM populated places points shapefile archive (relative to the notebook)
hotosm_points_file = "./data/hotosm_gha_populated_places_points_shp.zip"

In [22]:
# Read the HOT-OSM points and declare their coordinate reference system as EPSG:4326
osm_df = gpd.read_file(hotosm_points_file).set_crs("EPSG:4326")
osm_df.shape

(9593, 9)

In [81]:
# Keep only the OSM points that carry a name
osm_df = osm_df.dropna(subset=["name"])
osm_df.shape

(6501, 9)

In [82]:
# Peek at the first two named OSM points
osm_df.iloc[:2]

Unnamed: 0,name,name_en,place,population,is_in,source,osm_id,osm_type,geometry
0,Koforidua,,city,130000.0,,,892162451,nodes,POINT (-0.26146 6.10033)
1,Pusiga,,town,,,,2330379909,nodes,POINT (-0.12769 11.08540)


In [95]:
# Explore
# osm_df.explore(m=fig, color="lime")

## Explore mapping communities to settlement extents - multi-labelling

In [87]:
# Sizes of the three inputs to the matching step: settlements, COD-PP places, OSM points
tuple(frame.shape for frame in (filt_df, pp_df, osm_df))

((1039, 20), (4654, 18), (6501, 9))

In [88]:
# Direct containment: label each settlement extent with every COD-PP and HOT-OSM
# point that falls inside its polygon.
#
# The three result columns are built as whole lists and assigned once after the
# loop. Cell-by-cell `.at` assignment of a list is not reliable when the
# "Communities" column does not exist yet, so the previous version depended on
# state left over from an earlier run.
# NB: "NumCommunities" is now an integer column (it used to be float).
settlement_communities = []

for s in tqdm(filt_df.itertuples(), total=len(filt_df)):
    contains = []

    # `point.within(polygon)` is the inverse of `polygon.contains(point)`; the
    # vectorised GeoSeries form selects the same rows, in the same order, without
    # a Python-level loop over every point.
    pp_inside = pp_df[pp_df.geometry.within(s.geometry)]
    for name, pcod in zip(pp_inside["NAME"], pp_inside["Pcod"]):
        contains.append({"community": name, "source": "COD-PP", "id": pcod, "id_field": "Pcod"})

    osm_inside = osm_df[osm_df.geometry.within(s.geometry)]
    for name, osm_id in zip(osm_inside["name"], osm_inside["osm_id"]):
        contains.append({"community": name, "source": "HOT-OSM", "id": osm_id, "id_field": "osm_id"})

    settlement_communities.append(contains)

    # if not contains:
    #     print(f"Warning: No community annotation for SE:\t{s.mgrs_code}")

# dtype=object keeps each cell as a Python list of annotation dicts
filt_df["Communities"] = pd.Series(settlement_communities, index=filt_df.index, dtype=object)
filt_df["NumCommunities"] = filt_df["Communities"].map(len)
filt_df["CommunityNames"] = filt_df["Communities"].map(
    lambda communities: " / ".join(c["community"] for c in communities)
)

0it [00:00, ?it/s]

In [89]:
# Number of settlements with no community label after the containment pass
filt_df["NumCommunities"].lt(1).sum()

143

In [90]:
# Fallback for settlements that contain no labelled point: attach the single
# nearest COD-PP / HOT-OSM place, provided it lies strictly closer than
# `dist_thresh_km` to the settlement's approximate centre.
dist_thresh_km = 10.0

# Same rows the original loop processed (it skipped rows with NumCommunities > 0)
unlabelled = filt_df[~(filt_df["NumCommunities"] > 0)]

for s in tqdm(unlabelled.itertuples(), total=len(unlabelled)):
    # geopy expects (latitude, longitude) order
    center = (s.approx_center_lat, s.approx_center_lon)

    # Track the best distance as a plain float in kilometres, so the threshold
    # and every candidate are compared in the same unit and type.
    min_dist_km = dist_thresh_km
    min_dist_data = None

    for p in pp_df.itertuples():
        d_km = distance(center, (p.LAT, p.LONG)).km
        if d_km < min_dist_km:
            min_dist_km = d_km
            min_dist_data = {"community": p.NAME, "source": "COD-PP", "id": p.Pcod, "id_field": "Pcod"}

    for o in osm_df.itertuples():
        # Point geometries store (x, y) = (lon, lat); swap to (lat, lon) for geopy
        lon, lat = o.geometry.coords[0][0], o.geometry.coords[0][1]
        d_km = distance(center, (lat, lon)).km
        if d_km < min_dist_km:
            min_dist_km = d_km
            min_dist_data = {"community": o.name, "source": "HOT-OSM", "id": o.osm_id, "id_field": "osm_id"}

    # Zero or one annotation: nothing is attached when no place is within the threshold
    contains = [] if min_dist_data is None else [min_dist_data]
    filt_df.at[s.Index, "Communities"] = contains
    filt_df.at[s.Index, "NumCommunities"] = len(contains)
    filt_df.at[s.Index, "CommunityNames"] = " / ".join(c["community"] for c in contains)

0it [00:00, ?it/s]

In [91]:
# Number of settlements still without a community label after the distance fallback
filt_df["NumCommunities"].lt(1).sum()

4

In [93]:
# Peek at the first three labelled settlements
filt_df.iloc[:3]

Unnamed: 0_level_0,mgrs_code,country,iso,type,population,pop_un_adj,adm0_pcode,adm1_name,adm1_pcode,adm2_name,adm2_pcode,settl_pcode,Shape__Area,Shape__Length,geometry,Communities,NumCommunities,CommunityNames,approx_center_lat,approx_center_lon
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
47262,30NXM4069_02,Ghana,GHA,Small Settlement Area,1741.494917,1637.835636,GH,Ashanti,GH02,Adansi Akrofuom,GH0201,GH0201000022,4.5e-05,0.031784,"POLYGON ((-1.73039 6.05879, -1.73003 6.05862, ...","[{'community': 'Amponyase', 'source': 'HOT-OSM...",1.0,Amponyase,6.055197,-1.733869
47264,30NXM5370_02,Ghana,GHA,Small Settlement Area,1139.194799,1071.38633,GH,Ashanti,GH02,Adansi Akrofuom,GH0201,GH0201000024,3e-05,0.026961,"POLYGON ((-1.61655 6.06406, -1.61575 6.06404, ...","[{'community': 'Mensakrom', 'source': 'COD-PP'...",1.0,Mensakrom,6.060373,-1.616806
47267,30NXM4472_01,Ghana,GHA,Small Settlement Area,1740.169384,1636.589024,GH,Ashanti,GH02,Adansi Akrofuom,GH0201,GH0201000027,4.5e-05,0.04321,"POLYGON ((-1.69786 6.09014, -1.69773 6.09010, ...","[{'community': 'Wuamasi', 'source': 'HOT-OSM',...",1.0,Wuamasi,6.08612,-1.698204


In [99]:
# How many settlements carry 0, 1, 2, ... community labels
communities_per_settlement = filt_df["NumCommunities"]
communities_per_settlement.value_counts()

NumCommunities
1.0     681
2.0     197
3.0      72
4.0      32
5.0      15
7.0       9
6.0       7
8.0       4
0.0       4
11.0      3
10.0      2
15.0      2
14.0      1
17.0      1
29.0      1
9.0       1
13.0      1
18.0      1
30.0      1
27.0      1
16.0      1
12.0      1
21.0      1
Name: count, dtype: int64

In [101]:
# Map the settlements that still have no community label
still_unlabelled_df = filt_df.loc[filt_df["NumCommunities"].lt(1)]
still_unlabelled_df.explore(color="magenta")