In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import os
from folium.plugins import MarkerCluster
from math import radians, sin, cos, acos

In [2]:
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

In [3]:
cols = ['id', 'admin1', 'admin2', 'name_full', 'name_short', 'type_original', 'type_cleaned', 
        'latitude', 'longitude', 'source']

In [4]:
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Cleaned data\MOZ"

# Examine the new unique points

New unique points are data points that are more than 100 meters away from PLM data points.

## Adding new ISS points

In [61]:
dataDir = r'C:\\Users\\DUANYUEYUN\\Documents\\ArcGIS\\Projects\\MOZ-cleaning_0729\\MOZ-cleaning_0729.gdb'

In [16]:
iss_new = gpd.read_file(dataDir, driver='FileGDB', 
                   layer= 'ISS_new')
plm = pd.read_csv(saveDir + '\PLM_cleaned_0729.csv')
# Uppercase first letter in admin1
plm['admin1'] = plm['admin1'].str.title()
# Remove whitespace in admin2
plm['admin2'] = plm['admin2'].str.strip()

In [18]:
print("Number of PLM points:", plm.shape[0])
print("Number of new ISS points:", iss_new.shape[0])

Number of PLM points: 1712
Number of new ISS points: 34


In [19]:
# combine PLM and ISS
df = pd.concat([iss_new[cols], plm[cols]], axis=0)

Consider rows with identical values in `admin1`, `admin2`, `name_short` and `type_cleaned` as duplicates.

In [22]:
duplicates = df[df.duplicated(subset=['admin1', 'admin2', 'name_short', 'type_cleaned'],
                keep=False)] \
.sort_values(['admin1', 'admin2', 'name_short', 'type_cleaned', 'source'])

In [24]:
print("Number of duplicates identified for now:",
     duplicates[duplicates.duplicated(subset=['admin1', 'admin2', 'name_short', 'type_cleaned'])] \
.sort_values(['admin1', 'admin2', 'name_short', 'type_cleaned']).shape[0])

Number of duplicates identified for now: 29


If the pair of duplicate points are within 1km from each other, consider them as duplicates. Otherwise they are considered distinct health facilities.

In [26]:
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    dist = 6371.01 * acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2)*cos(lon1 - lon2))
    return dist

In [27]:
distances = []
same_sources = []
dups = []
for i in range(0, 58, 2):
    row1 = duplicates.iloc[i,:]
    row2 = duplicates.iloc[i+1,:]
    if row1['source'] == row2['source']:
        same_sources.append(True)
        same_sources.append(True)
    else:
        same_sources.append(False)
        same_sources.append(False)
    
    p1 = (row1['longitude'], row1['latitude'])
    p2 = (row2['longitude'], row2['latitude'])
    dist = distance(p1,p2)
    distances.append(dist)
    distances.append(dist)
    
    if dist < 1:
        dups.append(True)
        dups.append(False)
    else:
        dups.append(False)
        dups.append(False)

In [28]:
duplicates['distance'] = distances
duplicates['same_source'] = same_sources
duplicates['duplicate'] = dups

In [29]:
duplicates

Unnamed: 0,id,admin1,admin2,name_full,name_short,type_original,type_cleaned,latitude,longitude,source,distance,same_source,duplicate
34,HF35,Cabo Delgado,Cidade De Pemba,CS Mahate,Mahate,CS,Centro de Saúde,-13.019118,40.53097,PLM,0.0,True,True
35,HF36,Cabo Delgado,Cidade De Pemba,CS Mahate,Mahate,CS,Centro de Saúde,-13.019118,40.53097,PLM,0.0,True,False
0,64044,Cabo Delgado,Metuge,CS MIEZE,Mieze,HEALTH_FACILITY,Centro de Saúde,-13.10724,40.453321,ISS,19.164728,False,False
62,HF63,Cabo Delgado,Metuge,CS Mieze,Mieze,CS,Centro de Saúde,-13.0592,40.2849,PLM,19.164728,False,False
1,62742,Cabo Delgado,Mocimboa Da Praia,CS NANDUADUA,Nanduadua,HEALTH_FACILITY,Centro de Saúde,-11.349539,40.351586,ISS,0.856147,False,True
70,HF71,Cabo Delgado,Mocimboa Da Praia,CS Nanduadua,Nanduadua,CS,Centro de Saúde,-11.350993,40.343966,PLM,0.856147,False,False
257,HF258,Gaza,Xai-Xai Distrito,CS Banhine,Banhine,CS,Centro de Saúde,-24.80964,33.37037,PLM,34.288372,True,False
258,HF259,Gaza,Xai-Xai Distrito,CS Banhine,Banhine,CS,Centro de Saúde,-25.1525,33.4853,PLM,34.288372,True,False
304,HF305,Inhambane,Homoine,CS Mafuiane,Mafuiane,CS,Centro de Saúde,-24.01252,35.163382,PLM,0.006533,True,True
305,HF306,Inhambane,Homoine,CS Mafuiane,Mafuiane,CS,Centro de Saúde,-24.01246,35.16335,PLM,0.006533,True,False


In [30]:
print("Number of duplicates to be dropped:", duplicates[duplicates['duplicate']].shape[0])

Number of duplicates to be dropped: 13


In [32]:
# Drop duplicates
plm_iss = pd.concat([df[~df.duplicated(subset=['admin1', 'admin2',
                          'name_short', 'type_cleaned'],
                keep=False)], duplicates[~duplicates['duplicate']][cols]],
          axis = 0)

In [34]:
print("Number of new ISS points added:",
     plm_iss[plm_iss['source']=='ISS'].shape[0])

Number of new ISS points added: 23


In [35]:
#plm_iss.to_csv(saveDir + '\plm_iss_0729.csv')

## Adding new WHO points

In [213]:
plm_iss = pd.read_csv(saveDir + '\plm_iss_0729.csv')

In [36]:
who_new = gpd.read_file(dataDir, driver='FileGDB', 
                   layer='WHO')
who_new = who_new[cols]
who_new['admin1'] = who_new['admin1'].str.replace('Zambézia', 'Zambezia')

In [62]:
print("Number of new WHO points:", who_new.shape[0])

Number of new WHO points: 726


In [38]:
# combine dataframes
df = pd.concat([plm_iss,who_new], axis=0)

In [40]:
df.head()

Unnamed: 0,id,admin1,admin2,name_full,name_short,type_original,type_cleaned,latitude,longitude,source
9,63664,Nampula,Distrito De Nampula,CS 25 DE SETEMBRO,25 De Setembro,HEALTH_FACILITY,Centro de Saúde,-15.112227,39.253085,ISS
11,63643,Nampula,Rapale,CS RAPALE,Rapale,HEALTH_FACILITY,Centro de Saúde,-15.02534,39.122963,ISS
12,63660,Sofala,Buzi,CS GUARA-GUARA,Guara-Guara,HEALTH_FACILITY,Centro de Saúde,-19.884094,34.593518,ISS
14,63659,Sofala,Nhamatanda,CS NHARCHONGA,Nharchonga,HEALTH_FACILITY,Centro de Saúde,-19.262732,34.193562,ISS
15,63291,Tete,Marara,CS MUFA CACONDE,Mufa Caconde,HEALTH_FACILITY,Centro de Saúde,-16.158024,33.261667,ISS


Since WHO dataset does not have information on `admin2`, consider rows with identical values in `admin1`, `name_short` and `type_cleaned` as duplicates.

In [41]:
duplicates = df[df.duplicated(subset=['admin1', 'name_short', 'type_cleaned'],
                keep=False)] \
.sort_values(['admin1', 'name_short', 'type_cleaned', 'source'])

In [42]:
duplicates.head(6)

Unnamed: 0,id,admin1,admin2,name_full,name_short,type_original,type_cleaned,latitude,longitude,source
129,HF130,Cabo Delgado,Quissanga,PS Bilibiza,Bilibiza,PS,Posto de Saúde,-12.5639,40.2853,PLM
0,49852,Cabo Delgado,,Bilibiza Posto de Saúde,Bilibiza,Posto de Saúde,Posto de Saúde,-13.5795,39.7356,WHO
33,HF34,Cabo Delgado,Cidade De Pemba,CS Chuiba,Chuiba,CS,Centro de Saúde,-13.0597,40.525,PLM
1,49862,Cabo Delgado,,Chuiba Centro de Saúde Urbano C,Chuiba,Centro de Saúde Urbano C,Centro de Saúde,-13.0225,40.5656,WHO
56,HF57,Cabo Delgado,Meluco,CS Imbada,Imbada,CS,Centro de Saúde,-12.543,39.644,PLM
2,49871,Cabo Delgado,,Imbada Centro de Saúde Rural II,Imbada,Centro de Saúde Rural II,Centro de Saúde,-12.3964,40.1075,WHO


In [44]:
uniques = duplicates[~duplicates.duplicated(subset=['admin1', 'name_short', 'type_cleaned'])]\
.sort_values(['admin1', 'name_short', 'type_cleaned'])

In [63]:
print("Number of duplicates identified for now:", uniques.shape[0])

Number of duplicates identified for now: 442


In [64]:
distances = []
dups = []
for idx, row in uniques.iterrows():
    test_df = duplicates[(duplicates['admin1'] == row['admin1']) & \
              (duplicates['name_short'] == row['name_short']) & \
              (duplicates['type_cleaned'] == row['type_cleaned'])]
    
    num = test_df.shape[0]
    
    if num == 2:
        row1 = test_df.iloc[0,:]
        row2 = test_df.iloc[1,:]
        p1 = (row1['longitude'], row1['latitude'])
        p2 = (row2['longitude'], row2['latitude'])
        dist = distance(p1,p2)
        distances.append(dist)
        distances.append(dist)
        
    if num == 3:
        row1 = test_df.iloc[0,:]
        row2 = test_df.iloc[1,:]
        row3 = test_df.iloc[2,:]
        p1 = (row1['longitude'], row1['latitude'])
        p2 = (row2['longitude'], row2['latitude'])
        p3 = (row3['longitude'], row3['latitude'])
        d12 = distance(p1, p2)
        d13 = distance(p1, p3)
        d23 = distance(p2, p3)
        distances.append(min(d12,d13))
        distances.append(min(d12,d23))
        distances.append(min(d23,d13))
duplicates['distance'] = distances

In [65]:
who_dups = duplicates[(duplicates['distance']<1) & (duplicates['source']=='WHO')]

In [66]:
print("Number of duplicates to be dropped:",
     who_dups.shape[0])

Number of duplicates to be dropped: 112


In [51]:
duplicates2 = pd.concat([duplicates,who_dups], axis=0)\
.drop_duplicates(keep=False)

In [53]:
final_df = pd.concat([df[~df.duplicated(subset=['admin1', 'name_short', 'type_cleaned'],
                keep=False)],duplicates2],axis=0)

In [60]:
#final_df.to_csv(saveDir + '/mfl_0729.csv')

In [71]:
print("Number of new data points added:", final_df[(final_df['source']=='WHO')| \
        (final_df['source']=='ISS')].shape[0])
print("Number of data points in total:", final_df.shape[0])

Number of new data points added: 637
Number of data points in total: 2347


In [58]:
# province breakdown of new points
final_df[(final_df['source']=='WHO')| \
        (final_df['source']=='ISS')]['admin1'].value_counts()

Sofala              182
Zambezia             98
Nampula              80
Niassa               48
Tete                 47
Maputo Provincia     45
Inhambane            42
Manica               30
Gaza                 30
Cabo Delgado         30
Maputo Cidade         5
Name: admin1, dtype: int64

In [59]:
# facility type breakdown of new points
final_df[(final_df['source']=='WHO')| \
        (final_df['source']=='ISS')]['type_cleaned'].value_counts()

Centro de Saúde        456
Posto de Saúde         163
Hospital Distrital       5
Hospital Rurais          5
Hospital Provincial      3
Hospital Central         2
Hospital Geral           2
Other                    1
Name: type_cleaned, dtype: int64

In [73]:
df = final_df[cols]
df.columns

Index(['id', 'admin1', 'admin2', 'name_full', 'name_short', 'type_original',
       'type_cleaned', 'latitude', 'longitude', 'source'],
      dtype='object')

In [74]:
#df.to_csv(saveDir + '/mfl_0729.csv')

In [75]:
df.shape

(2347, 10)

In [76]:
duplicates = df[df.duplicated(subset=['admin1', 'admin2', 'name_short', 'type_cleaned'],
                keep=False)] \
.sort_values(['admin1', 'admin2', 'name_short', 'type_cleaned', 'source'])

In [77]:
duplicates.shape

(36, 10)