In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import os
from folium.plugins import MarkerCluster
from math import radians, sin, cos, acos

In [62]:
# Notebook-wide pandas settings:
#  - show every column when a DataFrame is displayed
#  - turn off the chained-assignment (SettingWithCopy) warning
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [63]:
# data import
# Directory that holds the cleaned per-source facility lists.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
dataDir = r'C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\SLE\Cleaned Data\Overall'

# Paths are assembled with os.path.join rather than by appending '\\' by hand,
# so separator handling is left to the standard library.
filename = 'UNICEF DPPI 2016_cleaned_0814.csv'
unicef = pd.read_csv(os.path.join(dataDir, filename))

filename = 'sara_cleaned_0814.csv'
sara = pd.read_csv(os.path.join(dataDir, filename))

filename = 'ISS_cleaned_0814.csv'
iss = pd.read_csv(os.path.join(dataDir, filename))

In [64]:
# Collect every column name that appears in at least one of the three datasets
cols = set(unicef.columns) | set(sara.columns) | set(iss.columns)

In [65]:
# Give unicef every column in `cols`; columns it lacks are added filled with NA
absent_in_unicef = [name for name in cols if name not in unicef.columns]
for name in absent_in_unicef:
    unicef[name] = np.nan

In [66]:
# Give sara every column in `cols`; columns it lacks are added filled with NA
absent_in_sara = [name for name in cols if name not in sara.columns]
for name in absent_in_sara:
    sara[name] = np.nan

In [67]:
# Give iss every column in `cols`; columns it lacks are added filled with NA
absent_in_iss = [name for name in cols if name not in iss.columns]
for name in absent_in_iss:
    iss[name] = np.nan

In [68]:
# combine datasets
# Stack the three sources row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each source keeps its own row labels, so the combined
# frame would carry duplicate index labels.
combined = pd.concat([unicef, sara, iss], ignore_index=True)

In [71]:
# write csv file
# From here on dataDir points at the parent 'Cleaned Data' folder (outputs),
# not the 'Overall' sub-folder the inputs were read from.
dataDir = r'C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\SLE\Cleaned Data'

# Full combined list, including rows without coordinates; the index is not written.
# os.path.join replaces the hand-built '\\' separator.
combined.to_csv(os.path.join(dataDir, 'combined_list_full_0814.csv'),
                index=False)

In [72]:
# drop points with no coordinates
# Keep only rows where both Longitude and Latitude are present.
combined_dropped = combined.dropna(subset=['Longitude', 'Latitude'])

# Write the georeferenced subset next to the full list (index not written).
# os.path.join replaces the hand-built '\\' separator.
combined_dropped.to_csv(os.path.join(dataDir, 'combined_list_geo_0814.csv'),
                        index=False)

In [73]:
# Row count of the full combined list
n_points_all = len(combined)
print("Number of points in combined list:", n_points_all)

Number of points in combined list: 4097


In [74]:
# Row count after dropping rows that lack Longitude or Latitude
n_points_geo = len(combined_dropped)
print("Number of points in combined list with no missing coordinates:", n_points_geo)

Number of points in combined list with no missing coordinates: 3943


In [75]:
# add feature sequence
# Both layers are read from the same file geodatabase.
gdb_path = r'C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\SLE\SLE.gdb'

# 'combined_FindIdentical' is merged on IN_FID in a later cell to supply FEAT_SEQ.
seq = gpd.read_file(gdb_path, driver='FileGDB', layer='combined_FindIdentical')

# NOTE: this rebinds `combined` -- it is now the 'combined' layer read from the
# geodatabase, no longer the concatenated DataFrame built earlier in the notebook.
combined = gpd.read_file(gdb_path, driver='FileGDB', layer='combined')

In [76]:
# Key every row of `combined` by its 1-based row label so it can be matched to
# the IN_FID column of `seq`.
# (presumably IN_FID is the 1-based feature id of the 'combined' layer -- verify
# against the geodatabase.)
#
# The column is assigned directly instead of reset_index(inplace=True) + rename:
# that sequence is not re-runnable, because a second execution inserts another
# 'index' column and leaves two columns named 'IN_FID'. This form yields the same
# IN_FID values on every run (it assumes `combined` still has the default 0..n-1
# index it was read with, as the original did).
combined['IN_FID'] = combined.index + 1

# Inner join: rows of `combined` with no matching IN_FID in `seq` are dropped.
# seq's geometry column is removed first so it does not clash with combined's;
# IN_FID is only a join key and is not kept in the result.
df = (
    combined
    .merge(seq.drop(columns=['geometry']), on='IN_FID')
    .drop(columns=['IN_FID'])
)

In [77]:
# sort the dataframe
# FEAT_SEQ is the primary key, so rows sharing a feature sequence end up adjacent;
# the remaining keys order rows within each FEAT_SEQ.
sort_keys = ['FEAT_SEQ', 'District', 'Chiefdom', 'Facility_Name',
             'Facility_Type', 'Source']
df.sort_values(by=sort_keys, inplace=True)

In [83]:
# Save the sorted list with FEAT_SEQ attached (index not written).
# os.path.join replaces the hand-built '\\' separator.
df.to_csv(os.path.join(dataDir, 'combined_list_0814.csv'),
          index=False)

In [82]:
# Peek at the first three rows that share FEAT_SEQ 501
in_seq_501 = df['FEAT_SEQ'].eq(501)
df.loc[in_seq_501].head(3)

Unnamed: 0,Region,District,Chiefdom,Facility_Name,ID,Facility_Type,Latitude,Longitude,Altitude,GPS_accuracy,Functional_Status,Catchment_population,Number_of_villages_within_the_catchment_area,Mobile_Network_coverage,Short_Name,Source,Location_of_facility,facility_fp_acs_copy,cross_referenced,supervised_by_dsno,Accomodation_Provided,Urban_rural,Capacity_of_accommodation,Date,Waypoint_name,surveillance_fp_training,BeMOC_,priority_level,physiotherapy_unit,Number_of_housing_units,case_def_copy_copy1,Category,current_feed_back,Data_Source,Outpatient_only,Managing_authority,designation,supervised_by_who,facility_fp_acs,joint_supervision,geometry,FEAT_SEQ
3089,Northern,Koinadugu,Wara Wara Yagala,KABALA GOVERNMENT HOSPITAL,478.0,GENERAL_HOSP,9.586599,-11.548426,,,,,,,Kabala,ISS,,N,,1_MONTH,,,,2020-01-28,,1_2_YRS,,H,N,,AFP MEASLES YELLOW_FEVER CEREBROSPINAL_MENINGI...,,Y,,,,STOP_TEAM,1_MONTH,Y,Y,POINT (-11.54843 9.58660),501
3090,Northern,Koinadugu,Wara Wara Yagala,KABALA GOVERNMENTS HOSPITAL,473.0,GENERAL_HOSP,9.586608,-11.54838,,,,,,,Kabala s,ISS,,N,,NOT_IN_THE_PAST_1_YEAR,,,,2019-10-11,,MORE_THAN_2_YRS,,H,N,,AFP MEASLES YELLOW_FEVER CEREBROSPINAL_MENINGI...,,Y,,,,STOP_TEAM,1_MONTH,Y,Y,POINT (-11.54838 9.58661),501
3091,Northern,Koinadugu,Wara Wara Yagala,KABALA GOVT. HOSP MCH STATIC,57.0,GENERAL_HOSP,9.586994,-11.547792,,,,,,,Kabala Mch Static,ISS,,,,MORE_THAN_6_MONTHS,,,,2018-11-17,,1_2_YRS,,H,N,,AFP MEASLES YELLOW_FEVER NNT,,Y,,,,WCO_SURVEILLANCE_OFFICER,NOT_IN_THE_PAST_1_YEAR,Y,Y,POINT (-11.54779 9.58699),501


In [78]:
# Top 20 FEAT_SEQ groups by number of non-null IDs
(
    df.groupby('FEAT_SEQ')
    .agg(count=('ID', 'count'))
    .sort_values('count', ascending=False)
    .head(20)
)

Unnamed: 0_level_0,count
FEAT_SEQ,Unnamed: 1_level_1
1150,39
206,13
138,12
501,12
883,12
634,11
144,11
383,10
530,8
956,8


Findings:

- 39 points in the SARA dataset share the same feature sequence (FEAT_SEQ 1150)
  and have almost identical coordinates.

In [79]:
# Non-null ID count per FEAT_SEQ group, largest groups first.
# Result is a one-column DataFrame ('count') indexed by FEAT_SEQ.
counts = (
    df.groupby('FEAT_SEQ')['ID']
    .count()
    .to_frame('count')
    .sort_values('count', ascending=False)
)