In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import os
from folium.plugins import MarkerCluster
from math import radians, sin, cos, acos

In [2]:
# Notebook-wide pandas settings: turn off the chained-assignment warning and
# show every column when a DataFrame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.options.display.max_columns = None

In [3]:
# data import
# The data root defaults to the original local folder; set the environment
# variable GRID3_SLE_DATA_DIR to run the notebook against another location.
dataDir = os.environ.get(
    'GRID3_SLE_DATA_DIR',
    r'C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\SLE')
# All raw inputs live in the 'Original Data' sub-folder.
origDir = os.path.join(dataDir, 'Original Data')

# UNICEF consolidated list: sheet at position 2, columns B:O; the listed
# placeholder strings are read as missing values.
filename = 'Consolidated Facility List_ UNICEF DPPI 2016_deidentified.xlsx'
unicef = pd.read_excel(os.path.join(origDir, filename), sheet_name=2,
                       usecols='B:O',
                       na_values=['#N/A', 'Missing', 'Not available',
                                  'Not yet determined', "No Data"])

# DTSI health facilities layer from the file geodatabase.
dtsi = gpd.read_file(os.path.join(origDir, 'SLE_Health_facilities.gdb'),
                     driver='FileGDB', layer='SLE_DTSI_health_facilities')

# SARA draft master facility list: first sheet; 'Unknown' and 'survey error'
# are read as missing values.
filename = 'Draft Master Facility List _ SARA WHO DPPI 20170706.xls'
sara = pd.read_excel(os.path.join(origDir, filename), sheet_name=0,
                     na_values=['Unknown', 'survey error'])

# DPPI March 2020 facility list.
filename = 'Updated list of facilities _ DPPI Mar 2020.xls'
mfl = pd.read_excel(os.path.join(origDir, filename))

# FMFL with GPS coordinates.
filename = 'FMFL with GPS Coordinates.csv'
fmfl = pd.read_csv(os.path.join(origDir, filename))

In [4]:
# Per-dataset lookup tables: which column holds the facility type and which
# holds the facility name.
type_dict = dict(
    unicef='FACILITY_TYPE',
    sara='Facility Type',
    mfl='Facility Type',
    fmfl='Facility Type',
)
name_dict = dict(
    unicef='FACILITY_NAME',
    sara='Facility name (Cleaned)',
    mfl='Facility Name',
    fmfl='FacilityName',
)

# Consolidated Facility List_ UNICEF DPPI 2016_deidentified

## 1. List of fields

In [5]:
# List the column names of the UNICEF consolidated facility list.
print("The fields are", unicef.columns)

The fields are Index(['Region', 'District', 'Chiefdom', 'FACILITY_NAME', 'FACILITY_ID',
       'FACILITY_TYPE', 'Latitude', 'Longitude', 'Altitude', 'GPS accuracy',
       'Functional Status', 'Catchment population',
       'Number of villages within the catchment area',
       'Mobile Network coverage'],
      dtype='object')


- `Region`: Eastern, Northern, Southern, Western
- `District`: admin 1 name
- `Chiefdom`: admin 2 name
- `FACILITY_NAME`: facility name
- `FACILITY_ID`: facility id
- `FACILITY_TYPE`: facility type
- `Latitude`
- `Longitude`
- `Altitude`
- `GPS accuracy`: a score that ranges from 3.75 to 15
- `Functional Status`: values include Functioning, Not reporting to DHMT, Don't know, Not Functioning
- `Catchment population`: population within the catchment area
- `Number of villages within the catchment area`
- `Mobile Network coverage`: Yes/No

In [6]:
# Title-case the coverage flag so entries agree in upper/lower case
unicef = unicef.assign(**{
    'Mobile Network coverage': unicef['Mobile Network coverage'].str.title(),
})

## 2. Missing values

In [7]:
print("NA values by column")
# isna().mean() is the share of missing rows per column. Scaling to percent
# before rounding avoids float noise such as 5.1000000000000005, which the
# previous round(..., 3) * 100 could produce.
pd.DataFrame({'count': unicef.isna().sum(),
              'percentage': (unicef.isna().mean() * 100).round(1)})

NA values by column


Unnamed: 0,count,percentage
Region,0,0.0
District,0,0.0
Chiefdom,12,0.9
FACILITY_NAME,0,0.0
FACILITY_ID,1,0.1
FACILITY_TYPE,2,0.2
Latitude,66,5.1
Longitude,66,5.1
Altitude,66,5.1
GPS accuracy,66,5.1


## 3. Number of data points

In [8]:
# Number of rows in the UNICEF list
print("Number of data points collected:", len(unicef))

Number of data points collected: 1290


## 4. Facility types

Since there are facilities that are assigned the type 'CLINIC' and others assigned the type 'Clinic', uppercase everything to ensure consistency.

In [9]:
# Upper-case every facility type so 'Clinic' and 'CLINIC' become one category
unicef = unicef.assign(**{
    type_dict['unicef']: unicef[type_dict['unicef']].str.upper(),
})

In [10]:
# Distinct (non-null) facility types, followed by the frequency of each
print('Dataset: UNICEF')
print("Number of unique facility types:",
      len(unicef[type_dict['unicef']].dropna().unique()))
unicef[type_dict['unicef']].value_counts()

Dataset: UNICEF
Number of unique facility types: 5


MCHP        589
CHP         388
CHC         234
HOSPITAL     43
CLINIC       34
Name: FACILITY_TYPE, dtype: int64

## 5. Facility names

### 5.1 How many of the facility names are empty (null or blank text)?

In [11]:
# Title-case each facility name, then trim leading/trailing whitespace
unicef[name_dict['unicef']] = (
    unicef[name_dict['unicef']]
    .str.title()
    .str.strip()
)

In [12]:
# Count facility names that are null
print("Number of null values:", unicef[name_dict['unicef']].isnull().sum())

Number of null values: 0


In [13]:
# Count facility names that are the empty string
print("Number of no text values:", (unicef[name_dict['unicef']] == "").sum())

Number of no text values: 0


### 5.2 Name cleaning

In [14]:
# Facility-type words and phrases that are stripped from facility names.
# The list is later joined with '|' into a regex alternation, so the order
# is significant: longer phrases come before the shorter ones they contain.
common_types = (
    # abbreviations
    ['CHC', 'MCHP', 'CHP']
    # hospital variants, then the bare words
    + ['Referral Hospital', 'Government Hospital', 'Mission Hospital',
       'Mis Hospital', 'Memorial Hospital', 'Police Hospital',
       'Community Hospital', 'Military Hospital', 'Government',
       'Industrial Hospital', 'Hospital']
    # health centres and posts
    + ['Community Health Post', 'Community Health Centre',
       'Health Centre', 'Health Post']
    # clinics and specialties
    + ['Memorial Clinic', 'Police Clinic', 'Clinic', 'Dental', 'Mental']
)

In [15]:
# Columns to display when spot-checking UNICEF rows (the check itself is
# commented out below).
cols_show = ['District', 'Chiefdom', 'FACILITY_NAME', 'FACILITY_TYPE']
#unicef[unicef['FACILITY_TYPE']=='HOSPITAL'][cols_show].sample(10)

In [16]:
# Build a facility-type-free short name:
#   1. '_', '-', ',' and '.' become spaces (literal replacements)
#   2. three-space and then two-space runs are reduced to one space
#   3. abbreviations are expanded (Hosp, Gov, Govt, Underfive[s])
#   4. the name is title-cased and every phrase in common_types is deleted
# `regex` is passed explicitly on every call: the default of str.replace
# changed from True to False in pandas 2.0, which would silently turn the
# patterns below into literal text that never matches.
unicef['name_short'] = (
    unicef[name_dict['unicef']]
    .str.replace('_', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.replace('.', ' ', regex=False)
    .str.replace("   ", " ", regex=False)
    .str.replace("  ", " ", regex=False)
    .str.replace(' Hosp | Hosp$', ' Hospital ', case=False, regex=True)
    .str.replace(' Gov ', ' Government ', case=False, regex=True)
    .str.replace(' Govt | Govt$', ' Government ', case=False, regex=True)
    # 'underfives?' also consumes the plural; the former alternation
    # 'underfive|underfives' always stopped at 'underfive' and left a stray 's'
    .str.replace('underfives?', ' Under Five', case=False, regex=True)
    .str.title()
    .str.replace('|'.join(common_types), '', regex=True, case=False)
    .str.strip()
)

In [35]:
# Same spot-check columns as before, plus the derived short name (the check
# itself is commented out below).
cols_show = ['District', 'Chiefdom', 'FACILITY_NAME', 'FACILITY_TYPE', 'name_short']
#unicef[unicef['FACILITY_NAME'].str.contains('-')][cols_show].sample(10)

## 6. Drop identical points that overlap with SARA and FMFL

Note: there are two facilities with the same ID 25116151, one of which is an identical data point that overlaps with other datasets. Only the identical point is dropped. 

In [19]:
# Load the list of identical (overlapping) points from the 'Identical Points' folder.
overlaps = pd.read_csv(dataDir + '\\Identical Points\\overlap_points_0810.csv')

In [20]:
# Keep only the overlap records that come from the consolidated facility list
overlaps = overlaps.loc[overlaps['Source'].eq('Consolidated Facility List')]

In [21]:
# drop identical points: a row is kept unless its ID is listed in `overlaps`
# AND it has a latitude (rows without coordinates are always kept)
unicef = unicef[~unicef['FACILITY_ID'].isin(list(overlaps['ID'])) | unicef['Latitude'].isna()]

In [22]:
# Switch to the column names shared by the cleaned outputs
unicef = unicef.rename(columns={
    'FACILITY_NAME': 'Facility Name',
    'FACILITY_ID': 'ID',
    'FACILITY_TYPE': 'Facility Type',
    'name_short': 'Short Name',
})

In [23]:
# Tag every row with the dataset it came from
unicef = unicef.assign(Source='Consolidated Facility List')

In [24]:
# cleaning other text columns: trim District; title-case and trim Chiefdom
unicef = unicef.assign(
    District=unicef['District'].str.strip(),
    Chiefdom=unicef['Chiefdom'].str.title().str.strip(),
)

In [25]:
# Coordinate pairs shared by more than one facility, largest groups first.
# 'size' counts rows per coordinate pair; the previous count of 'ID' skipped
# rows whose ID is missing and could under-report a group. Rows without
# coordinates are left out by groupby.
duplicates = unicef.groupby(['Longitude', 'Latitude']).agg(count=('ID', 'size'))\
.sort_values('count', ascending=False)
duplicates = duplicates[duplicates['count'] > 1]

In [26]:
# Attach the per-coordinate count to the facility rows; the inner join keeps
# only facilities located at a shared coordinate pair
duplicates = pd.merge(unicef, duplicates,
                      left_on=['Longitude', 'Latitude'], right_index=True)

In [31]:
# (rows, columns) of the table of facilities that share coordinates.
duplicates.shape

(108, 17)

In [28]:
# Preview the first rows of the shared-coordinate table.
duplicates.head()

Unnamed: 0,Region,District,Chiefdom,Facility Name,ID,Facility Type,Latitude,Longitude,Altitude,GPS accuracy,Functional Status,Catchment population,Number of villages within the catchment area,Mobile Network coverage,Short Name,Source,count
34,Eastern,Kailahun,Luawa,Ngeihun,11065091.0,CHP,8.201045,-10.647767,176.300003,4.9,Functioning,8387.0,13.0,No,Ngeihun,Consolidated Facility List,3
137,Eastern,Kenema,Lower Bambara,Ngeihun,12084031.0,CHC,8.201045,-10.647767,176.300003,4.9,Functioning,6269.0,,Yes,Ngeihun,Consolidated Facility List,3
1021,Southern,Moyamba,Kagboro,Ngeihun,33076041.0,MCHP,8.201045,-10.647767,176.300003,4.9,Functioning,1701.0,19.0,Yes,Ngeihun,Consolidated Facility List,3
40,Eastern,Kailahun,Luawa,Kpandebu,11066021.0,MCHP,8.246528,-10.516489,323.150009,4.95,Functioning,2399.0,4.0,Yes,Kpandebu,Consolidated Facility List,3
88,Eastern,Kenema,Dama,Kpandebu Chc,12014011.0,CHC,8.246528,-10.516489,323.150009,4.95,Functioning,5469.0,,No,Kpandebu,Consolidated Facility List,3


In [66]:
# Preview the first rows of the cleaned UNICEF table.
unicef.head()

Unnamed: 0,Region,District,Chiefdom,Facility Name,ID,Facility Type,Latitude,Longitude,Altitude,GPS accuracy,Functional Status,Catchment population,Number of villages within the catchment area,Mobile Network coverage,Short Name,Source
0,Eastern,Kailahun,Dea,Baiwala,11014011.0,CHC,7.995473,-10.647431,196.599998,4.9,Functioning,5990.0,18.0,No,Baiwala,Consolidated Facility List
1,Eastern,Kailahun,Dea,Ngebana,11015011.0,CHP,7.928242,-10.674175,216.75,5.0,Functioning,2907.0,10.0,Yes,Ngebana,Consolidated Facility List
2,Eastern,Kailahun,Dea,Sienga,11016011.0,MCHP,7.938098,-10.628222,207.300003,4.9,Functioning,2640.0,11.0,No,Sienga,Consolidated Facility List
3,Eastern,Kailahun,Jawie,Services Clinic (Military),11023011.0,CLINIC,,,,,,,,,Services (Military),Consolidated Facility List
4,Eastern,Kailahun,Jawie,Daru,11024011.0,CHC,7.990132,-10.84405,200.699997,5.0,Functioning,17288.0,13.0,Yes,Daru,Consolidated Facility List


In [32]:
# Write the cleaned UNICEF table, then the shared-coordinate rows without
# the helper 'count' column
unicef.to_csv(
    dataDir + "\\Cleaned Data\\Overall\\UNICEF DPPI 2016_cleaned_0814.csv",
    index=False,
)
duplicates.drop('count', axis=1).to_csv(
    dataDir + "\\Cleaned Data\\Duplicates\\UNICEF DPPI 2016_duplicates_0814.csv",
    index=False,
)

In [33]:
# Keep only rows that have both a latitude and a longitude, then export them
unicef_geo = unicef[unicef['Latitude'].notna() & unicef['Longitude'].notna()]
unicef_geo.to_csv(
    dataDir + "\\Cleaned Data\\No missing coordinates\\UNICEF DPPI 2016_geo_0814.csv",
    index=False,
)

# Draft Master Facility List _ SARA WHO DPPI 20170706

## 1. List of fields

In [36]:
# List the column names of the SARA draft master facility list.
print("The fields are", sara.columns)

The fields are Index(['Facility name (Cleaned)', 'New Fac name', 'Facility name in SARA',
       'Facility name in payroll verification', 'District', 'Region',
       'Facility Type', 'Location of facility', 'BeMOC?', 'Managing authority',
       'Urban/rural', 'Outpatient only', 'Waypoint name', 'Altitude',
       'Latitude N/S', 'Latitude degrees', 'Longitude E/W',
       'Longitude degrees', 'Accomodation provided for health facility',
       'Capacity of accommodation', 'Number of housing units',
       'Data source \n(If in SARA, used SARA data. If not, used payroll data)',
       'Category', 'cross ref'd in FHCI MFL?'],
      dtype='object')


- Columns concerning facility name:
    - `Facility name (Cleaned)`
    - `New Fac name`
    - `Facility name in SARA`
    - `Facility name in payroll verification`
- `District`: admin1 name
- `Region`: values include South, North, East, Western
- `Facility Type`
- `Location of facility`
- `BeMOC?`: either Yes or NA
- `Managing authority`: values include Government/Public, NGO/Not-For-Profit, Private-For-Profit, Mission/Faith-Based
- `Urban/rural`: values include Urban, Rural
- `Outpatient only`: values include 1 or 2.
- `Waypoint name`: ?
- `Altitude`
- `Latitude N/S`: values include 1 or 2.
- `Latitude degrees`
- `Longitude E/W`: values include 1 or 2.
- `Longitude degrees`
- `Accomodation provided for health facility`: values include Yes; not observed, Yes; observed, No, survey error.
- `Capacity of accommodation`: some values are survey error.
- `Number of housing units`: some values are survey error.
- `Data source \n(If in SARA, used SARA data. If not, used payroll data)`: SARA, Payroll audit, MOHS Aug Validation
- `Category`: how the cleaned facility name is obtained. 7 unique values.
- `cross ref'd in FHCI MFL?`: yes/no

## 2. Missing values

Columns `Capacity of accommodation` and `Number of housing units` contain some `survey error` entries. Those entries are treated as missing here.

In [37]:
print("NA values by column")
# isna().mean() is the share of missing rows per column. Scaling to percent
# before rounding avoids float noise such as 5.1000000000000005, which the
# previous round(..., 3) * 100 could produce.
pd.DataFrame({'count': sara.isna().sum(),
              'percentage': (sara.isna().mean() * 100).round(1)})

NA values by column


Unnamed: 0,count,percentage
Facility name (Cleaned),0,0.0
New Fac name,0,0.0
Facility name in SARA,113,8.2
Facility name in payroll verification,63,4.6
District,0,0.0
Region,0,0.0
Facility Type,11,0.8
Location of facility,1,0.1
BeMOC?,1306,95.1
Managing authority,14,1.0


## 3. Number of data points

In [38]:
# Number of rows in the SARA list
print("Number of data points collected:", len(sara))

Number of data points collected: 1373


## 4. Facility types

In [39]:
# Upper-case the facility types, then report the distinct (non-null) count
# and the frequency of each type
sara = sara.assign(**{
    type_dict['sara']: sara[type_dict['sara']].str.upper(),
})
print('Dataset: SARA')
print("Number of unique facility types:",
      len(sara[type_dict['sara']].dropna().unique()))
sara[type_dict['sara']].value_counts()

Dataset: SARA
Number of unique facility types: 7


MCHP                 649
CHP                  344
CHC                  234
CLINIC                70
HOSPITAL              63
OTHER - SPECIALTY      1
OTHER                  1
Name: Facility Type, dtype: int64

## 5. Facility names

### 5.1 How many of the facility names are empty (null or blank text)?

In [40]:
# Trim the cleaned facility names, then count the null ones
sara = sara.assign(**{
    name_dict['sara']: sara[name_dict['sara']].str.strip(),
})
print("Number of null values:", sara[name_dict['sara']].isnull().sum())

Number of null values: 0


In [41]:
# Count facility names that are the empty string
print("Number of no text values:", (sara[name_dict['sara']] == "").sum())

Number of no text values: 0


### 5.2 Name cleaning

In [84]:
# Columns to display when spot-checking SARA rows (the check itself is
# commented out below).
cols_show = ['District', 'Facility name (Cleaned)', 'Facility Type']
#sara[sara['Facility Type']=='Other - Specialty'][cols_show]

In [42]:
# Build a facility-type-free short name:
#   1. '_', '-', ',' and '.' become spaces (literal replacements)
#   2. three-space and then two-space runs are reduced to one space
#   3. abbreviations are expanded (Hosp, Gov, Govt, Underfive[s])
#   4. the name is title-cased and every phrase in common_types is deleted
# `regex` is passed explicitly on every call: the default of str.replace
# changed from True to False in pandas 2.0, which would silently turn the
# patterns below into literal text that never matches.
sara['Short Name'] = (
    sara[name_dict['sara']]
    .str.replace('_', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.replace('.', ' ', regex=False)
    .str.replace("   ", " ", regex=False)
    .str.replace("  ", " ", regex=False)
    .str.replace(' Hosp | Hosp$', ' Hospital ', case=False, regex=True)
    .str.replace(' Gov ', ' Government ', case=False, regex=True)
    .str.replace(' Govt | Govt$', ' Government ', case=False, regex=True)
    # 'underfives?' also consumes the plural; the former alternation
    # 'underfive|underfives' always stopped at 'underfive' and left a stray 's'
    .str.replace('underfives?', ' Under Five', case=False, regex=True)
    .str.title()
    .str.replace('|'.join(common_types), '', regex=True, case=False)
    .str.strip()
)

In [45]:
cols_show = ['District', 'Facility name (Cleaned)', 'Facility Type', 'Short Name']
# Spot-check ten hospitals/clinics. random_state pins the sample so the
# displayed rows are the same on every run.
sara[sara['Facility Type'].isin(['HOSPITAL', 'CLINIC'])][cols_show].sample(10, random_state=0)

Unnamed: 0,District,Facility name (Cleaned),Facility Type,Short Name
1305,Western Area Urban,Goderich Emergency Surgical Center,HOSPITAL,Goderich Emergency Surgical Center
146,Bombali,Amara Hawa Community Clinic,CLINIC,Amara Hawa Community
692,Koinadugu,SLRC Clinic Sengbeh,CLINIC,Slrc Sengbeh
20,Bo,Dougountoni Hospital,HOSPITAL,Dougountoni
1372,Western Area Urban,Youyi Building Clinic,CLINIC,Youyi Building
259,Bombali,Stocco CHP,CLINIC,Stocco
996,Port Loko,Modia General Clinic,CLINIC,Modia General
697,Koinadugu,Wara Wara Community Faith Clinic,CLINIC,Wara Wara Community Faith
789,Kono,Under Five Clinic Kono,CLINIC,Under Five Kono
227,Bombali,Manonkoh,CLINIC,Manonkoh


## 6. Fix location

In [46]:
# Expose the row index as an 'index' column; it becomes the facility ID below.
# Guarded so that re-running the cell does not reset the index a second time
# (which would add another index column instead of reusing 'index').
if 'index' not in sara.columns:
    sara = sara.reset_index()

# Select and order the columns kept in the cleaned table. .copy() makes
# sara_cleaned independent of sara, so later column assignments touch this
# frame only.
sara_cleaned = sara[['Region', 'District', 'index', 'Facility name (Cleaned)', 'Short Name', 
                     'Facility Type', 'Location of facility', 'Latitude degrees', 'Longitude degrees', 'Altitude',
                     'BeMOC?', 'Managing authority', 'Urban/rural', 'Outpatient only', 
                     'Waypoint name', 'Accomodation provided for health facility',
                     'Capacity of accommodation', 'Number of housing units',
                     'Data source \n(If in SARA, used SARA data. If not, used payroll data)',
                     'Category', "cross ref'd in FHCI MFL?"]].copy()

# Shorter column names for the output.
sara_cleaned = sara_cleaned.rename(columns={
    'Facility name (Cleaned)': 'Facility Name',
    'Data source \n(If in SARA, used SARA data. If not, used payroll data)': 'Data Source',
    'Accomodation provided for health facility': 'Accomodation Provided',
    'index': 'ID',
    "cross ref'd in FHCI MFL?": 'cross referenced',
})

In [47]:
# Columns to display when inspecting the raw SARA coordinate fields (the
# check itself is commented out below).
cols_show = ['District', 'Facility name (Cleaned)', 'Facility Type', 'Latitude N/S',
            'Latitude degrees', 'Longitude E/W', 'Longitude degrees']
#sara2[sara2['Latitude N/S']==1][cols_show].sample(10)

In [48]:
# Bounding box of the UNICEF coordinates; used below as the plausible
# latitude/longitude range when repairing SARA coordinates
min_lat, max_lat = unicef['Latitude'].min(), unicef['Latitude'].max()
min_lon, max_lon = unicef['Longitude'].min(), unicef['Longitude'].max()

In [49]:
def _in_box(lat, lon):
    """Return True when (lat, lon) lies strictly inside the UNICEF bounding box."""
    return min_lat < lat < max_lat and min_lon < lon < max_lon


def _repair_coords(lat, lon):
    """Return a (latitude, longitude) pair for one SARA row.

    Both missing or both zero -> (NaN, NaN). Otherwise the first of these
    candidates that falls inside the UNICEF bounding box is returned:
    as recorded, longitude negated, lat/lon swapped, swapped with the
    recorded latitude negated. If none fits, the recorded pair is returned.
    """
    # both NA, or both 0: treat the coordinates as missing
    if np.isnan(lon) and np.isnan(lat):
        return np.nan, np.nan
    if lon == 0 and lat == 0:
        return np.nan, np.nan
    candidates = (
        (lat, lon),    # as recorded
        (lat, -lon),   # longitude sign flipped
        (lon, lat),    # latitude and longitude swapped
        (lon, -lat),   # swapped, recorded latitude negated
    )
    for cand_lat, cand_lon in candidates:
        if _in_box(cand_lat, cand_lon):
            return cand_lat, cand_lon
    # none of the candidates fits: keep the recorded values
    return lat, lon


fixed_pairs = [
    _repair_coords(raw_lat, raw_lon)
    for raw_lat, raw_lon in zip(sara_cleaned['Latitude degrees'],
                                sara_cleaned['Longitude degrees'])
]
lats = [pair[0] for pair in fixed_pairs]
lons = [pair[1] for pair in fixed_pairs]

In [50]:
# Store the repaired coordinates and drop the raw coordinate columns
sara_cleaned = (
    sara_cleaned
    .assign(Latitude=lats, Longitude=lons)
    .drop(columns=['Latitude degrees', 'Longitude degrees'])
)

In [51]:
# Rows whose repaired coordinates still fall outside the UNICEF bounding box
wrong_pts = sara_cleaned[
    sara_cleaned['Latitude'].lt(min_lat)
    | sara_cleaned['Latitude'].gt(max_lat)
    | sara_cleaned['Longitude'].lt(min_lon)
    | sara_cleaned['Longitude'].gt(max_lon)
]

In [52]:
# How many points remain outside the bounding box
print("Number of points that are still off:", len(wrong_pts))

Number of points that are still off: 16


In [53]:
# Tag every row with the dataset it came from
sara_cleaned = sara_cleaned.assign(Source='SARA')

In [54]:
# Region column: expand the short labels to the full region names.
region_dict = {'South':'Southern', 'North':'Northern', 
               'East':'Eastern', 'Western':'Western'}
# Values that are not keys are passed through unchanged, so re-running this
# cell on already-expanded names is harmless (a plain dict lookup raised
# KeyError on 'Southern' etc.).
sara_cleaned['Region'] = sara_cleaned['Region'].map(lambda reg: region_dict.get(reg, reg))
# Still fail loudly on anything that is not a known region (including NaN).
unexpected_regions = set(sara_cleaned['Region'].astype(str)) - set(region_dict.values())
if unexpected_regions:
    raise KeyError(f"Unrecognised Region values: {sorted(unexpected_regions)}")

In [56]:
#sara_cleaned[sara_cleaned.duplicated(subset=['Region', 'District', 
#                         'Short Name', 'Facility Type'], keep = False)]

In [57]:
# Rows with coordinates whose (Longitude, Latitude) pair occurs more than
# once, ordered so that co-located rows sit next to each other
duplicates = sara_cleaned[
    sara_cleaned['Longitude'].notna()
    & sara_cleaned.duplicated(subset=['Longitude', 'Latitude'], keep=False)
].sort_values(['Longitude', 'Latitude'])

In [59]:
# Write the cleaned SARA table and the shared-coordinate rows
sara_cleaned.to_csv(
    path_or_buf=dataDir + "\\Cleaned Data\\Overall\\sara_cleaned_0814.csv",
    index=False,
)
duplicates.to_csv(
    path_or_buf=dataDir + "\\Cleaned Data\\Duplicates\\sara_duplicates_0814.csv",
    index=False,
)

In [60]:
# Number of rows that share coordinates with another row
print("Number of duplicates:", len(duplicates))

Number of duplicates: 61


In [61]:
# Keep only rows that have both a latitude and a longitude, then export them
sara_geo = sara_cleaned[sara_cleaned['Latitude'].notna() & sara_cleaned['Longitude'].notna()]
sara_geo.to_csv(
    dataDir + "\\Cleaned Data\\No missing coordinates\\sara_geo_0814.csv",
    index=False,
)

In [62]:
#wrong_pts[['Facility name (Cleaned)', 'District', 'Latitude degrees',
#          'Longitude degrees', 'Latitude', 'Longitude']]

# Updated list of facilities _ DPPI Mar 2020

## 1. List of fields

In [65]:
# List the column names of the DPPI March 2020 facility list.
print("The fields are", mfl.columns)

The fields are Index(['#', 'District', 'Chiefdom', 'Facility Name', 'Facility Type',
       'Council'],
      dtype='object')


- `#`: row number
- `District`: admin 1 name
- `Chiefdom`: admin 2 name
- `Facility Name`
- `Facility Type`
- `Council`

## 2. Missing values

In [66]:
print("NA values by column")
# isna().mean() is the share of missing rows per column. Scaling to percent
# before rounding avoids float noise such as 5.1000000000000005, which the
# previous round(..., 3) * 100 could produce.
pd.DataFrame({'count': mfl.isna().sum(),
              'percentage': (mfl.isna().mean() * 100).round(1)})

NA values by column


Unnamed: 0,count,percentage
#,0,0.0
District,4,0.3
Chiefdom,1,0.1
Facility Name,1,0.1
Facility Type,1,0.1
Council,1,0.1


## 3. Number of data points

In [67]:
# Number of rows in the DPPI March 2020 list
print("Number of data points collected:", len(mfl))

Number of data points collected: 1394


## 4. Facility types

In [68]:
# some corrections: trim whitespace and harmonise two spelling variants
# (plain-text replacements, no regex needed)
mfl[type_dict['mfl']] = (
    mfl[type_dict['mfl']]
    .str.strip()
    .str.replace('CLINIC', 'Clinic', regex=False)
    .str.replace('CHp', 'CHP', regex=False)
)

In [69]:
# Upper-case every facility type
mfl = mfl.assign(**{
    type_dict['mfl']: mfl[type_dict['mfl']].str.upper(),
})

In [70]:
# Distinct (non-null) facility types, followed by the frequency of each
print('Dataset: MFL')
print("Number of unique facility types:",
      len(mfl[type_dict['mfl']].dropna().unique()))
mfl[type_dict['mfl']].value_counts()

Dataset: MFL
Number of unique facility types: 12


MCHP                   623
CHP                    412
CHC                    240
CLINIC                  67
HOSPITAL                43
BEMONC - CHC             2
MAMAGEWOH                1
INDUSTRIAL HOSPITAL      1
DUNAMOR                  1
MISSION HOSPITAL         1
REFERRAL HOSPITAL        1
GOVERNMENT HOSPITAL      1
Name: Facility Type, dtype: int64

## 5. Facility names

### 5.1 How many of the facility names are empty (null or blank text)?

In [71]:
# Count facility names that are null
print("Number of null values:", mfl[name_dict['mfl']].isnull().sum())

Number of null values: 1


In [72]:
# Count facility names that are the empty string
print("Number of no text values:", (mfl[name_dict['mfl']] == "").sum())

Number of no text values: 0


### 5.2 Name cleaning

In [73]:
# Build a facility-type-free short name:
#   1. '_', '-', ',' and '.' become spaces (literal replacements)
#   2. three-space and then two-space runs are reduced to one space
#   3. abbreviations are expanded (Hosp, Gov, Govt, Underfive[s])
#   4. the name is title-cased and every phrase in common_types is deleted
# `regex` is passed explicitly on every call: the default of str.replace
# changed from True to False in pandas 2.0, which would silently turn the
# patterns below into literal text that never matches.
mfl['name_short'] = (
    mfl[name_dict['mfl']]
    .str.replace('_', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.replace('.', ' ', regex=False)
    .str.replace("   ", " ", regex=False)
    .str.replace("  ", " ", regex=False)
    .str.replace(' Hosp | Hosp$', ' Hospital ', case=False, regex=True)
    .str.replace(' Gov ', ' Government ', case=False, regex=True)
    .str.replace(' Govt | Govt$', ' Government ', case=False, regex=True)
    # 'underfives?' also consumes the plural; the former alternation
    # 'underfive|underfives' always stopped at 'underfive' and left a stray 's'
    .str.replace('underfives?', ' Under Five', case=False, regex=True)
    .str.title()
    .str.replace('|'.join(common_types), '', regex=True, case=False)
    .str.strip()
)

In [76]:
#mfl.sample(10)

In [77]:
# Select the output columns, rename the row number and short name, and tag
# every row with its source
mfl_cleaned = (
    mfl[['#', 'District', 'Chiefdom', 'Facility Name',
         'name_short', 'Facility Type', 'Council']]
    .rename(columns={'#': 'ID', 'name_short': 'Short Name'})
    .assign(Source='MFL DPPI Mar 2020')
)

In [78]:
# Remove the word 'district' (any case) from District, then trim whitespace
mfl_cleaned = mfl_cleaned.assign(
    District=mfl_cleaned['District'].str.replace('district', '', case=False).str.strip()
)

In [79]:
# Normalise chiefdom names: remove 'chiefdom' and 'area' (any case), spell
# the digits 1/2/3 as Roman numerals, trim whitespace and title-case.
# NOTE(review): .str.title() runs last, so 'II'/'III' come out as 'Ii'/'Iii'.
# Confirm this is intended before changing it (the UNICEF Chiefdom column is
# also title-cased).
# NOTE(review): 'area' is removed wherever it occurs, including inside a
# longer word; multi-digit numbers are converted digit by digit.
mfl_cleaned['Chiefdom'] = mfl_cleaned['Chiefdom'].str.replace('chiefdom','',case=False)\
.str.replace('area','',case=False)\
.str.replace('1','I')\
.str.replace('2','II')\
.str.replace('3','III')\
.str.strip().str.title()

In [80]:
# Preview the first rows of the cleaned DPPI 2020 table.
mfl_cleaned.head()

Unnamed: 0,ID,District,Chiefdom,Facility Name,Short Name,Facility Type,Council,Source
0,1,Tonkolili,Polie,Mabarr Polie,Mabarr Polie,MCHP,Tonkolili District Council,MFL DPPI Mar 2020
1,2,Tonkolili,Yele,Mafay,Mafay,MCHP,Tonkolili District Council,MFL DPPI Mar 2020
2,3,Tonkolili,Mayepoi,Makonkorie,Makonkorie,CHP,Tonkolili District Council,MFL DPPI Mar 2020
3,4,Tonkolili,Yele,Manowo,Manowo,CHC,Tonkolili District Council,MFL DPPI Mar 2020
4,5,Tonkolili,Masankong,Mansumana,Mansumana,CHP,Tonkolili District Council,MFL DPPI Mar 2020


In [83]:
# Facilities that share district, chiefdom and short name with another row
mfl_cleaned.loc[
    mfl_cleaned.duplicated(subset=['District', 'Chiefdom', 'Short Name'], keep=False)
]

Unnamed: 0,ID,District,Chiefdom,Facility Name,Short Name,Facility Type,Council,Source
1156,1157,Bombali,Mara,Mara,Mara,MCHP,Bombali District Council,MFL DPPI Mar 2020
1157,1158,Bombali,Mara,Mara,Mara,CHC,Bombali District Council,MFL DPPI Mar 2020


In [81]:
# Write the cleaned DPPI 2020 table
mfl_cleaned.to_csv(
    path_or_buf=dataDir + "\\Cleaned Data\\Overall\\MFL DPPI Mar 2020_cleaned_0814.csv",
    index=False,
)

# FMFL with GPS Coordinates from MoHS

## 1. List of fields

In [127]:
# List the column names of the FMFL (MoHS) facility list.
print("The fields are", fmfl.columns)

The fields are Index(['Unnamed: 0', 'FacilityName', 'Location', 'Region', 'District',
       'Facility Type', 'Ownership', 'Urban / Rural ', 'Province', 'Chiefdom',
       'Facility status as at 31/03/2015  (open = 1, closed = 2) ',
       'GPS_latititude', 'GPS_longitude', 'GPS_altitude', 'GPS_precision',
       'GPS_location (all)'],
      dtype='object')


- `Unnamed: 0`: row number starting from 0.
- `FacilityName`: facility name. 
- `Location`
- `Region`: values include South, North, East, Western
- `District`: admin 1 name.
- `Facility Type`: facility type.
- `Ownership`: values include Govt, NGO, Private, Mission, Other.
- `Urban / Rural `: either Urban or Rural.
- `Province`: values include Southern, Eastern, Northern, Western, may be same as `Region`. 
- `Chiefdom`: admin 2 name.
- `Facility status as at 31/03/2015  (open = 1, closed = 2) `
- `GPS_latititude`: latitude.
- `GPS_longitude`: longitude.
- `GPS_altitude`: altitude. 
- `GPS_precision`: score ranges from 3 to 52.
- `GPS_location (all)`: combination of latitude, longitude, altitude and precision.

Some values in text columns like `District`, `Ownership` have an extra whitespace at the end or beginning. Corrections are made.

In [128]:
# Trim leading/trailing whitespace in the three text columns
fmfl = fmfl.assign(
    FacilityName=fmfl['FacilityName'].str.strip(),
    District=fmfl['District'].str.strip(),
    Ownership=fmfl['Ownership'].str.strip(),
)

In [129]:
# Remove , in values like 1,675 so that altitude is encoded as a float.
# The string clean-up is skipped when the column is already numeric, so the
# cell can be re-run (the .str accessor raises on a numeric column).
if not pd.api.types.is_numeric_dtype(fmfl['GPS_altitude']):
    fmfl['GPS_altitude'] = fmfl['GPS_altitude'].str.replace(',', '', regex=False)
fmfl['GPS_altitude'] = pd.to_numeric(fmfl['GPS_altitude'])

## 2. Missing values

In [130]:
print("NA values by column")
# isna().mean() is the share of missing rows per column. Scaling to percent
# before rounding avoids float noise such as 5.1000000000000005, which the
# previous round(..., 3) * 100 could produce.
pd.DataFrame({'count': fmfl.isna().sum(),
              'percentage': (fmfl.isna().mean() * 100).round(1)})

NA values by column


Unnamed: 0,count,percentage
Unnamed: 0,0,0.0
FacilityName,0,0.0
Location,27,2.0
Region,0,0.0
District,0,0.0
Facility Type,0,0.0
Ownership,45,3.4
Urban / Rural,45,3.4
Province,660,49.7
Chiefdom,660,49.7


## 3. Number of data points

In [131]:
# Number of rows in the FMFL list
print("Number of data points collected:", len(fmfl))

Number of data points collected: 1329


## 4. Facility types

In [132]:
# Some corrections
# upper case everything
# replace HOSP with HOSPITAL
# '^HOSP$' is an anchored regular expression, so regex=True is passed
# explicitly: since pandas 2.0 str.replace treats patterns as literal text by
# default, and the literal string '^HOSP$' would never match.
fmfl[type_dict['fmfl']] = fmfl[type_dict['fmfl']].str.upper()\
.str.replace('^HOSP$', 'HOSPITAL', regex=True)

In [133]:
# Distinct (non-null) facility types, followed by the frequency of each
print('Dataset: FMFL from MoHS')
print("Number of unique facility types:",
      len(fmfl[type_dict['fmfl']].dropna().unique()))
fmfl[type_dict['fmfl']].value_counts()

Dataset: FMFL from MoHS
Number of unique facility types: 7


MCHP        639
CHP         341
CHC         231
HOSPITAL     58
CLINIC       54
OTHER         5
6             1
Name: Facility Type, dtype: int64

## 5. Facility names

### 5.1 How many of the facility names are empty (null or blank text)?

In [134]:
# Count facility names that are null
print("Number of null values:", fmfl[name_dict['fmfl']].isnull().sum())

Number of null values: 0


In [135]:
# Count facility names that are the empty string
print("Number of no text values:", (fmfl[name_dict['fmfl']] == "").sum())

Number of no text values: 0


### 5.2 Name cleaning

In [136]:
# Build a facility-type-free short name: '_' becomes a space, '.' is
# removed, Hosp/Govt/Gov are expanded (case-sensitive), the name is
# title-cased and every phrase in common_types is deleted.
# `regex` is passed explicitly on every call: the default of str.replace
# changed from True to False in pandas 2.0, which would silently turn the
# patterns below into literal text that never matches.
# NOTE(review): unlike the other datasets, '-' and ',' are not replaced here
# and '.' is deleted rather than turned into a space; confirm this is intended.
fmfl['name_short'] = (
    fmfl[name_dict['fmfl']]
    .str.replace('_', ' ', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace(' Hosp | Hosp$', ' Hospital ', regex=True)
    .str.replace(' Govt | Govt$', ' Government ', regex=True)
    .str.replace(' Gov | Gov$', ' Government ', regex=True)
    .str.title()
    .str.replace('|'.join(common_types), '', regex=True, case=False)
    .str.strip()
)

In [140]:
#fmfl[fmfl['Facility Type']=='HOSPITAL'][['District', 'Chiefdom', 'FacilityName', 'Facility Type', 'name_short']].sample(10)

In [141]:
# Rows of fmfl that have both a latitude and a longitude.
# NOTE(review): the "drop duplicates" cell further down rebuilds fmfl_cleaned
# from fmfl, so on a top-to-bottom run this result is overwritten there.
fmfl_cleaned = fmfl.dropna(subset=['GPS_latititude', 'GPS_longitude'])
#fmfl_cleaned[['Unnamed: 0', 'FacilityName', 'Region', 'District',
#       'Facility Type', 'Ownership', 'Chiefdom',
#       'GPS_latititude', 'GPS_longitude',  'name_short']].to_csv(dataDir+'\\'+'fmfl2_0810.csv', index=False)

## 6. Duplicates within dataset

In [137]:
# Count rows per (district, name, type, latitude, longitude) combination and
# show the most repeated combinations
(
    fmfl
    .groupby(['District', 'FacilityName', 'Facility Type',
              'GPS_latititude', 'GPS_longitude'])
    .agg(count=('Unnamed: 0', 'count'))
    .sort_values('count', ascending=False)
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
District,FacilityName,Facility Type,GPS_latititude,GPS_longitude,Unnamed: 5_level_1
Kenema,Sembehun,MCHP,8.020815,-11.327827,3
Pujehun,Saahun,MCHP,7.487952,-11.41776,2
Pujehun,Nyandehun,MCHP,7.224288,-11.709803,2
Moyamba,Gondama,MCHP,7.751333,-12.134017,2
Pujehun,Taninahun,MCHP,7.54858,-11.804628,2


In [138]:
# drop duplicates: for rows identical in district, name, type and
# coordinates, keep only the last occurrence
fmfl_cleaned = fmfl.drop_duplicates(
    subset=['District', 'FacilityName', 'Facility Type',
            'GPS_latititude', 'GPS_longitude'],
    keep='last',
)

In [139]:
# Rows removed by the de-duplication step
print("Number of duplicates dropped:", len(fmfl) - len(fmfl_cleaned))

Number of duplicates dropped: 23


In [140]:
# Expand the short region labels to the full region names.
region_dict = {'South':'Southern', 'North':'Northern', 
               'East':'Eastern', 'Western':'Western'}
# Values that are not keys are passed through unchanged, so re-running this
# cell on already-expanded names is harmless (a plain dict lookup raised
# KeyError on 'Southern' etc.).
fmfl_cleaned['Region'] = fmfl_cleaned['Region'].map(lambda reg: region_dict.get(reg, reg))
# Still fail loudly on anything that is not a known region (including NaN).
unexpected_regions = set(fmfl_cleaned['Region'].astype(str)) - set(region_dict.values())
if unexpected_regions:
    raise KeyError(f"Unrecognised Region values: {sorted(unexpected_regions)}")

In [141]:
# Select and order the output columns, then tag every row with its source
fmfl_cleaned = fmfl_cleaned[[
    'Unnamed: 0', 'Region', 'District', 'Chiefdom', 'FacilityName',
    'name_short', 'Facility Type', 'GPS_latititude',
    'GPS_longitude', 'GPS_altitude', 'GPS_precision',
    'Ownership', 'Urban / Rural ',
    'Facility status as at 31/03/2015  (open = 1, closed = 2) ',
]].assign(Source='FMFL from MoHS')

In [142]:
# Switch to the column names shared by the cleaned outputs
fmfl_cleaned = fmfl_cleaned.rename(columns={
    'Unnamed: 0': 'ID',
    'FacilityName': 'Facility Name',
    'GPS_latititude': 'Latitude',
    'GPS_longitude': 'Longitude',
    'GPS_altitude': 'Altitude',
    'GPS_precision': 'GPS Accuracy',
    'Facility status as at 31/03/2015  (open = 1, closed = 2) ': 'Functional Status',
    'Urban / Rural ': 'Urban/Rural',
    'name_short': 'Short Name',
})

In [143]:
#fmfl_cleaned.head()

In [144]:
# Use the 'Western Area ...' spelling for the two Western districts; every
# other district name is left as it is
district_dict = {
    'Western Rural': 'Western Area Rural',
    'Western Urban': 'Western Area Urban',
}
fmfl_cleaned['District'] = fmfl_cleaned['District'].map(
    lambda dist: district_dict.get(dist, dist)
)

In [167]:
# Write the cleaned FMFL table. index=False matches the other exports in this
# notebook and keeps an extra unnamed index column out of the file.
fmfl_cleaned.to_csv(dataDir + "\\Cleaned Data\\Overall\\fmfl_cleaned_0812.csv",
                    index=False)

In [168]:
# Keep only rows that have both coordinates, then export them. index=False
# matches the other exports in this notebook and keeps an extra unnamed index
# column out of the file.
fmfl_geo = fmfl_cleaned.dropna(subset=['Longitude','Latitude'])
fmfl_geo.to_csv(dataDir + "\\Cleaned Data\\No missing coordinates\\fmfl_geo_cleaned_0812.csv",
                index=False)

# MFL from Unicef and MoHS

## Number of data points

In [145]:
# Row counts: the DPPI 2020 list (mfl) and the de-duplicated FMFL table
print("Unicef:", len(mfl))
print("MoHS:", len(fmfl_cleaned))

Unicef: 1394
MoHS: 1306


In [147]:
# Column names of the cleaned FMFL table.
fmfl_cleaned.columns

Index(['ID', 'Region', 'District', 'Chiefdom', 'Facility Name', 'Short Name',
       'Facility Type', 'Latitude', 'Longitude', 'Altitude', 'GPS Accuracy',
       'Ownership', 'Urban/Rural', 'Functional Status', 'Source'],
      dtype='object')

In [148]:
# Column names of the cleaned DPPI 2020 table.
mfl_cleaned.columns

Index(['ID', 'District', 'Chiefdom', 'Facility Name', 'Short Name',
       'Facility Type', 'Council', 'Source'],
      dtype='object')

## Find overlap

In [149]:
# Column subsets of both cleaned tables used for the name matching below
fmfl2 = fmfl_cleaned.loc[:, ['District', 'Chiefdom', 'Facility Name',
                             'Facility Type', 'Short Name', 'Source']]

mfl2 = mfl_cleaned.loc[:, ['District', 'Chiefdom', 'Facility Name',
                           'Facility Type', 'Short Name', 'Source']]

In [180]:
#fmfl2['District'].unique()

In [151]:
#mfl2['District'].unique()

In [152]:
# match on district and short name (inner join; one output row per matching pair)
short_name = pd.merge(mfl2, fmfl2, on=['District', 'Short Name'],
                      suffixes=('_mfl', '_fmfl'))
print("Match on district and short name:")
print("Number of points overlap:", len(short_name))

Match on district and short name:
Number of points overlap: 793


In [154]:
# Map the DPPI 2020 facility types onto the coarser set of types used for
# matching (hospital variants -> HOSPITAL, 'BEMONC - CHC' -> CHC, ...).
fac_type_dict = {'MCHP':'MCHP', 'CHP':'CHP', 'CHC':'CHC', 
                 'HOSPITAL':'HOSPITAL', 'CLINIC':'CLINIC', 
                 'REFERRAL HOSPITAL':'HOSPITAL',
                 'DUNAMOR':'OTHER', 'BEMONC - CHC':'CHC', 
                 'GOVERNMENT HOSPITAL':'HOSPITAL',
                 'MISSION HOSPITAL':'HOSPITAL', 
                 'INDUSTRIAL HOSPITAL':'HOSPITAL', 
                 'MAMAGEWOH':'OTHER', np.nan:np.nan}
# Series.map yields NaN for missing types. The previous fac_type_dict[ftype]
# lookup only found the np.nan key when the missing value was that very
# object, because NaN never compares equal to itself.
mfl2['Facility Type2'] = mfl2['Facility Type'].map(fac_type_dict)
# Keep failing loudly when a non-missing type has no entry in the dict.
unmapped_types = set(mfl2.loc[mfl2['Facility Type2'].isna()
                              & mfl2['Facility Type'].notna(), 'Facility Type'])
if unmapped_types:
    raise KeyError(f"Facility types missing from fac_type_dict: {sorted(unmapped_types)}")

In [155]:
# match on short name and (harmonised) facility type; district is not part
# of the key here
df = pd.merge(mfl2, fmfl2,
              left_on=['Short Name', 'Facility Type2'],
              right_on=['Short Name', 'Facility Type'],
              suffixes=('_mfl', '_fmfl'))
print("Match on short name and facility type:")
print("Number of points overlap:", len(df))

Match on short name and facility type:
Number of points overlap: 727


In [197]:
#df[df['Facility Type2']=='HOSPITAL'].head()

# Matching DPPI 2020 list and SARA data

In [156]:
# Rebuild mfl2 from the cleaned DPPI 2020 table with the columns used for
# matching against SARA
mfl2 = mfl_cleaned.loc[:, ['District', 'Chiefdom', 'Facility Name',
                           'Facility Type', 'Short Name']]

In [157]:
# SARA columns used for matching, including the repaired coordinates
sara2 = sara_cleaned.loc[:, ['District', 'Facility Name', 'Facility Type',
                             'Short Name', 'Latitude', 'Longitude']]

In [159]:
# match on district and short name (inner join; one output row per matching pair)
df = pd.merge(mfl2, sara2, on=['District', 'Short Name'],
              suffixes=('_mfl', '_sara'))
print("Match on district and short name:")
print("Number of points overlap:", len(df))

Match on district and short name:
Number of points overlap: 777


In [160]:
# Map the DPPI 2020 facility types onto the coarser set of types used for
# matching (hospital variants -> HOSPITAL, 'BEMONC - CHC' -> CHC, ...).
fac_type_dict = {'MCHP':'MCHP', 'CHP':'CHP', 'CHC':'CHC', 
                 'HOSPITAL':'HOSPITAL', 'CLINIC':'CLINIC', 
                 'REFERRAL HOSPITAL':'HOSPITAL',
                 'DUNAMOR':'OTHER', 'BEMONC - CHC':'CHC', 
                 'GOVERNMENT HOSPITAL':'HOSPITAL',
                 'MISSION HOSPITAL':'HOSPITAL', 
                 'INDUSTRIAL HOSPITAL':'HOSPITAL', 
                 'MAMAGEWOH':'OTHER', np.nan:np.nan}
# Series.map yields NaN for missing types. The previous fac_type_dict[ftype]
# lookup only found the np.nan key when the missing value was that very
# object, because NaN never compares equal to itself.
mfl2['Facility Type2'] = mfl2['Facility Type'].map(fac_type_dict)
# Keep failing loudly when a non-missing type has no entry in the dict.
unmapped_types = set(mfl2.loc[mfl2['Facility Type2'].isna()
                              & mfl2['Facility Type'].notna(), 'Facility Type'])
if unmapped_types:
    raise KeyError(f"Facility types missing from fac_type_dict: {sorted(unmapped_types)}")

In [161]:
# match on short name and (harmonised) facility type; district is not part
# of the key here
df = pd.merge(mfl2, sara2,
              left_on=['Short Name', 'Facility Type2'],
              right_on=['Short Name', 'Facility Type'],
              suffixes=('_mfl', '_sara'))
print("Match on short name and facility type:")
print("Number of points overlap:", len(df))

Match on short name and facility type:
Number of points overlap: 691


In [163]:
# match on district, short name and (harmonised) facility type
df = pd.merge(mfl2, sara2,
              left_on=['District', 'Short Name', 'Facility Type2'],
              right_on=['District', 'Short Name', 'Facility Type'],
              suffixes=('_mfl', '_sara'))
print("Match on district, short name and facility type:")
print("Number of points overlap:", len(df))

Match on district, short name and facility type:
Number of points overlap: 613
