In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import numpy as np
from folium.plugins import MarkerCluster

In [2]:
# Location of the file geodatabase holding the health-facility layer.
# A raw string must use single backslashes: inside r'...' a doubled backslash
# stays doubled, which put literal "\\" separators into the path.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the geodatabase before running.
dataDir = r'C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\GRID3\Sub-Saharan_health_facilities.gdb'

In [3]:
# Load the health-facility layer from the file geodatabase and keep only the
# Sierra Leone records.
# `.copy()` makes the subset an independent frame: later cells add `*_corr`
# columns to `df`, and writing to a filtered slice can raise
# SettingWithCopyWarning.
df = gpd.read_file(dataDir, driver='FileGDB', layer='HDX_WHO_sub_saharan_health_facilities')
df = df[df['Country'] == 'Sierra Leone'].copy()

In [4]:
# Show every column when a frame is displayed (no column truncation).
pd.options.display.max_columns = None

In [5]:
# Names of the columns referenced throughout the analysis
country_col, name_col, type_col = 'Country', 'Facility_n', 'Facility_t'

# 1. What is the list of fields?

In [6]:
# List every column available in the loaded layer.
field_names = df.columns
print("The fields are", field_names)

The fields are Index(['Country', 'Admin1', 'Facility_n', 'Facility_t', 'Ownership', 'Lat',
       'Long', 'LL_source', 'geometry'],
      dtype='object')


# 2. What does each describe?


- `Country`: country where the facility is located.
- `Admin1`: first level administrative division.
- `Facility_n`: facility name.
- `Facility_t`: facility type.
- `Ownership`: type of organization / institution that owns the facility.
- `Lat`: latitude.
- `Long`: longitude.
- `LL_source`: source of the coordinates.
- `geometry`: contains the geometric object.

# 3. Are they all populated?

Observation: some values in the text columns consist of a single whitespace character (" "); these should be treated as missing values (NaN).

There are also some rows where latitude and longitude are both 0 and the geometry object contains very extreme values. Thus, coordinates might be missing for those points.

For Sierra Leone, all fields are populated.

## Text columns

In [7]:
# Text columns are the ones stored with the generic `object` dtype.
is_object_dtype = df.dtypes == object
categorical_cols = df.columns[is_object_dtype]

In [8]:
# Correct for the null values: for every text column add a `<col>_corr` copy
# in which blank entries are encoded as NaN.
# The pattern matches empty and whitespace-only strings of any length, not
# only the single-space placeholder, so no blank value is counted as populated.
for col in categorical_cols:
    new_col = col + '_corr'
    df[new_col] = df[col].replace(r'^\s*$', np.nan, regex=True)

## Location

In [11]:
# A point recorded at exactly (0, 0) is treated as a missing location, so
# latitude and longitude are blanked out together for those rows.
# The previous row loop wrote `row['Long'==0]`, which indexes the row with the
# boolean `False` instead of comparing the longitude with 0; the vectorized
# mask below performs the intended comparison in a single pass.
missing_coords = (df['Lat'] == 0) & (df['Long'] == 0)

# Kept as plain lists (one entry per row, in row order), as before.
Lat_corr = df['Lat'].mask(missing_coords).tolist()
Long_corr = df['Long'].mask(missing_coords).tolist()

In [12]:
# Store the corrected coordinates next to the raw ones.
df['Lat_corr'], df['Long_corr'] = Lat_corr, Long_corr

In [13]:
print("NA values by column")
# Missing-value count per column, reused for both the count and the share.
na_counts = df.isna().sum()
pd.DataFrame({'count': na_counts, 'percentage': round(na_counts / df.shape[0], 3) * 100})

NA values by column


Unnamed: 0,count,percentage
Country,0,0.0
Admin1,0,0.0
Facility_n,0,0.0
Facility_t,0,0.0
Ownership,0,0.0
Lat,0,0.0
Long,0,0.0
LL_source,0,0.0
geometry,0,0.0
Country_corr,0,0.0


# 4. Is there information on when the data was collected?  If so, what is the date range?

There is no column related to time in the data set. According to the website where the data is posted, the dataset is dated Jul 25, 2019 and was updated on Feb 20, 2020.

Data website: https://data.humdata.org/dataset/health-facilities-in-sub-saharan-africa

# 5. How many data points have been collected overall?

In [14]:
# One row per collected data point.
print("Number of data points collected:", len(df))

Number of data points collected: 1120


# 6. How many of the facility names are empty, i.e. either null or containing no text?

In [15]:
# Facility names that are null (NaN / None).
null_name_count = df[name_col].isna().sum()
print("Number of null values:", null_name_count)

Number of null values: 0


In [16]:
# Facility names that carry no text. Names are stripped first so that
# whitespace-only values (e.g. a single " ") are counted as well as truly
# empty strings; null names are not counted here.
blank_name_count = df[name_col].str.strip().eq("").sum()
print("Number of no text values:", blank_name_count)

Number of no text values: 0


In [17]:
# Compare the row count with the number of distinct facility names.
row_count = len(df)
unique_name_count = df[name_col].nunique()
print("Number of data points:", row_count)
print("Number of unique facility names:", unique_name_count)

Number of data points: 1120
Number of unique facility names: 1085


# 7. What are the “types” used? 
Each health Facility is assigned a type such as Hospital, Health Facility, etc. Please list the unique types and count for each.

In [22]:
# Distinct facility types and how many facilities fall under each.
facility_types = df[type_col]
print("Number of unique facility types:", facility_types.nunique())
print(facility_types.value_counts())

Number of unique facility types: 9
Maternal & Child Health Post    619
Community Health Post           217
Community Health Centre         206
Health Post                      25
Hospital                         19
Mission Hospital                 12
Clinic                           11
Health Centre                    10
Refferal Hospital                 1
Name: Facility_t, dtype: int64


# 8. Make a map of the dataset, how does the data look? Does it all fall within Africa?

It looks like all the data points fall within the country. There are some locations where multiple points were collected.

In [23]:
# Re-project to WGS84 (EPSG:4326).
# The whole GeoDataFrame is reprojected, rather than only overwriting the
# geometry column, so the frame's CRS metadata is updated together with the
# coordinates.
df = df.to_crs(epsg=4326)

In [26]:
def make_map(zoom_start, df, center=(8.542941, -11.783450)):
    """Build a folium map with one clustered marker per facility.

    Parameters
    ----------
    zoom_start : int
        Initial zoom level of the map.
    df : geopandas.GeoDataFrame
        Facilities to plot. Each entry of the `geometry` column is read as a
        point whose `x` is the longitude and `y` the latitude; the column
        named by the notebook-level `name_col` supplies the popup text.
    center : sequence of two floats, optional
        `[lat, lon]` the map is initially centred on. Defaults to the
        coordinate pair this function previously hard-coded.

    Returns
    -------
    folium.Map
        The map with all markers attached to a `MarkerCluster` layer.
    """
    m = folium.Map(location=list(center),
                   zoom_start=zoom_start, control_scale=True)

    # Markers are attached to the cluster layer, not to the map itself;
    # adding them to `m` directly bypasses the clustering entirely.
    marker_cluster = MarkerCluster().add_to(m)

    for point, name in zip(df['geometry'], df[name_col]):
        # A row without a usable point cannot be placed on the map.
        if point is None or point.is_empty:
            continue
        # folium expects [lat, lon], i.e. (y, x) of the point.
        folium.Marker(location=[point.y, point.x], popup=name).add_to(marker_cluster)
    return m

In [28]:
# Render the facility map for the loaded frame.
make_map(df=df, zoom_start=7)

# 14. Other columns

In [33]:
# Distinct ownership labels after blank values were turned into NaN
ownership_values = df['Ownership_corr']
ownership_values.unique()

array(['Govt.', 'MoH', 'Public', 'FBO', nan, 'Privé à but non lucratif',
       'CBO', 'Publique', 'Confessionnel', 'ONG/non-lucratif', 'NGO',
       'FBO/NGO', 'Private not for profit', 'Local authority', 'MoHQL',
       'MoHSS', 'MoHL', 'MoHL/NGO', 'Parastatal', 'MoHCDGEC', 'NGO/FBO'],
      dtype=object)

In [34]:
# Distinct coordinate-source labels after blank values were turned into NaN
ll_source_values = df['LL_source_corr']
ll_source_values.unique()

array(['Google Earth', nan, 'Combination', 'Other', 'Encarta', 'GPS',
       'Fallingrain', 'Geonames', 'Digitized from online map',
       'OpenStreetMap', 'GeoNames', 'google earth', 'combination',
       'Google earth', 'google Earth'], dtype=object)