In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import numpy as np
from folium.plugins import MarkerCluster

In [2]:
dataDir = r'C:\\Users\\DUANYUEYUN\\Documents\\ArcGIS\\Projects\\GRID3\\Sub-Saharan_health_facilities.gdb'
fiona.listlayers(dataDir)

['ISS_sub_saharan', 'HDX_WHO_sub_saharan_health_facilities']

In [3]:
df = gpd.read_file(dataDir, driver='FileGDB', layer='HDX_WHO_sub_saharan_health_facilities')

In [4]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,Country,Admin1,Facility_n,Facility_t,Ownership,Lat,Long,LL_source,geometry
0,Angola,Bengo,Hospital Barra Do Dande,Hospital,Govt.,-8.656,13.4919,Google Earth,POINT (13.49190 -8.65600)
1,Angola,Bengo,Hospital Dos Dembos,Hospital,Govt.,-8.5026,14.5862,Google Earth,POINT (14.58620 -8.50260)
2,Angola,Bengo,Hospital Municipal de Ambriz,Municipal Hospital,Govt.,-7.8522,13.1307,Google Earth,POINT (13.13070 -7.85220)
3,Angola,Bengo,Hospital Municipal de Bula Atumba,Municipal Hospital,Govt.,-8.6742,14.7925,Google Earth,POINT (14.79250 -8.67420)
4,Angola,Bengo,Hospital Municipal de Dande,Municipal Hospital,Govt.,-8.5835,13.6569,Google Earth,POINT (13.65690 -8.58350)


In [5]:
# Define some variables
country_col = 'Country'
name_col = 'Facility_n'
type_col = 'Facility_t'
priority_countries = ['South Sudan', 'Mozambique', 'Namibia', 'Nigeria', 'Zambia',
                      'Sierra Leone', 'Ghana',  'Burkina Faso', 'Ethiopia', 'Somalia', 
                      'Rwanda', 'Kenya', 'Zimbabwe', 'Democratic Republic of the Congo']

# 1. What is the list of fields?

In [6]:
print("The fields are", df.columns)

The fields are Index(['Country', 'Admin1', 'Facility_n', 'Facility_t', 'Ownership', 'Lat',
       'Long', 'LL_source', 'geometry'],
      dtype='object')


# 2. What does each describe?


- `Country`: country where the facility is located.
- `Admin1`: first level administrative division.
- `Facility_n`: facility name.
- `Facility_t`: facility type.
- `Ownership`: type of organization / institution that owns the facility.
- `Lat`: latitude.
- `Long`: longitude.
- `LL_source`: source of the coordinates.
- `geometry`: contains the geometric object.

# 3. Are they all populated?

Observation: some values in the text columns are simply a single whitespace " ", which should be encoded as na.

After making the correction, `Ownership` and `LL_source` have missing values; other columns are all populated.

There are some rows where latitude and longitude are both 0 and the geometry object contains very extreme values.

## Text columns

In [7]:
categorical_cols = df.columns[df.dtypes == object]

In [8]:
# correct for the null values
for col in categorical_cols:
    new_col = col + '_corr'
    df[new_col] = df[col].replace(' ', np.nan)

## Location

In [13]:
# sample 10 rows where latitude and longitude are both 0
df[(df['Lat']==0) & (df['Long']==0)].shape

(2350, 17)

In [18]:
Lat_corr = []
for index, row in df.iterrows():
    if (row['Lat']==0) and (row['Long'==0]):
        Lat_corr.append(np.nan)
    else:
        Lat_corr.append(row['Lat'])
        
Long_corr = []
for index, row in df.iterrows():
    if (row['Lat']==0) and (row['Long'==0]):
        Long_corr.append(np.nan)
    else:
        Long_corr.append(row['Long'])        

In [19]:
df['Lat_corr'] = Lat_corr
df['Long_corr'] = Long_corr

In [20]:
print("NA values by column")
pd.DataFrame({'count':df.isna().sum(), 'percentage':round(df.isna().sum() / df.shape[0],3) * 100})

NA values by column


Unnamed: 0,count,percentage
Country,0,0.0
Admin1,0,0.0
Facility_n,0,0.0
Facility_t,0,0.0
Ownership,0,0.0
Lat,0,0.0
Long,0,0.0
LL_source,0,0.0
geometry,0,0.0
Country_corr,0,0.0


In [21]:
df[(pd.isna(df['Lat_corr'])) & (~pd.isna(df['LL_source_corr']))]

Unnamed: 0,Country,Admin1,Facility_n,Facility_t,Ownership,Lat,Long,LL_source,geometry,Country_corr,Admin1_corr,Facility_n_corr,Facility_t_corr,Ownership_corr,LL_source_corr,Lat_corr,Long_corr
95648,Uganda,Western,Kyempara Health Centre II,Health Centre II,MoH,0.0,29.747,GPS,POINT (29.74700 0.00000),Uganda,Western,Kyempara Health Centre II,Health Centre II,MoH,GPS,,


# 4. Is there information on when the data was collected?  If so, what is the date range?

There is no column related to time in the data set. According to the website where the data is posted, the dataset is dated Jul 25, 2019 and was updated on Feb 20, 2020.

Data website: https://data.humdata.org/dataset/health-facilities-in-sub-saharan-africa

# 5. How many countries does the dataset cover?

In [12]:
print("Number of countries covered:", df[country_col].nunique())

Number of countries covered: 50


# 6. What are those countries?

In [13]:
print("Countries covered:", df[country_col].unique())

Countries covered: ['Angola' 'Benin' 'Botswana' 'Burkina Faso' 'Burundi' 'Cameroon'
 'Cape Verde' 'Central African Republic' 'Chad' 'Comoros' 'Congo'
 "Cote d'Ivoire" 'Democratic Republic of the Congo' 'Djibouti'
 'Equatorial Guinea' 'Eritrea' 'eSwatini' 'Ethiopia' 'Gabon' 'Gambia'
 'Ghana' 'Guinea' 'Guinea Bissau' 'Kenya' 'Lesotho' 'Liberia' 'Madagascar'
 'Malawi' 'Mali' 'Mauritania' 'Mauritius' 'Mozambique' 'Namibia' 'Niger'
 'Nigeria' 'Rwanda' 'Sao Tome and Principe' 'Senegal' 'Seychelles'
 'Sierra Leone' 'Somalia' 'South Africa' 'South Sudan' 'Sudan' 'Tanzania'
 'Togo' 'Uganda' 'Zambia' 'Zanzibar' 'Zimbabwe']


# 7. How many data points have been collected overall?

In [14]:
print("Number of data points collected:", df.shape[0])

Number of data points collected: 98745


# 8. How many of the facility names are empty, both null or no text?

In [15]:
print("Number of null values:", df[name_col].isna().sum())

Number of null values: 0


In [16]:
print("Number of no text values:", sum(df[name_col] == ""))

Number of no text values: 0


In [17]:
print("Number of data points:", df.shape[0])
print("Number of unique facility names:", df[name_col].nunique())

Number of data points: 98745
Number of unique facility names: 93503


In [18]:
name_len = name_col + '_len'
df[name_len] = df[name_col].str.len()

In [19]:
# facilities with shortest names
df.sort_values(by=name_len).head()

Unnamed: 0,Country,Admin1,Facility_n,Facility_t,Ownership,Lat,Long,LL_source,geometry,Country_corr,Admin1_corr,Facility_n_corr,Facility_t_corr,Ownership_corr,LL_source_corr,Facility_n_len
43657,Liberia,Grand Bassa,Sue,Clinic,,6.5615,-9.2151,GPS,POINT (-9.21510 6.56150),Liberia,Grand Bassa,Sue,Clinic,,GPS,3
43542,Liberia,Bong,Gou,Clinic,,7.1205,-9.7157,GPS,POINT (-9.71570 7.12050),Liberia,Bong,Gou,Clinic,,GPS,3
44185,Liberia,Sinoe,ENI,Clinic,,5.4363,-8.9317,GPS,POINT (-8.93170 5.43630),Liberia,Sinoe,ENI,Clinic,,GPS,3
44112,Liberia,Nimba,Zao,Clinic,,7.0167,-9.0333,GPS,POINT (-9.03330 7.01670),Liberia,Nimba,Zao,Clinic,,GPS,3
43605,Liberia,Gbarpolu,ULC,Clinic,,7.7912,-10.5732,GPS,POINT (-10.57320 7.79120),Liberia,Gbarpolu,ULC,Clinic,,GPS,3


# 9. How many data points have been collected for each country?

In [21]:
counts_by_country = df[country_col].value_counts()
print("Raw counts by country:")
print(counts_by_country)

Raw counts by country:
Nigeria                             20807
Democratic Republic of the Congo    14586
Tanzania                             6304
Kenya                                6146
Ethiopia                             5215
South Africa                         4303
Uganda                               3792
Cameroon                             3061
Niger                                2886
Madagascar                           2677
Ghana                                1960
Cote d'Ivoire                        1792
South Sudan                          1747
Guinea                               1746
Burkina Faso                         1721
Mozambique                           1579
Angola                               1575
Mali                                 1478
Senegal                              1347
Chad                                 1283
Zambia                               1263
Zimbabwe                             1236
Sierra Leone                         1120
Somalia    

In [22]:
# Number of data points for priority countries
print('Raw counts for priority countries only:')
counts_by_country[priority_countries].sort_values(ascending=False)

Raw counts for priority countries only:


Nigeria                             20807
Democratic Republic of the Congo    14586
Kenya                                6146
Ethiopia                             5215
Ghana                                1960
South Sudan                          1747
Burkina Faso                         1721
Mozambique                           1579
Zambia                               1263
Zimbabwe                             1236
Sierra Leone                         1120
Somalia                               879
Rwanda                                572
Namibia                               369
Name: Country, dtype: int64

In [31]:
print("Counts for priority countries based on number of unique facility names:")
df[df[country_col].isin(priority_countries)].groupby(country_col)\
.agg(count=(name_col, 'nunique')).sort_values('count' ,ascending=False)

Counts for priority countries based on number of unique facility names:


Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
Nigeria,19990
Democratic Republic of the Congo,12624
Kenya,5979
Ethiopia,4897
Ghana,1923
South Sudan,1718
Burkina Faso,1650
Mozambique,1558
Zambia,1234
Zimbabwe,1234


# 10. What are the “types” used for each priority country(listed below)? 
Each health Facility is assigned a type such as Hospital, Health Facility, etc. Please list the unique types and count for each.

In [32]:
for country in priority_countries:
    df_ctr = df[df[country_col]==country]
    print(country + ',', "Number of unique facility types:", df_ctr[type_col].nunique())
    print(df_ctr[type_col].value_counts())
    print('\n')

South Sudan, Number of unique facility types: 5
Primary Health Care Unit      1375
Primary Health Care Centre     332
County Hospital                 28
State Hospital                   9
Teaching Hospital                3
Name: Facility_t, dtype: int64


Mozambique, Number of unique facility types: 11
Centro de Saúde Rural II    982
Posto de Saúde              262
Centro de Saúde Rural I     130
Centro de Saúde Urbano B     56
Centro de Saúde Urbano C     49
Centro de Saúde Urbano A     39
Hospital Rural               29
Hospital Distrital           16
Hospital Provincial           8
Hospital Geral                5
Hospital Central              3
Name: Facility_t, dtype: int64


Namibia, Number of unique facility types: 6
Clinic                   290
Health Centre             43
District Hospital         29
Mission Hospital           3
Intermediate Hospital      3
Central Hospital           1
Name: Facility_t, dtype: int64


Nigeria, Number of unique facility types: 21
Primary Health 

# 11. Make a map of the dataset, how does the data look? Does it all fall within Africa?

Since trying to plot all data points really slows things down, 1000 data points are randomly sampled and plotted on the map. The process is repeated by changing the random state to view different data points.

It looks like the sampled data points all fall within Africa. There are a few points that might be on some islands near the African continent.

In [31]:
# Re-project to WGS84
df['geometry'] = df['geometry'].to_crs(epsg=4326)

In [71]:
# Define some global variables
CENTER_POINTS = {}
# overall
CENTER_POINTS['Overall'] = [0.8560,21.5125]
# country with lowest number of observations, used to test things out
CENTER_POINTS['Somalia'] = [6.8864,45.1807]
# Countries with the largest number of observations
CENTER_POINTS['Ghana'] = [8.1813, -1.1766]

CENTER_POINTS['Namibia'] = [-22.5644, 17.0718]

In [56]:
def make_map(zoom_start, sample_size, random_state, df):
    m = folium.Map(location=CENTER_POINTS['Overall'], 
                   zoom_start=zoom_start, control_scale=True)

    # Create a Clustered map where points are clustered
    marker_cluster = MarkerCluster().add_to(m)
    # Create health facilities as points on top of the map
    for idx, row in df.sample(sample_size, random_state=random_state).iterrows():
        # Get lat and lon of points
        lon = row['geometry'].x
        lat = row['geometry'].y

        # Add marker to the map
        folium.Circle(location=[lat, lon], 
                      radius=2).add_to(m) 
    return m

In [2]:
#make_map(zoom_start=2, sample_size=1000, random_state=25, df=df)

In [1]:
#make_map(zoom_start=3, sample_size=1000, random_state=50, df=df)

In [59]:
make_map(zoom_start=3, sample_size=1000, random_state=75, df=df)

In [55]:
#outfp = r'WHO_map3.html'
#m.save(outfp)

# 12. Zooming into the map do you see multiple data points collected for the same location?

After zooming in to some extent, the data points are some distance apart from one another. It is not obvious that multiple data points are collected for the same location. But, this could also be because only 1000 data points are sampled. There should be duplicates since the number of unique facility names is less than the number of data points in total.

In [70]:
m = folium.Map(location=CENTER_POINTS['Overall'], 
                zoom_start=3, control_scale=True)

# Create a Clustered map where points are clustered
marker_cluster = MarkerCluster().add_to(m)
# Create health facilities as points on top of the map
for idx, row in df.sample(1000, random_state=25).iterrows():
        # Get lat and lon of points
    lon = row['geometry'].x
    lat = row['geometry'].y
    
    facility_name = row[name_col]

    # Add marker to the map
    folium.Marker(location=[lat, lon], popup=facility_name,
                      radius=2).add_to(m) 
m

# 13. What other observation can be made when looking at the map?

The map below shows all health facilities in Namibia. It looks like there are many more health facilities near the Northern border as compared to the rest of the country.

In [73]:
m = folium.Map(location=CENTER_POINTS['Namibia'], 
                zoom_start=6, control_scale=True)

# Create a Clustered map where points are clustered
marker_cluster = MarkerCluster().add_to(m)
# Create health facilities as points on top of the map
for idx, row in df[df[country_col]=='Namibia'].iterrows():
        # Get lat and lon of points
    lon = row['geometry'].x
    lat = row['geometry'].y
    
    facility_name = row[name_col]

    # Add marker to the map
    folium.Circle(location=[lat, lon], popup=facility_name,
                      radius=2).add_to(m) 
m

# 14. Other columns

In [33]:
# Ownership
df['Ownership_corr'].unique()

array(['Govt.', 'MoH', 'Public', 'FBO', nan, 'Privé à but non lucratif',
       'CBO', 'Publique', 'Confessionnel', 'ONG/non-lucratif', 'NGO',
       'FBO/NGO', 'Private not for profit', 'Local authority', 'MoHQL',
       'MoHSS', 'MoHL', 'MoHL/NGO', 'Parastatal', 'MoHCDGEC', 'NGO/FBO'],
      dtype=object)

In [34]:
# LL_source
df['LL_source_corr'].unique()

array(['Google Earth', nan, 'Combination', 'Other', 'Encarta', 'GPS',
       'Fallingrain', 'Geonames', 'Digitized from online map',
       'OpenStreetMap', 'GeoNames', 'google earth', 'combination',
       'Google earth', 'google Earth'], dtype=object)