In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import os
from folium.plugins import MarkerCluster

To download new data, go to https://healthsites.io/map

In [3]:
# Root folder that holds one sub-folder of shapefiles per country.
# Set the HEALTHSITES_DATA_DIR environment variable to run the notebook on
# another machine; the fallback is the original hard-coded location.
dataDir = os.environ.get(
    "HEALTHSITES_DATA_DIR",
    r"C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\GRID3\Healthsites",
)

In [4]:
# Countries covered by this review, one per line for easy editing.
priority_countries = [
    'South Sudan',
    'Mozambique',
    'Namibia',
    'Nigeria',
    'Zambia',
    'Sierra Leone',
    'Ghana',
    'Burkina Faso',
    'Ethiopia',
    'Somalia',
    'Rwanda',
    'Kenya',
    'Zimbabwe',
    'Democratic Republic of the Congo',
]

In [5]:
# Read the node shapefile of every priority country and tag each row with the
# country it came from, so rows stay attributable after concatenation.
# Expected layout on disk: <dataDir>/<country>/<country>-node.shp
dfs = []
for country in priority_countries:
    shapefile_path = os.path.join(dataDir, country, country + '-node.shp')
    # Use a dedicated name so the loop does not shadow the combined `df`
    country_gdf = gpd.read_file(shapefile_path)
    country_gdf['country'] = country
    dfs.append(country_gdf)

In [6]:
# Stack the per-country frames into one table with a fresh 0..n-1 row index.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [7]:
# Show every column when a wide frame is displayed (setting persists for later cells).
pd.options.display.max_columns = None
df.head(n=5)

Unnamed: 0,osm_id,amenity,healthcare,name,operator,source,speciality,operator_ty,contact_num,operational,opening_hou,beds,staff_docto,staff_nurse,health_amen,dispensing,wheelchair,emergency,insurance,water_sourc,electricity,is_in_healt,url,addr_housen,addr_street,addr_postco,addr_city,changeset_i,changeset_v,changeset_t,changeset_u,geometry,country
0,2567047420,clinic,,Nagishot Clinic,,,,,,,,,,,,,,,,,,,,,,,,19296554,1,2013-12-05 21:11:57,yaho,POINT (33.56641 4.26735),South Sudan
1,2651471128,doctors,,Wau Shilluk Clinic,,IRNA,,,,,,,,,,,,,,,,,,,,,,20347773,1,2014-02-03 07:25:25,MSF-Fieldco,POINT (31.74457 9.66147),South Sudan
2,2766827025,hospital,,,,,,,,,,,,,,,,,,,,,,,,,,21492839,1,2014-04-04 11:21:31,Masiya,POINT (28.40755 4.56646),South Sudan
3,2766848998,pharmacy,,Masiya Pharmacy,,,,,,,,,,,,yes,,,,,,,,,,,,21492839,1,2014-04-04 11:29:21,Masiya,POINT (28.38448 4.57701),South Sudan
4,2787812661,clinic,,,,survey,,ngo,,,,,,,,,,,,,,,,,,,,21664128,1,2014-04-13 12:23:20,LaurentS,POINT (33.74447 9.98283),South Sudan


In [8]:
# Column names referenced throughout the analysis
country_col, name_col, type_col = 'country', 'name', 'amenity'

# 1. What is the list of fields?

In [9]:
# List every column of the combined table
print(f"The fields are {df.columns}")

The fields are Index(['osm_id', 'amenity', 'healthcare', 'name', 'operator', 'source',
       'speciality', 'operator_ty', 'contact_num', 'operational',
       'opening_hou', 'beds', 'staff_docto', 'staff_nurse', 'health_amen',
       'dispensing', 'wheelchair', 'emergency', 'insurance', 'water_sourc',
       'electricity', 'is_in_healt', 'url', 'addr_housen', 'addr_street',
       'addr_postco', 'addr_city', 'changeset_i', 'changeset_v', 'changeset_t',
       'changeset_u', 'geometry', 'country'],
      dtype='object')


# 2. What does each describe?

This website lists the data model used by the dataset and description of most attributes.

https://wiki.openstreetmap.org/wiki/Global_Healthsites_Mapping_Project#Data_Model

- `osm_id`: unique OpenStreetMap (OSM) identifier of the feature (uniqueness is checked in the cell below)

- `amenity`: facility type. Unique values include doctors, hospital, pharmacy, dentist and clinic.

- `healthcare`: A key to tag all places that provide healthcare (are part of the healthcare sector)

- `name`: name of health facility

- `operator`: The operator tag is used to name a company, corporation, person or any other entity who is directly in charge of the current operation of a map object.

- `source`: Used to indicate the source of information (i.e. meta data) added to OpenStreetMap

- `operator_ty`: operator type.

- `contact_num`: contact number

- `operational`: operational status (operational, non_operational, unknown). Used to document an observation of the current functional status of a mapped feature.

- `opening_hou`: opening hours. Describes when something is open or closed. There is a specific standard format for this data https://wiki.openstreetmap.org/wiki/Key:opening_hours/specification

- `beds`: Indicates the number of beds in a hotel or hospital
- `staff_docto`: Indicates the number of doctors in a health facility
- `staff_nurse`: Indicates the number of nurses in a health facility
- `health_amen`: healthcare equipment. Indicates what type of speciality medical equipment is available at the health facility.

- `dispensing`: yes/no. Whether a pharmacy dispenses prescription drugs or not. Used to add information to something that is already tagged as amenity=pharmacy

- `wheelchair`: yes/no. Used to mark places or ways that are suitable to be used with a wheelchair and a person with a disability who uses another mobility device (like a walker)

- `emergency`: yes/no. This key describes various emergency services.

- `insurance`: This key describes the type of health insurance accepted at the healthsite.

- `water_sourc`: Used to indicate the source of the water for features that provide or use water.

- `electricity`: Used to indicate the source of the power generated

- `is_in_healt`: yes/no variable. A tag to describe the level of the health division, working in a similar way as admin_level=\*. There are level 2, 4, 6, 8.

- `url`: Specifying a url related to a feature, in this case the wiki page if it is linked to an organised mapping effort, both through surveying and importing

- `addr_housen`: house number of address

- `addr_street`: street address

- `addr_postco`: post code.

- `addr_city`: city

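- `changeset_i`: ID of the OSM changeset that last modified the feature (the field name appears to be truncated from `changeset_id`).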

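- `changeset_v`: integer; presumably the version number of the feature (the field name appears to be truncated from `changeset_version`) — to be confirmed.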

- `changeset_t`: timestamp of when the change was made.

- `changeset_u`: user who made the change.

- `geometry`: geometry type and coordinates.

- `country`: column added to keep track of the country the data point belongs to.

In [10]:
# osm_id is unique when the number of distinct ids equals the number of rows
len(df) == df['osm_id'].nunique()

True

In [11]:
# List the distinct amenity values to compare them with the data description
pd.unique(df['amenity'])

array(['clinic', 'doctors', 'hospital', 'pharmacy', 'dentist', None],
      dtype=object)

# 3. Are they all populated?

No. Most detailed attributes of the health facilities are missing for around 90% of the records, while core fields such as `osm_id`, `amenity` and `name` are largely populated.

In [12]:
# Missing values per column: absolute count and share of all rows.
# The share is scaled to 0-100 *before* rounding so it carries exactly one
# decimal place; rounding the fraction first and multiplying afterwards can
# leave floating-point noise (e.g. 80.30000000000001).
print("NA values by column")
na_counts = df.isna().sum()  # computed once, reused for both columns
pd.DataFrame({'count': na_counts,
              'percentage': (na_counts / df.shape[0] * 100).round(1)})

NA values by column


Unnamed: 0,count,percentage
osm_id,0,0.0
amenity,50,0.7
healthcare,5943,80.3
name,615,8.3
operator,6943,93.8
source,4454,60.2
speciality,7208,97.4
operator_ty,6412,86.6
contact_num,7302,98.7
operational,7157,96.7


# 4. Is there information on when the data was collected?  If so, what is the date range?

At the time of download, all datasets except Zimbabwe's were last updated on Fri Jun 19 2020; the dataset for Zimbabwe was last updated on Mon Jun 29 2020.

In [13]:
# Parse the changeset timestamp and keep a date-only copy for the summary
df['changeset_t_dt'] = pd.to_datetime(df['changeset_t'])
df['date'] = df['changeset_t_dt'].dt.date

# Earliest and latest change date per country
print("date range by country:")
df.groupby(country_col)['date'].agg(start='min', end='max')

date range by country:


Unnamed: 0_level_0,start,end
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Burkina Faso,2009-01-05,2020-06-09
Democratic Republic of the Congo,2009-07-11,2020-06-16
Ethiopia,2009-12-17,2020-06-17
Ghana,2010-06-23,2020-06-19
Kenya,2007-06-11,2020-06-06
Mozambique,2010-09-08,2020-06-18
Namibia,2010-10-31,2020-06-16
Nigeria,2009-09-19,2020-06-19
Rwanda,2011-08-10,2019-11-10
Sierra Leone,2014-10-11,2020-05-17


# 5. How many countries does the dataset cover?

In [14]:
# Distinct values in the country column
print(f"Number of countries covered: {df[country_col].nunique()}")

Number of countries covered: 14


# 6. What are those countries?

In [15]:
# Country names in order of first appearance
print(f"Countries covered: {df[country_col].unique()}")

Countries covered: ['South Sudan' 'Mozambique' 'Namibia' 'Nigeria' 'Zambia' 'Sierra Leone'
 'Ghana' 'Burkina Faso' 'Ethiopia' 'Somalia' 'Rwanda' 'Kenya' 'Zimbabwe'
 'Democratic Republic of the Congo']


# 7. How many data points have been collected overall?

In [16]:
# One row per collected data point
print(f"Number of data points collected: {len(df)}")

Number of data points collected: 7400


# 8. How many of the facility names are empty, both null or no text?

In [17]:
# Facility names that are null
print(f"Number of null values: {df[name_col].isna().sum()}")

Number of null values: 615


In [18]:
# Count names that are present but carry no text. Whitespace-only names are
# treated as empty as well. Nulls are not counted here: `.str.strip()` leaves
# them as NaN, which never equals "".
blank_name_count = df[name_col].str.strip().eq("").sum()
print("Number of no text values:", blank_name_count)

Number of no text values: 0


In [19]:
# Compare the row count with the number of distinct (non-null) names
print(f"Number of data points: {len(df)}")
print(f"Number of unique facility names: {df[name_col].nunique()}")

Number of data points: 7400
Number of unique facility names: 6454


In [20]:
# Character length of each facility name, stored in a '<name column>_len' column
name_len = f"{name_col}_len"
df[name_len] = df[name_col].str.len()

In [21]:
# Five facilities with the shortest names
df.sort_values(by=name_len, ascending=True).iloc[:5]

Unnamed: 0,osm_id,amenity,healthcare,name,operator,source,speciality,operator_ty,contact_num,operational,opening_hou,beds,staff_docto,staff_nurse,health_amen,dispensing,wheelchair,emergency,insurance,water_sourc,electricity,is_in_healt,url,addr_housen,addr_street,addr_postco,addr_city,changeset_i,changeset_v,changeset_t,changeset_u,geometry,country,changeset_t_dt,date,name_len
7144,6827857141,doctors,,A,,,,,,,,,,,,,,,,,,,,,,,,74965213,1,2019-09-26 14:11:27,JerryCho,POINT (18.43736 -5.66682),Democratic Republic of the Congo,2019-09-26 14:11:27,2019-09-26,1.0
4025,5225074922,clinic,,ጀሞ,,,,,,,,,,,,,,,,,,,,,,,,53719246,1,2017-11-12 17:43:18,Amanuel minda,POINT (38.67995 8.99832),Ethiopia,2017-11-12 17:43:18,2017-11-12,2.0
2649,5535787467,pharmacy,,CP,,,,,,,,,,,,,,,,,,,,,,,,57890199,1,2018-04-07 12:21:39,Enock4seth,POINT (-0.36557 6.22190),Ghana,2018-04-07 12:21:39,2018-04-07,2.0
3248,7214156287,clinic,,Mw,,,,,,,,,,,,,,,,,,,,,,,,81044760,1,2020-02-15 14:29:10,Arnson,POINT (-2.64177 7.22064),Ghana,2020-02-15 14:29:10,2020-02-15,2.0
4063,5661188743,pharmacy,,ባታ,,,,,,,,,,,,,,,,,,,,,,,,59510011,1,2018-06-03 14:13:32,Samuel Berhanu,POINT (37.37015 11.60584),Ethiopia,2018-06-03 14:13:32,2018-06-03,2.0


# 9. How many data points have been collected for each country?

In [23]:
# Number of rows per country, most frequent first
counts_by_country = df[country_col].value_counts()
print("Raw counts for each priority country:", counts_by_country, sep="\n")

Raw counts for each priority country:
Democratic Republic of the Congo    1848
Ghana                               1001
Nigeria                              976
Kenya                                923
Mozambique                           836
Ethiopia                             518
Burkina Faso                         476
Sierra Leone                         287
Zimbabwe                             160
Zambia                               126
Namibia                              121
South Sudan                           51
Rwanda                                50
Somalia                               27
Name: country, dtype: int64


In [24]:
# Distinct facility names per priority country, largest first
print("Counts for priority countries based on number of unique facility names:")
in_priority = df[country_col].isin(priority_countries)
(
    df.loc[in_priority]
    .groupby(country_col)[name_col]
    .nunique()
    .rename('count')
    .to_frame()
    .sort_values(by='count', ascending=False)
)

Counts for priority countries based on number of unique facility names:


Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
Democratic Republic of the Congo,1702
Nigeria,914
Kenya,813
Ghana,800
Mozambique,790
Burkina Faso,422
Ethiopia,349
Sierra Leone,262
Zimbabwe,140
Zambia,105


# 10. What are the “types” used for each priority country (listed below)?
Each health facility is assigned a type such as Hospital, Health Facility, etc. Please list the unique types and the count for each.

The facilities are classified in the same way for all countries. The unique values are clinic, dentist, doctors, hospital and pharmacy. However, certain facility types might not be present in certain countries.

In [25]:
# For each priority country: number of distinct facility types, followed by
# the row count per type.
for country in priority_countries:
    country_types = df.loc[df[country_col] == country, type_col]
    print(f"{country}, Number of unique facility types: {country_types.nunique()}")
    print(country_types.value_counts())
    print('\n')

South Sudan, Number of unique facility types: 4
clinic      35
hospital     8
pharmacy     7
doctors      1
Name: amenity, dtype: int64


Mozambique, Number of unique facility types: 5
clinic      699
pharmacy     72
doctors      47
hospital     15
dentist       2
Name: amenity, dtype: int64


Namibia, Number of unique facility types: 5
pharmacy    56
doctors     26
clinic      24
dentist      5
hospital     3
Name: amenity, dtype: int64


Nigeria, Number of unique facility types: 5
doctors     337
hospital    261
clinic      253
pharmacy    108
dentist       6
Name: amenity, dtype: int64


Zambia, Number of unique facility types: 5
clinic      57
pharmacy    32
hospital    29
doctors      5
dentist      3
Name: amenity, dtype: int64


Sierra Leone, Number of unique facility types: 5
clinic      154
pharmacy    112
hospital     16
dentist       3
doctors       1
Name: amenity, dtype: int64


Ghana, Number of unique facility types: 5
pharmacy    678
clinic      215
doctors      51
hospi

# 11. Make a map of the dataset, how does the data look? Does it all fall within Africa?

Since trying to plot all data points really slows things down, 1000 data points are randomly sampled and plotted on the map. The process is repeated by changing the random state to view different data points.

It looks like the sampled data points all fall within Africa. There are a few points that might be on some islands near the African continent.

In [26]:
# Re-project the geometry column to WGS84 (EPSG:4326)
geometry_wgs84 = df['geometry'].to_crs(epsg=4326)
df['geometry'] = geometry_wgs84

In [27]:
# Map centre coordinates, keyed by area name; values are passed as
# `location=` to folium.Map.
CENTER_POINTS = {
    # whole study area
    'Overall': [0.8560, 21.5125],
    # country with the lowest number of observations, used to test things out
    'Somalia': [6.8864, 45.1807],
    # country with one of the largest numbers of observations
    'Ghana': [8.1813, -1.1766],
    'Namibia': [-22.5644, 17.0718],
}

In [28]:
def make_map(zoom_start, sample_size, random_state, df, center=None):
    """Plot a random sample of facilities as small circles on a folium map.

    Parameters
    ----------
    zoom_start : int
        Initial zoom level of the map.
    sample_size : int
        Number of rows to draw at random. Capped at ``len(df)`` so that a
        small frame (e.g. a single country) is plotted in full instead of
        making ``sample`` raise ``ValueError``.
    random_state : int
        Seed passed to ``sample``; change it to view a different subset.
    df : DataFrame
        Must have a ``geometry`` column of point objects exposing ``.x``
        (used as longitude) and ``.y`` (used as latitude).
    center : list of float, optional
        Location the map is centred on. Defaults to
        ``CENTER_POINTS['Overall']``.

    Returns
    -------
    folium.Map
        The map with one circle per sampled row that has usable coordinates.
    """
    if center is None:
        center = CENTER_POINTS['Overall']
    m = folium.Map(location=center, zoom_start=zoom_start, control_scale=True)

    # Never ask for more rows than the frame contains
    n_points = min(sample_size, len(df))
    # Only the geometry is needed, so sample that column instead of full rows
    sampled_points = df['geometry'].sample(n_points, random_state=random_state)

    # Circles are drawn directly on the map (not inside a marker cluster) so
    # every sampled facility stays individually visible at any zoom level.
    for point in sampled_points:
        # Skip rows without usable coordinates
        if point is None or point.is_empty:
            continue
        folium.Circle(location=[point.y, point.x], radius=2).add_to(m)
    return m

In [29]:
#make_map(zoom_start=2, sample_size=1000, random_state=25, df=df)

In [30]:
#make_map(zoom_start=3, sample_size=1000, random_state=50, df=df)

In [31]:
# Overview map of 1000 randomly sampled facilities (seed 75)
make_map(df=df, sample_size=1000, random_state=75, zoom_start=3)

In [32]:
#m=make_map(zoom_start=3, sample_size=1000, random_state=75, df=df)
#outfp = r'Healthsites_map.html'
#m.save(outfp)

# 12. Zooming into the map do you see multiple data points collected for the same location?

Zoom into one specific country, Ghana, to see if there are overlapping points, since it has the second-largest number of health facilities in this dataset (after the Democratic Republic of the Congo).

A map with location markers and a screenshot with overlapping data points are included below. 

In the screenshot, there are 2 pairs of points that are very close to each other. The pair on the upper left have very similar names: Linivien Pharmacy and Liniven Pharmacy (only one character apart). The pair on the lower right are very close to each other but have different names: Zoe Pharmacy and Mini Clinic.

In [38]:
# Interactive map of every Ghana facility: one marker per row with the
# facility name as popup. Markers are placed directly on the map (no
# clustering) so that near-duplicate points can be seen side by side.
ghana_facilities = df[df[country_col] == 'Ghana']
m = folium.Map(location=CENTER_POINTS['Ghana'],
               zoom_start=6, control_scale=True)

for point, facility_name in zip(ghana_facilities['geometry'],
                                ghana_facilities[name_col]):
    # Skip rows without usable coordinates
    if point is None or point.is_empty:
        continue
    # Unnamed facilities get no popup instead of a literal "nan"
    popup = facility_name if pd.notna(facility_name) else None
    # `radius` is not a Marker option, so it is not passed here
    folium.Marker(location=[point.y, point.x], popup=popup).add_to(m)
m

![](maps/Healthsites_12.png)

In [42]:
# Overlapping points whose facility names differ by a single character
similar_names = ['Linivien Pharmacy', 'Liniven Pharmacy']
df[df[name_col].isin(similar_names)]

Unnamed: 0,osm_id,amenity,healthcare,name,operator,source,speciality,operator_ty,contact_num,operational,opening_hou,beds,staff_docto,staff_nurse,health_amen,dispensing,wheelchair,emergency,insurance,water_sourc,electricity,is_in_healt,url,addr_housen,addr_street,addr_postco,addr_city,changeset_i,changeset_v,changeset_t,changeset_u,geometry,country,changeset_t_dt,date,name_len
2678,5732666951,pharmacy,,Linivien Pharmacy,,,,,,,,,,,,,,,,,,,,,,,,60329857,1,2018-07-01 20:32:53,Enock4seth,POINT (-0.19156 5.67582),Ghana,2018-07-01 20:32:53,2018-07-01,17.0
2709,5809255656,pharmacy,,Liniven Pharmacy,,,,,,,,,,,,,,,,,,,,,,,,61359670,1,2018-08-04 18:33:54,mawutor,POINT (-0.19137 5.67574),Ghana,2018-08-04 18:33:54,2018-08-04,16.0


# 13. What other observation can be made when looking at the map?

In Eastern Africa, there are many health facilities located along the coastline. The same holds for Ghana as well.

![](maps/Healthsites_13.1.png)

![](maps/Healthsites_13.2.png)