In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fiona
import geopandas as gpd
import folium
import os
from folium.plugins import MarkerCluster

To download new data, go to https://healthsites.io/map

In [2]:
# Root folder holding one sub-folder of healthsites.io shapefiles per country.
# Set the HEALTHSITES_DATA_DIR environment variable to point at your own copy;
# the original author's local path is kept as the fallback so existing setups
# keep working unchanged.
dataDir = os.environ.get(
    "HEALTHSITES_DATA_DIR",
    r"C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\GRID3\Healthsites",
)

In [3]:
# The shapefile lives at <dataDir>/<country>/<country>-node.shp
country = 'Sierra Leone'
filename = f"{country}-node.shp"
path = os.path.join(dataDir, country, filename)

# Load the facilities and tag every record with the country it came from
df = gpd.read_file(path).assign(country=country)

In [4]:
# Show every column when a DataFrame is displayed (no truncation with "...")
pd.options.display.max_columns = None

In [5]:
# Names of the columns the analysis cells below refer to:
# country tag, facility name, and facility type respectively.
country_col, name_col, type_col = 'country', 'name', 'amenity'

# 1. What is the list of fields?

In [6]:
# List every attribute column present in the loaded dataset
field_index = df.columns
print("The fields are", field_index)

The fields are Index(['osm_id', 'amenity', 'healthcare', 'name', 'operator', 'source',
       'speciality', 'operator_ty', 'contact_num', 'operational',
       'opening_hou', 'beds', 'staff_docto', 'staff_nurse', 'health_amen',
       'dispensing', 'wheelchair', 'emergency', 'insurance', 'water_sourc',
       'electricity', 'is_in_healt', 'url', 'addr_housen', 'addr_street',
       'addr_postco', 'addr_city', 'changeset_i', 'changeset_v', 'changeset_t',
       'changeset_u', 'geometry', 'country'],
      dtype='object')


# 2. What does each describe?

This website lists the data model used by the dataset and description of most attributes.

https://wiki.openstreetmap.org/wiki/Global_Healthsites_Mapping_Project#Data_Model

- `osm_id`: the OpenStreetMap ID of the feature; it is unique for every record in this dataset.

- `amenity`: facility type. Unique values include doctors, hospital, pharmacy, dentist and clinic.

- `healthcare`: A key to tag all places that provide healthcare (are part of the healthcare sector)

- `name`: name of health facility

- `operator`: The operator tag is used to name a company, corporation, person or any other entity who is directly in charge of the current operation of a map object.

- `source`: Used to indicate the source of information (i.e. meta data) added to OpenStreetMap

- `operator_ty`: operator type.

- `contact_num`: contact number

- `operational`: operational status (operational, non_operational, unknown). Used to document an observation of the current functional status of a mapped feature.

- `opening_hou`: opening hours. Describes when something is open or closed. There is a specific standard format for this data https://wiki.openstreetmap.org/wiki/Key:opening_hours/specification

- `beds`: Indicates the number of beds in a hotel or hospital
- `staff_docto`: Indicates the number of doctors at the health facility
- `staff_nurse`: Indicates the number of nurses at the health facility
- `health_amen`: healthcare equipment. Indicates what type of speciality medical equipment is available at the health facility.

- `dispensing`: yes/no. Whether a pharmacy dispenses prescription drugs or not. Used to add information to something that is already tagged as amenity=pharmacy

- `wheelchair`: yes/no. Used to mark places or ways that are suitable to be used with a wheelchair and a person with a disability who uses another mobility device (like a walker)

- `emergency`: yes/no. This key describes various emergency services.

- `insurance`: This key describes the type of health insurance accepted at the healthsite.

- `water_sourc`: Used to indicate the source of the water for features that provide or use water.

- `electricity`: Used to indicate the source of the power generated

- `is_in_healt`: yes/no variable. A tag to describe the level of the health division, working in a similar way as admin_level=\*. There are level 2, 4, 6, 8.

- `url`: Specifying a url related to a feature, in this case the wiki page if it is linked to an organised mapping effort, both through surveying and importing

- `addr_housen`: house number of address

- `addr_street`: street address

- `addr_postco`: post code.

- `addr_city`: city

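- `changeset_i`: changeset ID (field name truncated by the shapefile format), i.e. the ID of the OpenStreetMap changeset that last modified the record.

- `changeset_v`: changeset version (integer), i.e. the version number of the record after that change.

- `changeset_t`: changeset timestamp, i.e. the time at which the change took place.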

- `changeset_u`: user who made the change.

- `geometry`: geometry type and coordinates.

- `country`: column added to keep track of the country the data point belongs to.

In [7]:
# osm_id can serve as a key only if it has as many distinct values as there are rows
len(df) == df['osm_id'].nunique()

True

In [8]:
# List the distinct amenity values so they can be compared with the data description
pd.unique(df['amenity'])

array(['pharmacy', 'hospital', 'clinic', 'dentist', 'doctors', None],
      dtype=object)

# 3. Are they all populated?

No. The detailed attributes of the health facilities have around 90% of their values missing.

In [9]:
print("NA values by column")
# Count the missing values per column once, then derive the share of rows (in %) from it
na_count = df.isna().sum()
pd.DataFrame({'count': na_count, 'percentage': round(na_count / df.shape[0], 3) * 100})

NA values by column


Unnamed: 0,count,percentage
osm_id,0,0.0
amenity,1,0.3
healthcare,282,98.3
name,8,2.8
operator,171,59.6
source,109,38.0
speciality,161,56.1
operator_ty,286,99.7
contact_num,287,100.0
operational,283,98.6


# 4. Is there information on when the data was collected?  If so, what is the date range?

At the time of download, all datasets except Zimbabwe's were last updated on Fri Jun 19 2020. The dataset for Zimbabwe was last updated on Mon Jun 29 2020.

In [10]:
# Parse the changeset timestamps, and keep the calendar date in a separate column
changeset_ts = pd.to_datetime(df['changeset_t'])
df['changeset_t_dt'] = changeset_ts
df['date'] = changeset_ts.dt.date

print("date range by country:")
# Earliest and latest change date per country
df.groupby(country_col)['date'].agg(start='min', end='max')

date range by country:


Unnamed: 0_level_0,start,end
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Sierra Leone,2014-10-11,2020-05-17


# 5. How many data points have been collected overall?

In [11]:
# One row per mapped health facility
n_points = len(df)
print("Number of data points collected:", n_points)

Number of data points collected: 287


# 6. How many of the facility names are empty, both null or no text?

In [12]:
# Facility names that are missing altogether (null)
null_name_count = df[name_col].isna().sum()
print("Number of null values:", null_name_count)

Number of null values: 8


In [13]:
# Names that are present but carry no text. Whitespace is stripped first so that
# names such as " " are counted too; null names stay NaN after .str.strip(),
# compare unequal to "", and are therefore not double-counted here.
no_text_count = df[name_col].str.strip().eq("").sum()
print("Number of no text values:", no_text_count)

Number of no text values: 0


In [14]:
# Compare the number of records with the number of distinct (non-null) facility names
print(f"Number of data points: {len(df)}")
print(f"Number of unique facility names: {df[name_col].nunique()}")

Number of data points: 287
Number of unique facility names: 262


# 7. What are the “types” used? 
Each health Facility is assigned a type such as Hospital, Health Facility, etc. Please list the unique types and count for each.

The facilities are classified in the same way for all countries. The unique values are clinic, dentist, doctors, hospital and pharmacy. However, certain facility types might not be present in certain countries.

In [21]:
# Frequency of each facility type; null types are excluded from the counts,
# so the number of entries equals the number of distinct non-null types
type_counts = df[type_col].value_counts()
print("Number of unique facility types:", len(type_counts))
print(type_counts)

Number of unique facility types: 5
clinic      154
pharmacy    112
hospital     16
dentist       3
doctors       1
Name: amenity, dtype: int64


# 8. Make a map of the dataset, how does the data look? Does it all fall within Africa?

It looks like all the data points fall within the country. 

In [19]:
# Re-project to WGS84 (EPSG:4326) so that x/y are longitude/latitude.
# The whole GeoDataFrame is re-projected (instead of overwriting only the
# geometry column) so that the frame's CRS metadata stays consistent with
# the coordinates it holds.
df = df.to_crs(epsg=4326)

In [18]:
def make_map(zoom_start, df, location=(8.542941, -11.783450)):
    """Build a folium map with one clustered marker per health facility.

    Parameters
    ----------
    zoom_start : int
        Initial zoom level passed to ``folium.Map``.
    df : GeoDataFrame
        Facilities to plot. Each row's ``geometry`` is read through ``.x``
        (used as longitude) and ``.y`` (used as latitude), so it is assumed
        to hold point geometries in a lat/lon CRS such as WGS84. The column
        named by ``name_col`` supplies the popup text.
    location : sequence of two floats, optional
        ``(lat, lon)`` on which the map is centred. Defaults to the
        coordinates that were previously hard-coded in this function.

    Returns
    -------
    folium.Map
        The map with all facility markers attached to a ``MarkerCluster``.
    """
    m = folium.Map(location=list(location),
                   zoom_start=zoom_start, control_scale=True)

    # Cluster layer: nearby markers are grouped until the user zooms in
    marker_cluster = MarkerCluster().add_to(m)

    for geom, name in zip(df['geometry'], df[name_col]):
        # Rows without usable coordinates cannot be placed on the map
        if geom is None or geom.is_empty:
            continue

        # Unnamed facilities get a marker without a popup
        popup = None if pd.isna(name) else str(name)

        # Attach the marker to the cluster (not directly to the map);
        # otherwise the cluster layer stays empty and nothing is clustered.
        folium.Marker(location=[geom.y, geom.x],
                      popup=popup).add_to(marker_cluster)
    return m

In [20]:
# Render the facility map at a country-level zoom
country_map = make_map(zoom_start=7, df=df)
country_map

![](maps/Healthsites_13.1.png)

![](maps/Healthsites_13.2.png)