In [145]:
import numpy as np
import pandas as pd
import geopandas as gpd
import fiona
import os
import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [146]:
# Notebook-wide pandas settings: show every column and row when a frame is
# displayed, and silence the chained-assignment (SettingWithCopy) warning.
for option_name in ('display.max_columns', 'display.max_rows', 'mode.chained_assignment'):
    pd.set_option(option_name, None)

## Create the type dictionary

### Use HDX_WHO data

In [261]:
# Data import: both source layers are stored in one file geodatabase.
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Sub-Saharan_health_facilities"
folder = "Sub-Saharan_health_facilities.gdb"
path = os.path.join(dataDir, folder)

# The two layers only differ in their layer name; the driver is shared.
read_kwargs = {'driver': 'FileGDB'}
df_who = gpd.read_file(path, layer='HDX_WHO_sub_saharan_health_facilities', **read_kwargs)
# Expose the original row label of every ISS record as an explicit `index` column.
df_iss = gpd.read_file(path, layer='ISS_sub_saharan', **read_kwargs).reset_index()

In [262]:
# One row per (Country, facility type) pair found in the HDX/WHO layer, together
# with the number of facilities of that type; `Facility_t` is exposed as `Type`.
type_dict = (
    df_who.groupby(['Country', 'Facility_t'])
    .size()
    .reset_index(name='count')
    .rename(columns={"Facility_t": 'Type'})
)

In [263]:
# Manual corrections for facility types that are spelled inconsistently in the
# `Type` column (key = spelling found in the data, value = spelling to use).
type_corr_dict = {
    "Unites de Santé de village": "Unites de Santé de Village",
    "Centre Médico-social": "Centre Médico-Social",
    "Poste de santé": "Poste de Santé",
    "Regional hospital": "Hôpital de Regional",
    "Poste De Santé": "Poste de Santé",
    "Hospital Medical Center": "Hospital Medical Centre",
    "Health post": "Health Post",
    "DISPENSARY": "Dispensary",
    "Natonal Hospital": "National Hospital",
    "Level 1 Hospital": "Level I Hospital",
    "Level 2 Hospital": "Level II Hospital",
    "Level 3 Hospital": "Level III Hospital",
    "Primary Health Care Unit +": "Primary Health Care Unit Plus",
    "General Hospital Hospital": "General Hospital",
}
# Types without an entry in the correction table are kept unchanged.
type_dict['Type'] = type_dict['Type'].map(lambda t: type_corr_dict.get(t, t))

In [264]:
# Transliterate every type name to plain ASCII (drops accent marks).
type_dict['Type'] = type_dict['Type'].map(unidecode.unidecode)

In [265]:
# Map spelling variants of the same word onto one spelling. The pairs are
# applied one after another, in the order listed.
spelling_variants = [
    ('Center', 'Centre'),
    ('Clinique', 'Clinic'),
    ('Polyclinique', 'Polyclinic'),
    ('Geral', 'General'),
    ('Dispensaire', 'Dispensary'),
    ('Hopital', 'Hospital'),
    ('Hospitais', 'Hospital'),
]
harmonised_type = type_dict['Type']
for variant, canonical in spelling_variants:
    harmonised_type = harmonised_type.str.replace(variant, canonical)
type_dict['Type'] = harmonised_type.str.strip()


In [266]:
# Obtain the abbreviation by deleting every character that is not an uppercase
# ASCII letter, i.e. keep the capitals only.
# `regex=True` is passed explicitly: pandas 2.0 changed the default of
# `Series.str.replace` to `regex=False`, under which the pattern would be taken
# literally and nothing would be removed.
type_dict['Abbreviation'] = type_dict['Type'].str.replace(r'[^A-Z]', '', regex=True)

In [267]:
# Types that consist of a single word would end up with a one-letter
# abbreviation; give those a readable abbreviation instead.
convert_dict = {'Hospital':'HOSP', 'Clinic':'CLINIC',  
                "Polyclinic":"PCLINIC", "Dispensary":"DISP"}
# Every other type keeps the abbreviation it already has.
type_dict['Abbreviation'] = [
    convert_dict.get(facility_type, current_abbrev)
    for facility_type, current_abbrev in zip(type_dict['Type'], type_dict['Abbreviation'])
]

In [268]:
# Put the columns in their final order.
type_dict = type_dict.loc[:, ['Country', 'Type', 'Abbreviation', 'count']]

In [269]:
# Country-specific overrides: {country: {facility type: abbreviation}}.
# They separate types that would otherwise share an abbreviation within a country.
convert_dict = {"eSwatini":{"Clinic with Maternity":"CWM", "Clinic without Maternity":"CWOM",
                           "Referral Hospital":"RFH", "Regional Hospital":"RGH"},
                "Somalia":{"Referral Hospital":"RFH", "Regional Hospital":"RGH"},
                "Malawi":{"Central Hospital":"CEH", "Community Hospital":"COH"},
                "Gambia":{"Health Centre (major)":"HCMA", "Health Centre (minor)":"HCMI"},
                "Gabon":{"Hospital Cooperation":"HOSPC"},
                "Cameroon":{"Hospital Centraux":"HOSPC"},
                # corrections made based on examining ISS data
                "Uganda": {"Health Centre II": "HC II", "Health Centre III":"HC III",
                           "Health Centre IV":"HC IV"},
                "Mali": {"Community Health Centre":"CSCOM", 
                         "Referral Health Centre":"CSREF"},
                "Niger":{"Integrated Health Centre":"CSI"}
               }
# Rows without an override for their (country, type) pair keep their abbreviation.
type_dict['Abbreviation'] = [
    convert_dict.get(country, {}).get(facility_type, current_abbrev)
    for country, facility_type, current_abbrev in zip(
        type_dict['Country'], type_dict['Type'], type_dict['Abbreviation'])
]

In [270]:
# Spelling fixes can make several rows describe the same type: merge rows with
# identical (Country, Type, Abbreviation) and add up their counts.
type_dict = (
    type_dict
    .groupby(['Country', 'Type', 'Abbreviation'])
    .agg(count=('count', 'sum'))
    .reset_index()
)

### Identify additional prefix/suffix from ISS data

In [272]:
# ISS country labels that differ from the (upper-cased) HDX/WHO country names,
# mapped to the HDX/WHO spelling.
ctr_dict = {
    "CENTRAL_AFRICAN_REPUBLIC": 'CENTRAL AFRICAN REPUBLIC',
    "GUINEA-BISSAU": 'GUINEA BISSAU',
    "RÉPUBLIQUE DÉMOCRATIQUE DU CONGO": 'DEMOCRATIC REPUBLIC OF THE CONGO',
    "SIERRA": 'SIERRA LEONE',
    'SWAZILAND': 'ESWATINI',
}

# Labels that are not listed above are copied over unchanged.
df_iss['Country'] = df_iss['countries'].map(lambda c: ctr_dict.get(c, c))

In [273]:
# Coverage check: print every ISS country that has no upper-cased counterpart
# in the HDX/WHO data. No output means every ISS country is covered.
iss_c = df_iss['Country'].unique()
who_c = [c.upper() for c in df_who['Country'].unique()]
uncovered = [c for c in iss_c if c not in who_c]
for c in uncovered:
    print(c)

Criteria to select prefix/suffix

- the name consists of at least two whitespace-separated tokens
- the prefix/suffix (first/last token of the name)
    - has a length between 2 and 4 characters (inclusive)
    - contains only alphabetic characters
    - is neither DR nor ST
    - does not contain any vowel (A, E, I, O, U)
    - appears more than 100 times within a country

Pre-cleaning on `name_of_facility_visited`.

In [274]:
# Pre-clean the raw facility name into `clean_name`:
#   1. collapse double spaces and drop full stops (literal replacements),
#   2. turn the separators - _ , / ( ) and the whitespace around them into a single space,
#   3. strip leading/trailing whitespace,
#   4. map spelling variants onto one spelling, ignoring case,
#   5. transliterate to plain ASCII (removes accent marks).
# `regex=` is spelled out on every call: pandas 2.0 changed the default of
# `Series.str.replace` from True to False, which would silently turn the
# separator pattern in step 2 into a literal that never matches.
names_precleaned = (
    df_iss['name_of_facility_visited']
    .str.replace("  ", " ", regex=False)
    .str.replace('.', '', regex=False)
    # raw string so that \s, \( and \) reach the regex engine unchanged
    .str.replace(r'\s[-_,/\(\)]\s|[-_,/\(\)]\s|[-_,/\(\)]', ' ', regex=True)
    .str.strip()
)

# Applied in this order; each replacement is case-insensitive.
spelling_variants = [
    ('center', 'centre'),
    ('Clinique', 'Clinic'),
    ('Polyclinique', 'Polyclinic'),
    ('Geral', 'General'),
    ('Dispensaire', 'Dispensary'),
    ('Hôpital', 'Hospital'),
    ('Hopital', 'Hospital'),
    ('Hospitais', 'Hospital'),
    (' Hosp ', ' Hospital '),
]
for variant, canonical in spelling_variants:
    names_precleaned = names_precleaned.str.replace(variant, canonical, case=False, regex=True)

# remove accent marks
df_iss['clean_name'] = [unidecode.unidecode(n) for n in names_precleaned]

In [275]:
# Split every clean name on whitespace; `tmp` holds one list of tokens per ISS
# row and is the input of the prefix and suffix extraction below.
tmp = df_iss['clean_name'].str.split()

In [276]:
# prefix
vowels = set('AEIOU')


def _prefix_candidate(tokens):
    """Return the upper-cased first token if it looks like a type abbreviation, else NaN.

    The name must have at least two tokens, and the first token must be 2-4
    characters long, purely alphabetic, different from ST and DR, and free of
    the vowels A, E, I, O, U.
    """
    if len(tokens) < 2:
        return np.nan
    first = tokens[0]
    if not (1 < len(first) < 5 and first.isalpha()):
        return np.nan
    first_upper = first.upper()
    if first_upper in ('ST', 'DR') or not vowels.isdisjoint(first_upper):
        return np.nan
    return first_upper


df_iss['prefix'] = [_prefix_candidate(t) for t in tmp]

In [277]:
# Number of facility names per (Country, prefix), most frequent prefix first
# within each country.
prefix_counts = (
    df_iss.groupby(['Country', 'prefix'])
    .agg(count=('name_of_facility_visited', 'count'))
    .sort_values(['Country', 'count'], ascending=[True, False])
)
# Keep only prefixes that appear more than 100 times and turn the group keys
# back into ordinary columns.
prefix_dict = prefix_counts.loc[prefix_counts['count'] > 100].reset_index()

In [278]:
# suffix
vowels = set('AEIOU')


def _suffix_candidate(tokens):
    """Return the upper-cased last token if it looks like a type abbreviation, else NaN.

    Same rules as for the prefix, applied to the last token: at least two
    tokens in the name, 2-4 alphabetic characters, not ST or DR, and none of
    the vowels A, E, I, O, U.
    """
    if len(tokens) < 2:
        return np.nan
    last = tokens[-1]
    if not (1 < len(last) < 5 and last.isalpha()):
        return np.nan
    last_upper = last.upper()
    if last_upper in ('ST', 'DR') or not vowels.isdisjoint(last_upper):
        return np.nan
    return last_upper


df_iss['suffix'] = [_suffix_candidate(t) for t in tmp]

In [279]:
# Number of facility names per (Country, suffix), most frequent suffix first
# within each country.
suffix_counts = (
    df_iss.groupby(['Country', 'suffix'])
    .agg(count=('name_of_facility_visited', 'count'))
    .sort_values(['Country', 'count'], ascending=[True, False])
)
# Keep only suffixes that appear more than 100 times and turn the group keys
# back into ordinary columns.
suffix_dict = suffix_counts.loc[suffix_counts['count'] > 100].reset_index()

In [280]:
# Build a dictionary {country: set of frequent prefixes and suffixes}; these
# are the candidates for additional abbreviations.
add_abbrev = {}
for affix_table, affix_column in ((prefix_dict, 'prefix'), (suffix_dict, 'suffix')):
    for country, affix in zip(affix_table['Country'], affix_table[affix_column]):
        # create the country's set on first sight, then add the affix to it
        add_abbrev.setdefault(country, set()).add(affix)

In [281]:
# Build a dictionary of additional abbreviations identified from ISS data:
# frequent ISS prefixes/suffixes that the type dictionary does not contain yet.
new = {}
for group_name in df_iss['Country'].unique():
    if group_name not in add_abbrev:
        continue
    # abbreviations the type dictionary already holds for this country
    in_dictionary = type_dict['Country'].str.upper() == group_name
    known_abbrevs = set(type_dict.loc[in_dictionary, 'Abbreviation'])
    unseen = {affix for affix in add_abbrev[group_name] if affix not in known_abbrevs}
    # only countries with at least one unseen affix get an entry
    if unseen:
        new[group_name] = unseen

print("New abbreviations added to the type dictionary:")
for ctr, affixes in new.items():
    print(ctr, affixes)

New abbreviations added to the type dictionary:
BENIN {'CS'}
BURUNDI {'CDS'}
CAMEROON {'CS'}
CHAD {'CS'}
GUINEA {'CS'}
GUINEA BISSAU {'CS'}
MOZAMBIQUE {'CS'}
NIGERIA {'PHCC', 'MDGS', 'NKST', 'MCH', 'HF', 'WCWC', 'CPHC', 'MCHC', 'FSP', 'PMV', 'PMS', 'TH', 'MDG'}


In [282]:
# Manually curated (Country, Type, Abbreviation) rows to add to the type
# dictionary. They have no facility count, so `count` is NaN for all of them.
additional_rows = [
    ('Benin', 'Centre de Sante', 'CS'),
    ('Burundi', 'Centre de Sante', 'CDS'),
    ('Cameroon', 'Centre de Sante', 'CS'),
    ('Chad', 'Centre de Sante', 'CS'),
    ('Guinea', 'Centre de Sante', 'CS'),
    ('Guinea Bissau', 'Centre de Sante', 'CS'),
    ('Guinea Bissau', 'Centro de Saude', 'CS2'),
    ('Mozambique', 'Centro de Saude', 'CS'),
    ('Nigeria', 'Traditional Spirit Healer', 'TH'),
    ('Nigeria', 'Maternal and Child Health', 'MCH'),
    ('Nigeria', 'Health Facility', 'HF'),
    ('Nigeria', 'Family Support Program', 'FSP'),
    ('Nigeria', 'Millennium Development Goal', 'MDG'),
    ('Nigeria', 'Maternal and Child Health Centre', 'MCHC'),
    ('Nigeria', 'Comprehensive Primary Health Care', 'CPHC'),
    ('Nigeria', 'Primary Health Care Centre', 'PHCC'),
    ('Nigeria', 'Patent Medicine Vendor', 'PMV'),
    ('Nigeria', 'Patent Medicine Store', 'PMS'),
    # obtained by examining the ISS data
    ('Central African Republic', 'Hospital District', 'HD'),
    ('Liberia', 'General Hospital', 'CH'),
    ('Guinea', 'Centre de Sante Urban', 'CSU'),
    ('Guinea', 'Centre de Sante Rural', 'CSR'),
]
additions = pd.DataFrame(
    additional_rows, columns=['Country', 'Type', 'Abbreviation']
).assign(count=np.nan)

# Append the manual additions to the type dictionary. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` gives the same result (row labels of both
# frames are kept as they are).
type_dict = pd.concat([type_dict, additions])

In [284]:
# Order the dictionary by country, then by facility type.
type_dict = type_dict.sort_values(by=['Country', 'Type'])

In [285]:
# Per country, compare the number of distinct facility types with the number
# of distinct abbreviations; countries where the two differ are displayed
# (an empty result means the counts agree everywhere).
tmp = type_dict.groupby('Country').agg(
    unique_type=('Type', 'nunique'),
    unique_abbrev=('Abbreviation', 'nunique'),
)
counts_differ = tmp['unique_type'] != tmp['unique_abbrev']
tmp.loc[counts_differ]

Unnamed: 0_level_0,unique_type,unique_abbrev
Country,Unnamed: 1_level_1,Unnamed: 2_level_1


In [286]:
# Export the finished type dictionary as CSV (row labels are not written).
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Cleaned"
filename = "type_dict_1028.csv"
path = os.path.join(saveDir, filename)
type_dict.to_csv(path_or_buf=path, index=False)

## ISS Cleaning

### clean_name_final

Use facility type and abbreviations in the type dictionary as keywords and remove type information from `clean_name` to create the `clean_name_final` column.

Sometimes the letters of an abbreviation are separated by whitespace (e.g. `C S` instead of `CS`). For two-letter abbreviations, such occurrences at the start or end of a name are collapsed first.

In [287]:
# obtain abbreviations of length 2
tmp = type_dict[type_dict['Abbreviation'].str.len()==2]['Abbreviation'].unique()

# For every two-letter abbreviation (e.g. CS) build the patterns '^C S ' and
# ' C S$' and replace a match with the unspaced abbreviation, ignoring case.
# The patterns use regex anchors, so `regex=True` is passed explicitly (pandas
# 2.0 made `regex=False` the default of `Series.str.replace`).
# NOTE(review): the abbreviation is inserted into the pattern unescaped; this
# assumes two-letter abbreviations contain no regex metacharacters.
tmp_dict = {}
names_fixed = df_iss['clean_name']
for t in tmp:
    spaced = ' '.join(t)
    tmp_dict[t] = ['^' + spaced + ' ', ' ' + spaced + '$']
    leading, trailing = tmp_dict[t]
    names_fixed = names_fixed\
        .str.replace(leading, t + ' ', case=False, regex=True)\
        .str.replace(trailing, ' ' + t, case=False, regex=True)
df_iss['clean_name'] = names_fixed

Remove type information.

In [288]:
# Build `clean_name_final` per country by removing type information from
# `clean_name`, using that country's entries of the type dictionary.
# Also records the keyword lists in `type_keywords_dict` / `abb_keywords_dict`.
df_grouped = df_iss.groupby('Country')
type_keywords_dict = {}
abb_keywords_dict = {}
# cleaned groups are collected here and concatenated once after the loop
cleaned_groups = []

for group_name, df_group in df_grouped:
    # obtain the type dictionary for that country
    tmp = type_dict[type_dict['Country'].str.upper()==group_name]

    # facility types for that country
    types = list(tmp['Type'])
    type_keywords = set()
    for t in types:
        # add the facility type as it is
        type_keywords.add(t)

        # add individual words as well; skip words that contain non-alphabetic
        # characters or have length <= 3 (e.g. de, (major))
        words = t.replace('/', ' ').split(' ')
        type_keywords.update(w for w in words if w.isalpha() and len(w) > 3)

    # sort in descending length so that longer keywords are tried first
    type_keywords = sorted(type_keywords, key=lambda s: -len(s))
    type_keywords_dict[group_name] = type_keywords

    # abbreviations for that country
    abbrevs = set(tmp['Abbreviation'])

    abb_keywords = []
    for abbrev in abbrevs:
        # e.g. for CS, possible matches are names that start with CS, end with
        # CS, contain ' CS ' in between, or consist of CS only
        abb_keywords.extend(['^'+abbrev+r'\s', r'\s'+abbrev+r'\s', r'\s'+abbrev+'$',
                             '^'+abbrev+'$'])

    # obtain the list of abbreviation keywords and sort in descending length
    abb_keywords = sorted(abb_keywords, key=lambda s: -len(s))
    abb_keywords_dict[group_name] = abb_keywords

    # NOTE(review): keywords are joined into a regex without escaping, so
    # characters such as '(' in a type name act as regex syntax. Kept as is.
    names = df_group['clean_name']
    # Skip the replacement when there are no keywords: an empty pattern matches
    # between every character, and replacing it with ' ' would insert a space
    # between all characters of the name.
    if type_keywords:
        names = names.str.replace('|'.join(type_keywords), '', case=False, regex=True)
    if abb_keywords:
        names = names.str.replace('|'.join(abb_keywords), ' ', case=False, regex=True)

    # drop the connecting words de/do/da, collapse double spaces, title-case
    names = names\
        .str.strip()\
        .str.replace(r'^de | de |^do | do |^da | da ', ' ', case=False, regex=True)\
        .str.strip()\
        .str.replace("  ", " ", regex=False)\
        .str.title()

    # `assign` returns a new frame instead of writing into a groupby slice
    cleaned_groups.append(df_group.assign(clean_name_final=names))

# one concat instead of growing `res` inside the loop
res = pd.concat(cleaned_groups) if cleaned_groups else pd.DataFrame()

In [289]:
# (number of rows, number of columns) of the concatenated per-country result
res.shape

(305182, 47)

### extract_type
Extract the facility type information: the words of `clean_name` that were removed to obtain `clean_name_final`.

In [290]:
# The extracted type of a row is made of the words of `clean_name` that do not
# appear in `clean_name_final` (both compared in upper case). Rows whose name
# was not changed by the cleaning get NaN.
extract_types = []

for raw_name, final_name in zip(res['clean_name'], res['clean_name_final']):
    clean_name = raw_name.upper()
    clean_name_final = final_name.upper()

    if clean_name == clean_name_final:
        extract_types.append(np.nan)
        continue

    # set membership instead of scanning a list for every word
    name_keywords = set(clean_name_final.split())
    type_keywords = [w for w in clean_name.split() if w not in name_keywords]
    extract_types.append(' '.join(type_keywords))

# object dtype keeps the `.str` accessor usable even if every value is NaN
res['extract_type'] = np.array(extract_types, dtype=object)

# Drop leading/trailing connecting words (de/do/da), collapse double spaces and
# turn empty results into NaN. The anchored patterns need `regex=True`, which is
# no longer the default of `Series.str.replace` since pandas 2.0.
res['extract_type'] = res['extract_type'].str.strip()\
.str.replace(r'^de |^do |^da | de$| do$| da$', ' ', case=False, regex=True)\
.str.strip()\
.str.replace(r'^de$|^do$|^da$', '', case=False, regex=True)\
.str.replace("  ", " ", regex=False)\
.str.strip()\
.replace('',np.nan)

In [291]:
# Share of rows for which no type could be extracted, as a percentage.
n_missing = res['extract_type'].isna().sum()
n_rows = res.shape[0]
print("Percentage of NA in extract type column:", round(n_missing/n_rows, 3)*100)

Percentage of NA in extract type column: 17.0


Examine percentage of NA values in `extract_type` by country.

In [292]:
# Per country: number of rows without an extracted type (#NA), number of rows
# (N) and their ratio, sorted by the ratio in descending order.
# The rows are collected in a list and turned into a frame once, instead of
# concatenating one-row frames onto an empty DataFrame inside the loop.
df_grouped = res.groupby('Country')
na_rows = [
    [ctr, df_group['extract_type'].isna().sum(), df_group.shape[0]]
    for ctr, df_group in df_grouped
]
na_summ = pd.DataFrame(na_rows, columns=['Country','#NA','N'])
na_summ['pertcentage_NA'] = na_summ['#NA']/na_summ['N'] 
na_summ = na_summ.sort_values(['pertcentage_NA'], ascending=False)
na_summ.reset_index(inplace=True,drop=True)
# show the ten countries with the highest NA share among those with > 1000 rows
na_summ[na_summ['N']>1000].iloc[:10,:]

Unnamed: 0,Country,#NA,N,pertcentage_NA
0,MADAGASCAR,3648,3657,0.997539
3,MALAWI,1583,2152,0.735595
5,NIGER,1288,2010,0.640796
6,CHAD,5515,9900,0.557071
8,SOUTH SUDAN,2542,5279,0.481531
9,CENTRAL AFRICAN REPUBLIC,790,1729,0.456912
10,LIBERIA,1710,3877,0.441063
11,ETHIOPIA,1031,2414,0.427092
12,GABON,1451,3501,0.414453
14,GUINEA BISSAU,524,1393,0.376167


In [299]:
#type_dict[type_dict['Country'].str.upper()=='GUINEA']

In [300]:
# Columns to look at side by side when spot-checking the cleaning result.
cols = ['Country', 'name_of_facility_visited', 'clean_name', 
        'clean_name_final', 'extract_type', 'type_of_facility_visited']
# Row filters used during manual inspection (commented out, not executed):
#[(res['Country']=='MADAGASCAR')]
#(res['clean_name'].str.contains('^CPHC', na=False))
#res\
#[cols].sample(10)

Observations after examining randomly sampled rows where `extract_type` is NA:

Type information is not contained in `name_of_facility_visited`:
- MADAGASCAR
- MALAWI
- NIGER
- CHAD
- SOUTH SUDAN
- CENTRAL AFRICAN REPUBLIC
- LIBERIA
- ETHIOPIA (also, many names are OTHERS)
- GUINEA BISSAU (some names have a prefix, but the type cannot be inferred from it)

Type information is present but does not correspond well to the type dictionary:
- GABON

### sub_type

In [301]:
# Map every extracted type to the closest entry of the country's type
# dictionary (facility types and abbreviations) using fuzzy string matching.
# `sub_type` is always a facility type: a matched abbreviation is translated to
# the first type carrying it. `score` is the similarity score of the match.
df_grouped = res.groupby('Country')
sub_types = []
scores = []
for group_name, df_group in df_grouped:
    tmp = type_dict[type_dict['Country'].str.upper()==group_name]
    types = list(tmp['Type'])
    abbrevs = list(tmp['Abbreviation'])

    # Loop-invariant per country: built once instead of once per row.
    candidates = types + abbrevs
    abbrev_to_type = {}
    for facility_type, abbrev in zip(types, abbrevs):
        # keep the first type listed for an abbreviation
        abbrev_to_type.setdefault(abbrev, facility_type)

    # The same extracted type occurs on many rows; match each distinct value
    # once per country and reuse the result.
    matched = {}
    for extract_type in df_group['extract_type']:
        # No extracted type, or no dictionary entries for this country
        # (extractOne returns None for an empty candidate list).
        if pd.isna(extract_type) or not candidates:
            sub_types.append(np.nan)
            scores.append(np.nan)
            continue

        if extract_type not in matched:
            match, score = process.extractOne(extract_type, candidates,
                                              scorer = fuzz.ratio)
            matched[extract_type] = (abbrev_to_type.get(match, match), score)

        sub_type, score = matched[extract_type]
        sub_types.append(sub_type)
        scores.append(score)

# Positional assignment: this relies on the rows of `res` already being ordered
# by country group, so that iterating the groups visits the rows in row order.
res['sub_type'] = sub_types
res['score'] = scores

In [156]:
#type_dict[type_dict['Country'].str.upper()=='SENEGAL']

In [302]:
# Distribution of the fuzzy-match scores (rows without an extracted type have
# no score and are not counted).
res['score'].describe()

count    253429.000000
mean         97.178444
std           9.254319
min          33.000000
25%         100.000000
50%         100.000000
75%         100.000000
max         100.000000
Name: score, dtype: float64

In [303]:
# Inspect ten random rows whose best fuzzy match scored below 80.
cols = ['Country', 'name_of_facility_visited',
        'clean_name_final', 'extract_type', 'sub_type', 'score']
low_score = res['score'] < 80
res.loc[low_score, cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,extract_type,sub_type,score
134301,NIGERIA,Rano Dawaki Health clinic,Rano Dawaki,HEALTH CLINIC,Health Facility,71.0
88975,NIGERIA,Dakwak Memorial Medical Clinic,Dakwak Memorial,MEDICAL CLINIC,Medical Centre,71.0
250561,NIGERIA,Makada Health Clinic,Makada,HEALTH CLINIC,Health Facility,71.0
2024,ANGOLA,Hospital municipal do NZETO,Nzeto,HOSPITAL MUNICIPAL,Hospital,62.0
297240,TANZANIA,NYAMAGANA HOSPITAL - DISTRICT HOSPITAL,Nyamagana,HOSPITAL DISTRICT HOSPITAL,District Hospital,79.0
268853,NIGERIA,Unguwar Sambo Health Clinic,Unguwar Sambo,HEALTH CLINIC,Health Facility,71.0
262679,NIGERIA,General hosp.Gezawa,Hospgezawa,GENERAL,General Hospital,61.0
51823,MALI,Centre de Santé de Référence,Sante Reference,CENTRE DE,Referral Health Centre,43.0
163886,NIGERIA,Maternity& child welfare clinic,Maternity& Welfare,CHILD CLINIC,Clinic,67.0
289241,SOUTH AFRICA,Tygerberg hospital,Tygerberg,HOSPITAL,District Hospital,64.0


In [304]:
# Randomly sample up to 20 rows for each country for inspection.
# - `min(20, len(df_group))` keeps countries with fewer than 20 rows from
#   raising in `sample`;
# - a fixed `random_state` makes the exported sample reproducible;
# - the samples are concatenated once instead of growing `p` inside the loop.
df_grouped = res.groupby('Country')
cols = ['Country', 'name_of_facility_visited', 'clean_name', 'clean_name_final', 
        'extract_type', 'sub_type', 'type_of_facility_visited']

sampled_groups = [
    df_group[cols].sample(n=min(20, len(df_group)), random_state=0)
    for group_name, df_group in df_grouped
]
p = pd.concat(sampled_groups) if sampled_groups else pd.DataFrame()

In [305]:
# Export the per-country sample; the path is built with os.path.join, like the
# type dictionary export, instead of concatenating "//" by hand.
p.to_csv(os.path.join(saveDir, "clean_names_types_sampled_1028.csv"), index=False)

In [306]:
# Export the cleaned names and types of every ISS row. `index` is the column
# created by `df_iss.reset_index`. The path is built with os.path.join, like the
# type dictionary export, instead of concatenating "//" by hand.
cols = ['index', 'Country', 'clean_name', 'clean_name_final',
        'extract_type', 'sub_type']
res[cols].to_csv(os.path.join(saveDir, "clean_names_types_full_1028.csv"), index=False)