In [76]:
import numpy as np
import pandas as pd
import geopandas as gpd
import fiona
import os
import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from ordered_set import OrderedSet

In [77]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('mode.chained_assignment', None)

In [78]:
# data import
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Sub-Saharan_health_facilities"
folder = "Sub-Saharan_health_facilities.gdb"
path = os.path.join(dataDir, folder)
df_iss = gpd.read_file(path, driver='FileGDB', 
                       layer='ISS_sub_saharan')
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Cleaned"
type_dict = pd.read_csv(saveDir + "//type_dict_1109.csv")
# get the index, for mapping processed data to original dataset
df_iss.reset_index(inplace=True)

# `clean_name`

Pre-cleaning on `name_of_facility_visited`:

- remove punctuations, change & to and
- correct spelling of common words
- replace double whitespaces with one and strip extra whitespaces
- remove accent marks

In [79]:
# pre-cleaning
df_iss['clean_name'] = df_iss['name_of_facility_visited'].str.strip()\
        .str.replace("  ", " ")\
        .str.replace('.', ' ')\
        .str.replace(':', ' ')\
        .str.replace("'", ' ')\
        .str.replace('"', ' ')\
        .str.replace('[-_,/\(\)]', ' ')\
        .str.replace('&', ' and ')\
        .str.strip()\
        .str.replace('center', 'centre', case=False)\
        .str.replace('Clinique', 'Clinic', case=False)\
        .str.replace('Polyclinique', 'Polyclinic', case=False)\
        .str.replace('Geral', 'General', case=False)\
        .str.replace('Dispensaire', 'Dispensary', case=False)\
        .str.replace('Hôpital', 'Hospital', case=False)\
        .str.replace('Hopital', 'Hospital', case=False)\
        .str.replace('Hospitais', 'Hospital', case=False)\
        .str.replace(' Hosp | hosp$', ' Hospital ', case=False)\
        .str.replace("Urbain", "Urban", case=False)\
        .str.replace("Distrital", "District", case=False)\
        .str.replace("  ", " ")\
        .str.strip()

# remove accent marks
df_iss['clean_name'] = [unidecode.unidecode(n) for n in df_iss['clean_name']]

In [80]:
# Ensure punctuations at the beginning are removed
df_iss[['name_of_facility_visited','clean_name']].sort_values('name_of_facility_visited').head(2)

Unnamed: 0,name_of_facility_visited,clean_name
181818,"""MCH Gwio Kura",MCH Gwio Kura
277705,'CM la Paix,CM la Paix


# `clean_name_final`

Use facility type and abbreviations in the type dictionary as keywords and remove type information from `clean_name` to create the `clean_name_final` column.

Sometimes there is a whitespace between abbreviation, e.g. C S. Corrections are made accordingly.

In [81]:
# obtain abbreviations of length 2 or 3
tmp = type_dict[type_dict['Abbreviation'].str.len()<=3]['Abbreviation'].unique()
tmp = sorted(tmp, key=len, reverse=True)
# change it to the pattern '^c s ' or 'c s$'
tmp_dict = {}
for t in tmp:
    tmp_dict[t] = ['^'+' '.join(list(t))+' ', ' '+' '.join(list(t))+'$']
# replace the pattern with 'cs'
for t in tmp:
    pats = tmp_dict[t]
    df_iss['clean_name'] = df_iss['clean_name'].str.replace(pats[0], t+' ',case=False)\
    .str.replace(pats[1], ' '+t, case=False)

Remove type information.

In [82]:
# re-encode country to match that in HDX_who data 
ctr_dict = {"CENTRAL_AFRICAN_REPUBLIC":'CENTRAL AFRICAN REPUBLIC',
            "GUINEA-BISSAU": 'GUINEA BISSAU',
            "RÉPUBLIQUE DÉMOCRATIQUE DU CONGO":'DEMOCRATIC REPUBLIC OF THE CONGO',
            "SIERRA":'SIERRA LEONE', 'SWAZILAND':'ESWATINI'}

df_iss['Country'] = [ctr_dict[c] if c in ctr_dict.keys() else c for c in df_iss['countries']]

In [83]:
# extract_type
df_grouped = df_iss.groupby('Country')
res = pd.DataFrame()
type_keywords_dict = {}
abb_keywords_dict = {}

for group_name, df_group in df_grouped:
    # obtain the type dictionary for that country
    tmp = type_dict[type_dict['Country'].str.upper()==group_name]
    
    # facility types for that country
    types = list(tmp['Type'])
    type_keywords = set()
    for t in types:
        # add the full facility type 
        t = t.title()
        type_keywords.add(t)                 

        # add individual words as well
        t = t.replace('/', ' ')
        words = t.split(' ')
        # skip words that have punctuation and have length <= 3 (e.g. de, (major))
        words = [w for w in words if w.isalpha() and len(w)>3]
        for w in words:
            type_keywords.add(w)

    # obtain the list of type keywords and sort in descending length
    type_keywords = list(type_keywords)
    type_keywords = sorted(type_keywords, key=lambda s: -len(s))
    type_keywords_dict[group_name] = type_keywords

    # abbreviations for that country
    abbrevs = set(tmp['Abbreviation'])
        
    abb_keywords = []
    for abbrev in abbrevs:
        # e.g. for CS, 4 patterns are considered: '^CS ', ' CS ', ' CS$', '^CS$'
        abbrev = abbrev.title()
        abb_keywords.extend(['^'+abbrev+'\s', '\s'+abbrev+'\s', '\s'+abbrev+'$',
                            '^'+abbrev+'$'])
        
    # obtain the list of abbreviation keywords and sort in descending length
    abb_keywords = sorted(abb_keywords, key=lambda s: -len(s))  
    abb_keywords_dict[group_name] = abb_keywords

    # some country-specific adjustments
    if group_name == 'UGANDA':
        df_group['clean_name'] = df_group['clean_name'].str.replace("HC II$", "HCII", case=False)\
        .str.replace("HC III$", "HCIII", case=False)\
        .str.replace("HC IV$", "HCIV", case=False)
        
    if group_name == 'MALAWI':
        df_group['clean_name'] = df_group['clean_name'].str.replace(" DHO$", " DH", case=False)

    if group_name == "ERITREA":
        df_group['clean_name'] = df_group['clean_name'].str.replace(" HO$", " HOSP", case=False)
        
    if group_name == 'MADAGASCAR':
        df_group['clean_name'] = df_group['clean_name'].str.replace("csb 1", " csb1", case=False)
        df_group['clean_name'] = df_group['clean_name'].str.replace("csb 2", " csb2", case=False)

    # handle situations when type is 'Hospital District' in the type dictionary 
    # but name column has 'District Hospital' in ISS data
    type_len_2 = [t for t in type_keywords if len(t.split())==2]
    for t in type_len_2:
        df_group['clean_name'] = df_group['clean_name'].str.title()\
        .str.replace(' '.join(t.split()[::-1]), t, case=False)
                           
    # replace double whitespaces with a single space
    df_group['clean_name_final'] = df_group['clean_name'].str.title()\
        .str.replace('|'.join(type_keywords), '')\
        .str.replace('|'.join(abb_keywords), '')\
        .str.strip()\
        .str.replace('^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$', 
                     '', case=False)\
        .str.replace("  ", " ")\
        .str.strip()\
        .str.title()
    res = pd.concat([res, df_group])

In [84]:
res.shape

(305182, 45)

In [85]:
# randomly sample rows to examine results
cols = ['Country', 'name_of_facility_visited', 'clean_name', 
        'clean_name_final', 'type_of_facility_visited']
res[cols].sample(5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,type_of_facility_visited
185062,NIGERIA,HC DORAYI,Hc Dorayi,Dorayi,PHC_CENTER
287708,SIERRA LEONE,Nianyahun,Nianyahun,Nianyahun,PHC_CENTER
112000,NIGERIA,Bara health clinic,Bara Health Clinic,Bara,PHC_CENTER
167047,NIGERIA,PHCC Salasa,Phcc Salasa,Salasa,PHC_CENTER
52336,MALI,Csref de Sefeto,Csref De Sefeto,Sefeto,CSI


# `extract_type`

Extract facility type information by removing `clean_name_final` from `clean_name`.

In [86]:
extract_types = []

for idx, row in res.iterrows():
    clean_name = row['clean_name'].upper()
    clean_name_final = row['clean_name_final'].upper()
    
    # if clean_name_final is exactly the same as clean_name,
    # this indicates no type information can be extracted, thus append NA
    if clean_name.upper() == clean_name_final.upper():
        extract_types.append(np.nan)
    
    else:
        clean_name = OrderedSet(clean_name.split())
        clean_name_final = OrderedSet(clean_name_final.split())
        # find the difference between two names
        extract_type = ' '.join(list(clean_name.difference(clean_name_final)))
        extract_types.append(extract_type.strip())

# remove de, do, da, du at start or end of extract_type
# replace empty string with NA
res['extract_type'] = extract_types
res['extract_type'] = res['extract_type'].str.strip()\
.str.replace("  ", " ")\
.str.replace('^de |^do |^da |^du | du$| de$| do$| da$|^de$|^do$|^da$|^du$', '', case=False)\
.str.replace('^de |^do |^da |^du | du$| de$| do$| da$|^de$|^do$|^da$|^du$', '', case=False)\
.str.strip()\
.str.title()\
.replace('',np.nan)

In [87]:
# randomly sample rows to examine results
cols = ['Country', 'name_of_facility_visited', 'clean_name', 
        'clean_name_final', 'extract_type', 'type_of_facility_visited']
res[cols].sample(5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,extract_type,type_of_facility_visited
219141,NIGERIA,HC KQMANZO,Hc Kqmanzo,Kqmanzo,Hc,PHC_CENTER
162326,NIGERIA,Lawan Barma,Lawan Barma,Lawan Barma,,PATENT_MED_VENDORS
36127,GHANA,Tatindo clinic,Tatindo Clinic,Tatindo,Clinic,PRIVATE_FACILITY
24380,CHAD,Doba ACT,Doba Act,Doba Act,,OTHER_NON_ORTHORDOX_HC
193894,NIGERIA,PHC TUKUR TUKUR,Phc Tukur Tukur,Tukur Tukur,Phc,PHC_CENTER


In [88]:
print("Percentage of NA in extract type column:",
     round(res['extract_type'].isna().sum()/res.shape[0]*100,1))
print("Number of NA values in extract type column:", res[pd.isna(res['extract_type'])].shape[0])

Percentage of NA in extract type column: 16.2
Number of NA values in extract type column: 49547


# `sub_type`

Use `extract_type` to map the type information extracted from the name column to one of the types in the type dictionary.

In [89]:
df_grouped = res.groupby('Country')
res_sub_type = pd.DataFrame()
for country_name in res['Country'].unique():
    df_group = res[res['Country']==country_name]
    # obtain facility types and abbreviations for that country
    tmp = type_dict[type_dict['Country'].str.upper()==country_name]
    types = tmp['Type']
    abbrevs = tmp['Abbreviation']
    sub_types = []
    scores = []
    
    for idx, row in df_group.iterrows():
        # if extract_type is NA, just append NA
        if not isinstance(row['extract_type'],str):
            sub_types.append(np.nan)
            scores.append(np.nan)
        
        # find best match
        else:
            match, score = process.extractOne(row['extract_type'], list(types)+list(abbrevs), 
                                           scorer = fuzz.ratio)
            scores.append(score)
            # if best match is abbreviation, map it to the corresponding type
            if match in list(abbrevs):
                sub_type = tmp[tmp['Abbreviation']==match]['Type'].iloc[0]
                sub_types.append(sub_type)
            else:
                sub_types.append(match) 
    df_group['sub_type'] = sub_types
    df_group['score'] = scores
    res_sub_type = pd.concat([res_sub_type, df_group])

In [90]:
res_sub_type.shape

(305182, 48)

In [91]:
# replace empty string with NA
res_sub_type['clean_name_final'].replace('', np.nan, inplace=True)

In [92]:
res_sub_type['score'].describe()

count    255635.000000
mean         97.124603
std           9.920280
min          29.000000
25%         100.000000
50%         100.000000
75%         100.000000
max         100.000000
Name: score, dtype: float64

In [93]:
# randomly sample rows to examine results
cols = ['Country', 'name_of_facility_visited',
        'clean_name_final', 'extract_type', 'sub_type', 'score', 'type_of_facility_visited']
res_sub_type[cols].sample(5)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,extract_type,sub_type,score,type_of_facility_visited
255531,NIGERIA,Doro PHC,Doro,Phc,Primary Health Centre,100.0,PHC_CENTER
287319,SIERRA LEONE,MI Room Murray town CHP,Mi Room Murray Town,Chp,Community Health Post,100.0,PHC_CENTER
232703,NIGERIA,OTHER,Other,,,,PHC_CENTER
40034,KENYA,EAST LAIKIPIA DISPENSARY,East Laikipia,Dispensary,Dispensary,100.0,DISPENSARY
6793,CAMEROON,Hopital Mada,Mada,Hospital,Hospital,100.0,DISTRICT_HOSP


In [94]:
# randomly sample 20 rows for each country for inspection
df_grouped = res_sub_type.groupby('Country')
p = pd.DataFrame()
cols = ['Country', 'name_of_facility_visited', 'clean_name', 'clean_name_final', 
        'extract_type', 'sub_type', 'type_of_facility_visited']
for group_name, df_group in df_grouped:
    df_tmp = df_group[cols].sample(20)
    p = pd.concat([p, df_tmp])

In [95]:
p.to_csv(saveDir+"//clean_names_types_sampled_1109.csv", index=False)

In [96]:
# export results
cols= ['index', 'Country', 'clean_name', 'clean_name_final',
       'extract_type', 'sub_type']
res_sub_type[cols].to_csv(saveDir+"//clean_names_types_1109.csv", index=False)