In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import fiona
import os
import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from ordered_set import OrderedSet
pd.set_option('mode.chained_assignment', None)

In [2]:
# import data
# NOTE: both directories below are machine-specific; adjust them before running elsewhere
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\DRC"
df = gpd.read_file(os.path.join(dataDir, "DRC_HF_acasus_iss.gdb"), driver='FileGDB',
                   layer='DRC_Acasus_health_facility_all_merged')

# turn the index into an 'index' column, for mapping processed data to original dataset
df = df.reset_index()

# import type dictionary
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Cleaned"
TYPE_DICT = pd.read_csv(os.path.join(saveDir, "type_dict_1109.csv"))

Note: the values in the dataset's country column must match the country names in the type dictionary (the comparison ignores case).

In [4]:
# list the country names available in the type dictionary
print("Country names in the type dictionary:", TYPE_DICT['Country'].unique(), sep="\n")

Country names in the type dictionary:
['Angola' 'Benin' 'Botswana' 'Burkina Faso' 'Burundi' 'Cameroon'
 'Cape Verde' 'Central African Republic' 'Chad' 'Comoros' 'Congo'
 "Cote d'Ivoire" 'Democratic Republic of the Congo' 'Djibouti'
 'Equatorial Guinea' 'Eritrea' 'Ethiopia' 'Gabon' 'Gambia' 'Ghana'
 'Guinea' 'Guinea Bissau' 'Kenya' 'Lesotho' 'Liberia' 'Madagascar'
 'Malawi' 'Mali' 'Mauritania' 'Mauritius' 'Mozambique' 'Namibia' 'Niger'
 'Nigeria' 'Rwanda' 'Sao Tome and Principe' 'Senegal' 'Seychelles'
 'Sierra Leone' 'Somalia' 'South Africa' 'South Sudan' 'Sudan' 'Tanzania'
 'Togo' 'Uganda' 'Zambia' 'Zanzibar' 'Zimbabwe' 'eSwatini']


In [11]:
# ---- INPUT ----
FACILITY_NAME = "face_name1"  # column holding the raw facility name
COUNTRY = "Country"           # column holding the country name

# ---- OUTPUT ----
# names of the columns created by the cleaning functions
CLEAN_NAME = "clean_name"              # facility name after pre-cleaning
CLEAN_NAME_FINAL = "clean_name_final"  # clean name with type information removed
EXTRACT_TYPE = "type_extract"          # type information extracted from the name
SUB_TYPE = "sub_type"                  # extracted type mapped to the type dictionary
SCORE = "score"                        # fuzzy match score of 'type_extract' against the type dictionary

# CSV file the cleaned results are written to
SAVE_PATH = "cleaned_results.csv"

# Define functions

## `clean_name`

Pre-cleaning on facility name:

- remove punctuation and change `&` to `and`
- correct the spelling of common words
- collapse double spaces into one and strip leading/trailing whitespace
- remove accent marks

In [12]:
def preclean(df, facility_name = FACILITY_NAME, clean_name = CLEAN_NAME):
    """Pre-clean facility names and store the result in ``df[clean_name]``.

    The frame is modified in place and nothing is returned.

    Steps, in order:
      1. replace punctuation (``. : ' " - _ , / ( )``) with a space and
         ``&`` with `` and ``;
      2. collapse runs of whitespace into a single space and strip the ends;
      3. apply the case-insensitive spelling corrections listed below;
      4. collapse whitespace again and strip;
      5. remove accent marks with ``unidecode``.

    Parameters
    ----------
    df : DataFrame
        Frame holding the raw facility names.
    facility_name : str
        Name of the column with the raw facility names.
    clean_name : str
        Name of the column that receives the pre-cleaned names.
    """
    # (pattern, replacement) pairs, applied in order as case-insensitive
    # regular expressions. The spelling corrections match anywhere in the
    # name, including inside longer words.
    corrections = [
        ('center', 'centre'),
        # must run before 'Clinique': that rule also matches inside
        # 'Polyclinique' and would leave 'PolyClinic' behind
        ('Polyclinique', 'Polyclinic'),
        ('Clinique', 'Clinic'),
        ('Geral', 'General'),
        ('Dispensaire', 'Dispensary'),
        ('Hôpital', 'Hospital'),
        ('Hopital', 'Hospital'),
        ('Hospitais', 'Hospital'),
        # stand-alone abbreviation in the middle or at the end of a name
        (' Hosp | hosp$', ' Hospital '),
        ('Urbain', 'Urban'),
        ('Distrital', 'District'),
    ]

    # `regex` is passed explicitly everywhere: its default changed in
    # pandas 2.0, which would silently turn the patterns into literals.
    names = df[facility_name].str.strip()\
        .str.replace(r'''[.:'"\-_,/()]''', ' ', regex=True)\
        .str.replace('&', ' and ', regex=False)\
        .str.replace(r'\s+', ' ', regex=True)\
        .str.strip()

    for pattern, replacement in corrections:
        names = names.str.replace(pattern, replacement, case=False, regex=True)

    # the replacements above can leave runs of spaces behind
    names = names.str.replace(r'\s+', ' ', regex=True).str.strip()

    # remove accent marks; missing (non-string) names are left untouched
    df[clean_name] = [unidecode.unidecode(n) if isinstance(n, str) else n
                      for n in names]

## `clean_name_final`

Use facility type and abbreviations in the type dictionary as keywords and remove type information from `clean_name` to create the `clean_name_final` column.

In [13]:
def remove_type_info(df, type_dict, clean_name, clean_name_final, country):
    """Strip facility-type keywords from ``df[clean_name]``.

    For every country in ``df`` the facility types and abbreviations listed
    for that country in ``type_dict`` are used as keywords; they are removed
    from the (title-cased) clean name and the result is stored in the
    ``clean_name_final`` column.

    Parameters
    ----------
    df : DataFrame
        Frame with the pre-cleaned names and a country column.
    type_dict : DataFrame
        Type dictionary with the columns 'Country', 'Type' and 'Abbreviation'.
    clean_name : str
        Column holding the pre-cleaned facility name.
    clean_name_final : str
        Column that receives the name with type information removed.
    country : str
        Column holding the country name; it is compared with
        ``type_dict['Country']`` ignoring case.

    Returns
    -------
    DataFrame
        The per-country groups concatenated in the order produced by
        ``groupby``, with ``clean_name_final`` added. Rows whose country is
        missing belong to no group and are therefore absent from the result.
        An empty, column-less frame is returned when there are no groups.

    Notes
    -----
    ``df[clean_name]`` is updated in place by the first step (joining
    spaced-out abbreviations). The later per-country adjustments are applied
    to copies and are only visible in the returned frame.
    """
    # --- join spaced-out abbreviations of length <= 3, e.g. change C S to CS ---
    short_abbrevs = type_dict[type_dict['Abbreviation'].str.len()<=3]['Abbreviation'].unique()
    # sort by decreasing length
    for abbrev in sorted(short_abbrevs, key=len, reverse=True):
        spaced = ' '.join(list(abbrev))
        # only at the start ('^c s ') or at the end (' c s$') of the name
        df[clean_name] = df[clean_name]\
            .str.replace('^'+spaced+' ', abbrev+' ', case=False, regex=True)\
            .str.replace(' '+spaced+'$', ' '+abbrev, case=False, regex=True)

    # --- remove type information, country by country ---
    cleaned_groups = []
    for group_name, df_group in df.groupby(country):
        # work on a copy so the adjustments below never write into a view of df
        df_group = df_group.copy()
        country_key = group_name.upper()

        # type dictionary entries for that country
        country_dict = type_dict[type_dict['Country'].str.upper()==country_key]

        # facility types for that country (missing entries are skipped)
        type_keywords = set()
        for t in country_dict['Type'].dropna():
            # add the full facility type
            t = t.title()
            type_keywords.add(t)
            # add individual words as well, skipping words that contain
            # punctuation / numbers or have length <= 3 (e.g. de, (major))
            words = t.replace('/', ' ').split(' ')
            type_keywords.update(w for w in words if w.isalpha() and len(w)>3)
        # longest keywords first so they win in the regex alternation
        type_keywords = sorted(type_keywords, key=lambda s: -len(s))

        # abbreviations for that country (missing entries are skipped)
        abb_keywords = []
        for abbrev in set(country_dict['Abbreviation'].dropna()):
            # e.g. for CS, 4 patterns are considered: '^CS ', ' CS ', ' CS$', '^CS$'
            abbrev = abbrev.title()
            abb_keywords.extend(['^'+abbrev+r'\s', r'\s'+abbrev+r'\s',
                                 r'\s'+abbrev+'$', '^'+abbrev+'$'])
        abb_keywords = sorted(abb_keywords, key=lambda s: -len(s))

        # some country-specific adjustments; the country is compared in upper
        # case, consistent with the dictionary lookup above
        if country_key == 'UGANDA':
            df_group[clean_name] = df_group[clean_name]\
                .str.replace("HC II$", "HCII", case=False, regex=True)\
                .str.replace("HC III$", "HCIII", case=False, regex=True)\
                .str.replace("HC IV$", "HCIV", case=False, regex=True)

        if country_key == 'MALAWI':
            df_group[clean_name] = df_group[clean_name]\
                .str.replace(" DHO$", " DH", case=False, regex=True)

        if country_key == 'ERITREA':
            df_group[clean_name] = df_group[clean_name]\
                .str.replace(" HO$", " HOSP", case=False, regex=True)

        if country_key == 'MADAGASCAR':
            df_group[clean_name] = df_group[clean_name]\
                .str.replace("csb 1", " csb1", case=False, regex=True)\
                .str.replace("csb 2", " csb2", case=False, regex=True)

        # handle situations when type is 'Hospital District' in the type dictionary
        # but name column has 'District Hospital' in ISS data
        for t in [k for k in type_keywords if len(k.split())==2]:
            df_group[clean_name] = df_group[clean_name].str.title()\
                .str.replace(' '.join(t.split()[::-1]), t, case=False, regex=True)

        # remove type information using the keywords generated above.
        # An empty keyword list is skipped: joining it gives the pattern '',
        # which matches between every character and would mangle the names
        # of countries that have no entry in the type dictionary.
        # NOTE(review): keywords match inside longer words too, e.g. 'Hospital'
        # inside 'Hospitalier' leaves 'Ier' behind - confirm whether
        # word-boundary matching is wanted.
        final_name = df_group[clean_name].str.title()
        if type_keywords:
            final_name = final_name.str.replace('|'.join(type_keywords), '', regex=True)
        if abb_keywords:
            final_name = final_name.str.replace('|'.join(abb_keywords), ' ', regex=True)

        # remove meaningless connecting words like de, do, da, du
        df_group[clean_name_final] = final_name.str.strip()\
            .str.replace('^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$',
                         ' ', case=False, regex=True)\
            .str.replace("  ", " ", regex=False)\
            .str.strip()\
            .str.title()
        cleaned_groups.append(df_group)

    # concatenate once instead of growing a frame inside the loop
    return pd.concat(cleaned_groups) if cleaned_groups else pd.DataFrame()

## `extract_type`

Extract facility type information by removing `clean_name_final` from `clean_name`.

In [14]:
def extract_type(df, clean_name, clean_name_final, extract_type):
    """Extract type information as the words of ``clean_name`` missing from ``clean_name_final``.

    Both names are compared in upper case. The words of the clean name that
    do not occur in the final name are joined (in their original order, each
    word once) and stored, title-cased, in ``df[extract_type]``. The frame is
    modified in place and nothing is returned.

    Parameters
    ----------
    df : DataFrame
        Frame holding both name columns.
    clean_name : str
        Column with the pre-cleaned facility name.
    clean_name_final : str
        Column with the name after type information was removed. Empty
        strings in this column are replaced with NA.
    extract_type : str
        Column that receives the extracted type; NA when both names are
        identical, when either name is missing, or when nothing is left
        after cleaning.
    """
    # connecting words de, do, da, du at the start or end of the extracted type
    connectors = '^de |^do |^da |^du | du$| de$| do$| da$|^de$|^do$|^da$|^du$'

    extracted = []
    for _, row in df.iterrows():
        name, name_final = row[clean_name], row[clean_name_final]

        # a missing name cannot be compared
        if not isinstance(name, str) or not isinstance(name_final, str):
            extracted.append(np.nan)
            continue

        name, name_final = name.upper(), name_final.upper()
        # if clean_name_final is exactly the same as clean_name,
        # no type information can be extracted, thus append NA
        if name == name_final:
            extracted.append(np.nan)
            continue

        final_words = set(name_final.split())
        # dict.fromkeys de-duplicates while keeping first-seen order
        leftover = [w for w in dict.fromkeys(name.split()) if w not in final_words]
        extracted.append(' '.join(leftover))

    # object dtype keeps the .str accessor usable even when every value is NA
    types = pd.Series(extracted, index=df.index, dtype=object)
    types = types.str.strip().str.replace("  ", " ", regex=False)
    # applied twice, as in the original, so that a second connector exposed
    # by the first pass is removed as well
    for _ in range(2):
        types = types.str.replace(connectors, '', case=False, regex=True)

    # replace empty string with NA
    df[extract_type] = types.str.strip().str.title().replace('', np.nan)
    df[clean_name_final] = df[clean_name_final].replace('', np.nan)

## `sub_type`

Use `extract_type` to map the type information extracted from the name column to one of the types in the type dictionary.

In [15]:
def map_type(df, country, extract_type, sub_type, score, type_dict):
    """Map the extracted type of every row to a type in the type dictionary.

    For each country in ``df`` the extracted type is fuzzy-matched
    (``fuzz.ratio``) against the types and abbreviations listed for that
    country in ``type_dict``. When the best match is an abbreviation, the
    first type listed for that abbreviation is used instead.

    Parameters
    ----------
    df : DataFrame
        Frame holding the country and extracted-type columns.
    country : str
        Column holding the country name; it is compared with
        ``type_dict['Country']`` ignoring case. Values are expected to be
        strings.
    extract_type : str
        Column with the extracted type; non-string values (NA) are not matched.
    sub_type : str
        Column that receives the matched type.
    score : str
        Column that receives the match score.
    type_dict : DataFrame
        Type dictionary with the columns 'Country', 'Type' and 'Abbreviation'.

    Returns
    -------
    DataFrame
        The per-country groups concatenated in order of first appearance,
        with ``sub_type`` and ``score`` added. Both are NA when the extracted
        type is missing or the country has no entry in the type dictionary.
        An empty, column-less frame is returned when ``df`` has no rows.
    """
    mapped_groups = []
    for country_name in df[country].unique():
        df_group = df[df[country]==country_name].copy()

        # facility types and abbreviations for that country, without missing entries
        country_dict = type_dict[type_dict['Country'].str.upper()==country_name.upper()]
        candidates = list(country_dict['Type'].dropna()) + list(country_dict['Abbreviation'].dropna())

        # abbreviation -> first type listed for it
        abbrev_to_type = {}
        for type_name, abbrev in zip(country_dict['Type'], country_dict['Abbreviation']):
            if isinstance(abbrev, str):
                abbrev_to_type.setdefault(abbrev, type_name)

        sub_types = []
        scores = []
        for _, row in df_group.iterrows():
            query = row[extract_type]
            # nothing to match (NA), or nothing to match against: the matcher
            # returns None for an empty candidate list, which cannot be unpacked
            if not isinstance(query, str) or not candidates:
                sub_types.append(np.nan)
                scores.append(np.nan)
                continue

            # find best match
            match, match_score = process.extractOne(query, candidates,
                                                    scorer = fuzz.ratio)
            scores.append(match_score)
            # if best match is an abbreviation, map it to the corresponding type
            sub_types.append(abbrev_to_type.get(match, match))

        df_group[sub_type] = sub_types
        df_group[score] = scores
        mapped_groups.append(df_group)

    # concatenate once instead of growing a frame inside the loop
    return pd.concat(mapped_groups) if mapped_groups else pd.DataFrame()

In [16]:
def export_results(df, save_path):
    """Write the results to ``save_path`` as CSV, without the index.

    The 'index' column of ``df`` is renamed to 'index_original' in place
    before writing; it can be used to map results to the original dataset.
    """
    renamed_columns = {'index': 'index_original'}
    df.rename(columns=renamed_columns, inplace=True)
    df.to_csv(save_path, index=False)

# Apply cleaning functions

In [17]:
# pre-clean the facility names; adds the CLEAN_NAME column to df
preclean(df, FACILITY_NAME, CLEAN_NAME)

In [19]:
# strip facility-type keywords from the pre-cleaned names
res = remove_type_info(
    df, TYPE_DICT, CLEAN_NAME, CLEAN_NAME_FINAL, COUNTRY,
)

In [22]:
# derive the extracted facility type; adds the EXTRACT_TYPE column to res
extract_type(res, CLEAN_NAME, CLEAN_NAME_FINAL, EXTRACT_TYPE)

In [24]:
# share and count of rows for which no type information could be extracted
na_mask = res[EXTRACT_TYPE].isna()
print("Percentage of NA in extract type column:",
      round(na_mask.sum()/res.shape[0]*100, 1))
print("Number of NA values in extract type column:", res[na_mask].shape[0])

Percentage of NA in extract type column: 2.1
Number of NA values in extract type column: 811


In [27]:
# match each extracted type against the type dictionary
res = map_type(res, COUNTRY, EXTRACT_TYPE, SUB_TYPE, SCORE, TYPE_DICT)

In [34]:
# distribution of the fuzzy-match scores
print("Summary statistics of match score:")
score_summary = res[SCORE].describe()
score_summary

Summary statistics of match score:


count    37468.000000
mean        99.085406
std          4.396509
min         49.000000
25%        100.000000
50%        100.000000
75%        100.000000
max        100.000000
Name: score, dtype: float64

In [33]:
# randomly sample rows to examine results
# a fixed seed keeps the displayed sample reproducible across re-runs, and
# the sample size is capped so small result sets do not raise
cols = [COUNTRY, FACILITY_NAME, CLEAN_NAME, CLEAN_NAME_FINAL, EXTRACT_TYPE,
        SUB_TYPE, SCORE]
res[cols].sample(n=min(5, len(res)), random_state=0)

Unnamed: 0,Country,face_name1,clean_name,clean_name_final,type_extract,sub_type,type,score
37051,Democratic Republic of the Congo,ks Muanyika Centre de Santé de Référence,Ks Muanyika Centre De Sante De Reference,Ks Muanyika,Centre De Sante Reference,Centre de Sante de Reference,Centre de Sante de Reference,94.0
18850,Democratic Republic of the Congo,kn Menkao Centre de Santé,Kn Menkao Centre De Sante,Kn Menkao,Centre De Sante,Centre de Sante,Centre de Sante,100.0
21928,Democratic Republic of the Congo,it Lea Centre de Santé,It Lea Centre De Sante,It Lea,Centre De Sante,Centre de Sante,Centre de Sante,100.0
27234,Democratic Republic of the Congo,hk Consolate Centre de Santé,Hk Consolate Centre De Sante,Hk Consolate,Centre De Sante,Centre de Sante,Centre de Sante,100.0
20723,Democratic Republic of the Congo,kn ONATRA/Kalamu1 Centre Hospitalier,Kn Onatra Kalamu1 Centre Hospitalier,Kn Onatra Kalamu1 Ier,Centre Hospitalier,Centre Hospital,Centre Hospitalier,91.0


In [117]:
# write the cleaned results to SAVE_PATH
export_results(res, SAVE_PATH)