In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import fiona
import os
import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from ordered_set import OrderedSet

In [2]:
# Display every row and column of a frame, and switch off pandas'
# chained-assignment warning for the rest of the notebook.
for option_name in ('display.max_columns', 'display.max_rows', 'mode.chained_assignment'):
    pd.set_option(option_name, None)

# Use HDX_WHO data

In [3]:
# Data import: both layers live in the same file geodatabase.
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Sub-Saharan_health_facilities"
folder = "Sub-Saharan_health_facilities.gdb"
path = os.path.join(dataDir, folder)

# HDX/WHO facility layer
who_layer = 'HDX_WHO_sub_saharan_health_facilities'
df_who = gpd.read_file(path, driver='FileGDB', layer=who_layer)

# ISS layer; the original row index is kept as a column so that processed
# rows can be mapped back to the original dataset
iss_layer = 'ISS_sub_saharan'
df_iss = gpd.read_file(path, driver='FileGDB', layer=iss_layer).reset_index()

In [4]:
# One row per (country, facility type) with the number of facilities of that
# type; the WHO column `Facility_t` is exposed as `Type`.
type_dict = (
    df_who.groupby(['Country', 'Facility_t'])
    .size()
    .reset_index(name='count')
    .rename(columns={"Facility_t": 'Type'})
)

In [5]:
# Map inconsistent spellings found in the `Type` column to one canonical form.
type_corr_dict = {
    "Unites de Santé de village": "Unites de Santé de Village",
    "Centre Médico-social": "Centre Médico-Social",
    "Poste de santé": "Poste de Santé",
    "Regional hospital": "Hôpital de Regional",
    "Poste De Santé": "Poste de Santé",
    "Hospital Medical Center": "Hospital Medical Centre",
    "Health post": "Health Post",
    "DISPENSARY": "Dispensary",
    "Natonal Hospital": "National Hospital",
    "Level 1 Hospital": "Level I Hospital",
    "Level 2 Hospital": "Level II Hospital",
    "Level 3 Hospital": "Level III Hospital",
    "Primary Health Care Unit +": "Primary Health Care Unit Plus",
    "General Hospital Hospital": "General Hospital",
    "Health Centre (major)": "Health Centre",
    "Health Centre (minor)": "Health Centre",
}
# values without an entry in the mapping are kept unchanged
type_dict['Type'] = type_dict['Type'].map(lambda t: type_corr_dict.get(t, t))

In [6]:
# Transliterate accented characters in every facility type to plain ASCII.
type_dict['Type'] = type_dict['Type'].map(unidecode.unidecode)

In [7]:
# Make the spelling of the same word consistent across languages/variants.
# The same pre-cleaning will be applied to the facility name column in ISS data.
spelling_fixes = [
    ('Center', 'Centre'),
    ('Clinique', 'Clinic'),
    ('Polyclinique', 'Polyclinic'),
    ('Geral', 'General'),
    ('Dispensaire', 'Dispensary'),
    ('Hopital', 'Hospital'),
    ('Hospitais', 'Hospital'),
    ("Urbain", "Urban"),
    ("Distrital", "District"),
    ('&', 'and'),
]
harmonised = type_dict['Type']
# the substitutions are applied one after another, in the order listed
for wrong, right in spelling_fixes:
    harmonised = harmonised.str.replace(wrong, right)
type_dict['Type'] = harmonised.str.strip()

In [8]:
# Obtain the abbreviation by keeping the uppercase ASCII letters only.
# `regex=True` is passed explicitly: the pattern is a character class, and
# pandas >= 2.0 treats `str.replace` patterns as literal text by default.
type_dict['Abbreviation'] = type_dict['Type'].str.replace(r'[^A-Z]', '', regex=True)

In [9]:
# Types made of a single word would abbreviate to a single letter; give them
# a readable abbreviation instead. Only exact matches on `Type` are replaced.
convert_dict = {'Hospital':'HOSP', 'Clinic':'CLINIC',  
                "Polyclinic":"PCLINIC", "Dispensary":"DISP"}
override = type_dict['Type'].map(convert_dict)
# keep the existing abbreviation wherever no override applies
type_dict['Abbreviation'] = override.where(override.notna(), type_dict['Abbreviation'])

In [10]:
# Put the columns into their final order.
column_order = ['Country', 'Type', 'Abbreviation', 'count']
type_dict = type_dict[column_order]

In [11]:
# Country-specific abbreviation overrides, keyed by country and then by type.
convert_dict = {
    # correct duplicate abbreviations
    "eSwatini": {"Clinic with Maternity": "CWM", "Clinic without Maternity": "CWOM",
                 "Referral Hospital": "RFH", "Regional Hospital": "RGH"},
    "Somalia": {"Referral Hospital": "RFH", "Regional Hospital": "RGH"},
    "Malawi": {"Central Hospital": "CEH", "Community Hospital": "COH"},
    "Gabon": {"Hospital Cooperation": "HOSPC"},
    "Cameroon": {"Hospital Centraux": "HOSPC"},
    # corrections made based on examining ISS data
    "Mali": {"Community Health Centre": "CSCOM",
             "Referral Health Centre": "CSREF"},
    "Niger": {"Integrated Health Centre": "CSI"},
    "Equatorial Guinea": {"Regional Hospital": "HR",
                          "District Hospital": "HD"},
}
# use the override when the (country, type) pair has one, otherwise keep the
# abbreviation that is already there
type_dict['Abbreviation'] = [
    convert_dict.get(country, {}).get(facility_type, current)
    for country, facility_type, current in zip(type_dict['Country'],
                                               type_dict['Type'],
                                               type_dict['Abbreviation'])
]

In [12]:
# Rows that became identical after the corrections are merged and their
# counts summed.
type_dict = (
    type_dict.groupby(['Country', 'Type', 'Abbreviation'])
    .agg(count=('count', 'sum'))
    .reset_index()
)

# Identify additional prefix/suffix from ISS data

In [13]:
# ISS country labels that differ from the (upper-cased) HDX_WHO country names.
ctr_dict = {
    "CENTRAL_AFRICAN_REPUBLIC": 'CENTRAL AFRICAN REPUBLIC',
    "GUINEA-BISSAU": 'GUINEA BISSAU',
    "RÉPUBLIQUE DÉMOCRATIQUE DU CONGO": 'DEMOCRATIC REPUBLIC OF THE CONGO',
    "SIERRA": 'SIERRA LEONE',
    'SWAZILAND': 'ESWATINI',
}

# labels without an entry in the mapping are kept as they are
df_iss['Country'] = df_iss['countries'].map(lambda c: ctr_dict.get(c, c))

In [14]:
# Print every ISS country that has no counterpart in the HDX_WHO data
# (nothing is printed when all ISS countries are covered).
iss_c = df_iss['Country'].unique()
who_c = [name.upper() for name in df_who['Country'].unique()]
uncovered = [c for c in iss_c if c not in who_c]
for c in uncovered:
    print(c)

Criteria to select prefix/suffix

- the name contains more than one token after splitting on whitespace
- the prefix/suffix 
    - has a length between 2 and 4 characters (inclusive)
    - contains only alphabetic characters
    - is neither DR nor ST
    - does not contain any vowels
    - appears more than 100 times within a country

## Pre-cleaning on `name_of_facility_visited`

In [15]:
# Pre-cleaning of the facility name.
#
# Every `str.replace` call states `regex=` explicitly: pandas >= 2.0 treats
# patterns as literal text by default, which would silently disable the
# punctuation character class and the whitespace collapsing below.

# punctuation that is replaced by a single space: . : ' " - _ , / ( )
punctuation_pattern = r"""[.:'"\-_,/()]"""

# (pattern, replacement) pairs applied case-insensitively, in this order;
# the same harmonisation is applied to the facility types
spelling_fixes = [
    ('center', 'centre'),
    ('Clinique', 'Clinic'),
    ('Polyclinique', 'Polyclinic'),
    ('Geral', 'General'),
    ('Dispensaire', 'Dispensary'),
    ('Hôpital', 'Hospital'),
    ('Hopital', 'Hospital'),
    ('Hospitais', 'Hospital'),
    (' Hosp ', ' Hospital '),
    ("Urbain", "Urban"),
    ("Distrital", "District"),
]

cleaned = (
    df_iss['name_of_facility_visited']
    .str.strip()
    # collapse whitespace runs of any length (a plain "  " -> " " replacement
    # only halves a run and leaves e.g. two spaces behind from four)
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(punctuation_pattern, ' ', regex=True)
    .str.replace('&', ' and ', regex=False)
    .str.strip()
)
for pattern, replacement in spelling_fixes:
    cleaned = cleaned.str.replace(pattern, replacement, case=False, regex=True)
# removing punctuation can leave several consecutive spaces behind
df_iss['clean_name'] = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()

# remove accent marks
df_iss['clean_name'] = [unidecode.unidecode(n) for n in df_iss['clean_name']]

## Identify prefix/suffix

In [16]:
# Split each cleaned name on whitespace: `tmp` holds one list of words per row
# and is read by the prefix and suffix cells below.
tmp = df_iss['clean_name'].str.split()

In [17]:
# prefix
vowels = set('AEIOU')


def _prefix_or_nan(words):
    """Return the upper-cased first word if it looks like a type abbreviation, else NaN.

    The name must have at least two words, and the first word must be 2-4
    characters long, purely alphabetic, different from ST and DR, and free of
    vowels.
    """
    if len(words) < 2:
        return np.nan
    first = words[0]
    if not (1 < len(first) < 5) or not first.isalpha():
        return np.nan
    candidate = first.upper()
    if candidate in ('ST', 'DR') or not vowels.isdisjoint(candidate):
        return np.nan
    return candidate


df_iss['prefix'] = [_prefix_or_nan(t) for t in tmp]

In [18]:
# Number of facility names carrying each prefix, per country, ordered by
# country and then by descending count.
prefix_counts = (
    df_iss.groupby(['Country', 'prefix'])
    .agg(count=('name_of_facility_visited', 'count'))
    .sort_values(['Country', 'count'], ascending=[True, False])
)
# keep only prefixes that appear more than 100 times
prefix_dict = prefix_counts.loc[prefix_counts['count'] > 100].reset_index()

In [19]:
# suffix
vowels = set('AEIOU')


def _suffix_or_nan(words):
    """Return the upper-cased last word if it looks like a type abbreviation, else NaN.

    The name must have at least two words, and the last word must be 2-4
    characters long, purely alphabetic, different from ST and DR, and free of
    vowels.
    """
    if len(words) < 2:
        return np.nan
    last = words[-1]
    if not (1 < len(last) < 5) or not last.isalpha():
        return np.nan
    candidate = last.upper()
    if candidate in ('ST', 'DR') or not vowels.isdisjoint(candidate):
        return np.nan
    return candidate


df_iss['suffix'] = [_suffix_or_nan(t) for t in tmp]

In [20]:
# Number of facility names carrying each suffix, per country, ordered by
# country and then by descending count.
suffix_counts = (
    df_iss.groupby(['Country', 'suffix'])
    .agg(count=('name_of_facility_visited', 'count'))
    .sort_values(['Country', 'count'], ascending=[True, False])
)
# keep only suffixes that appear more than 100 times
suffix_dict = suffix_counts.loc[suffix_counts['count'] > 100].reset_index()

In [21]:
# Build a dictionary {country: set of frequent prefixes and suffixes} holding
# the potentially additional abbreviations.
add_abbrev = {}
for counts, column in ((prefix_dict, 'prefix'), (suffix_dict, 'suffix')):
    for country, token in zip(counts['Country'], counts[column]):
        # create the country's set on first sight, then add the token to it
        add_abbrev.setdefault(country, set()).add(token)

In [22]:
# Build a dictionary of the abbreviations identified from ISS data that are
# not yet present in the type dictionary of the same country.
new = {}
for group_name in df_iss['Country'].unique():
    if group_name not in add_abbrev:
        continue
    same_country = type_dict['Country'].str.upper() == group_name
    known = set(type_dict.loc[same_country, 'Abbreviation'])
    unseen = {p for p in add_abbrev[group_name] if p not in known}
    # countries without any unseen abbreviation get no entry
    if unseen:
        new[group_name] = unseen

print("New abbreviations added to the type dictionary:")
for ctr, found in new.items():
    print(ctr, found)

New abbreviations added to the type dictionary:
BENIN {'CS'}
BURUNDI {'CDS'}
CAMEROON {'CS'}
CHAD {'CS'}
GUINEA {'CS'}
GUINEA BISSAU {'CS'}
MOZAMBIQUE {'CS'}
NIGERIA {'MCH', 'CPHC', 'PMS', 'PMV', 'MDG', 'NKST', 'FSP', 'MDGS', 'TH', 'MCHC', 'PHCC', 'HF', 'WCWC'}


## Infer type and append to the type dictionary

In [23]:
# Infer the facility type behind each new abbreviation and prepare the rows
# to add to the type dictionary, as (country, type, abbreviation) records.
_inferred_rows = [
    ('Benin', 'Centre de Sante', 'CS'),
    ('Burundi', 'Centre de Sante', 'CDS'),
    ('Cameroon', 'Centre de Sante', 'CS'),
    ('Chad', 'Centre de Sante', 'CS'),
    ('Guinea', 'Centre de Sante', 'CS'),
    ('Guinea Bissau', 'Centre de Sante', 'CS'),
    ('Guinea Bissau', 'Centro de Saude', 'CS2'),
    ('Mozambique', 'Centro de Saude', 'CS'),
    ('Nigeria', 'Traditional Spirit Healer', 'TH'),
    ('Nigeria', 'Maternal and Child Health', 'MCH'),
    ('Nigeria', 'Health Facility', 'HF'),
    ('Nigeria', 'Family Support Program', 'FSP'),
    ('Nigeria', 'Millennium Development Goal', 'MDG'),
    ('Nigeria', 'Maternal and Child Health Centre', 'MCHC'),
    ('Nigeria', 'Comprehensive Primary Health Care', 'CPHC'),
    ('Nigeria', 'Primary Health Care Centre', 'PHCC'),
    ('Nigeria', 'Patent Medicine Vendor', 'PMV'),
    ('Nigeria', 'Patent Medicine Store', 'PMS'),
    # obtained by examining the ISS data
    ('Central African Republic', 'Hospital District', 'HD'),
    ('Liberia', 'General Hospital', 'CH'),
    ('Guinea', 'Centre de Sante Urban', 'CSU'),
    ('Guinea', 'Centre de Sante Rural', 'CSR'),
]
additions = pd.DataFrame(_inferred_rows, columns=['Country', 'Type', 'Abbreviation'])
# these rows were added by hand, so no facility count is available
additions['count'] = np.nan

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result and is what the later cells of this notebook use.
type_dict = pd.concat([type_dict, additions])

In [24]:
# Order the dictionary by country, then by facility type.
type_dict = type_dict.sort_values(by=['Country', 'Type'])

In [25]:
# For each country, compare the number of distinct facility types with the
# number of distinct abbreviations; countries where the two differ are shown.
tmp = (
    type_dict.groupby('Country')[['Type', 'Abbreviation']]
    .nunique()
    .rename(columns={'Type': 'unique_type', 'Abbreviation': 'unique_abbrev'})
)
tmp.loc[tmp['unique_type'] != tmp['unique_abbrev']]

Unnamed: 0_level_0,unique_type,unique_abbrev
Country,Unnamed: 1_level_1,Unnamed: 2_level_1


# Make further improvement/adjustment based on ISS Cleaning

## `clean_name_final`

Use facility type and abbreviations in the type dictionary as keywords and remove type information from `clean_name` to create the `clean_name_final` column.

Sometimes an abbreviation is written with whitespace between its letters, e.g. `C S` instead of `CS`. These cases are corrected first.

In [26]:
# Obtain the abbreviations of length 2 or 3, longest first, so that a
# three-letter abbreviation is joined before any two-letter one.
# (Single-letter abbreviations have nothing to join and are left out.)
abbrev_lengths = type_dict['Abbreviation'].str.len()
short_abbrevs = sorted(
    type_dict.loc[abbrev_lengths.between(2, 3), 'Abbreviation'].unique(),
    key=len, reverse=True)

for abbrev in short_abbrevs:
    # 'CS' -> 'C S'; only matched at the very start or very end of the name
    spaced = ' '.join(abbrev)
    # regex=True is required for the ^/$ anchors: pandas >= 2.0 treats
    # patterns as literal text by default
    df_iss['clean_name'] = (
        df_iss['clean_name']
        .str.replace('^' + spaced + ' ', abbrev + ' ', case=False, regex=True)
        .str.replace(' ' + spaced + '$', ' ' + abbrev, case=False, regex=True)
    )

Remove type information.

In [27]:
# Remove facility-type information from `clean_name`, country by country,
# using the types and abbreviations of that country's type dictionary.
# All patterns are passed with an explicit `regex=True` (pandas >= 2.0 treats
# `str.replace` patterns as literal text by default).
df_grouped = df_iss.groupby('Country')
type_keywords_dict = {}   # country -> type keywords, longest first
abb_keywords_dict = {}    # country -> abbreviation patterns, longest first
cleaned_groups = []

# stand-alone particles (de/do/da/du) left behind once a type has been removed
particle_pattern = '^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$'

for group_name, df_group in df_grouped:
    # work on a copy so that the columns added below never touch df_iss
    df_group = df_group.copy()

    # obtain the type dictionary for that country
    tmp = type_dict[type_dict['Country'].str.upper() == group_name]

    # facility types for that country, title-cased like the names they are
    # matched against
    type_keywords = set()
    for t in tmp['Type']:
        t = t.title()
        type_keywords.add(t)
        # add individual words as well, skipping words that contain
        # punctuation or have length <= 3 (e.g. de, (major))
        for w in t.replace('/', ' ').split(' '):
            if w.isalpha() and len(w) > 3:
                type_keywords.add(w)

    # longest keywords first, so full type names win over single words
    type_keywords = sorted(type_keywords, key=lambda s: -len(s))
    type_keywords_dict[group_name] = type_keywords

    # abbreviations for that country: e.g. for CS, match names that start
    # with, end with, consist of, or contain ' CS ' in between
    abb_keywords = []
    for abbrev in set(tmp['Abbreviation']):
        abbrev = abbrev.title()
        abb_keywords.extend(['^' + abbrev + r'\s', r'\s' + abbrev + r'\s',
                             r'\s' + abbrev + '$', '^' + abbrev + '$'])
    abb_keywords = sorted(abb_keywords, key=lambda s: -len(s))
    abb_keywords_dict[group_name] = abb_keywords

    if group_name == 'UGANDA':
        # join the level suffix to "HC" so it survives as one abbreviation
        df_group['clean_name'] = df_group['clean_name']\
            .str.replace("HC II$", "HCII", regex=True)\
            .str.replace("HC III$", "HCIII", regex=True)\
            .str.replace("HC IV$", "HCIV", regex=True)

    stripped = df_group['clean_name'].str.title()
    # An empty alternation is the empty regex, which matches between every
    # character; for the abbreviations that would insert a space everywhere.
    if type_keywords:
        stripped = stripped.str.replace('|'.join(type_keywords), '', regex=True)
    if abb_keywords:
        stripped = stripped.str.replace('|'.join(abb_keywords), ' ', regex=True)

    df_group['clean_name_final'] = (
        stripped.str.strip()
        .str.replace(particle_pattern, ' ', case=False, regex=True)
        .str.strip()
        # replace double whitespaces with a single space
        .str.replace("  ", " ", regex=False)
        .str.title()
    )
    cleaned_groups.append(df_group)

# concatenate once instead of growing `res` inside the loop
res = pd.concat(cleaned_groups) if cleaned_groups else pd.DataFrame()

In [28]:
# (rows, columns) of `res`, the concatenation of the per-country frames
res.shape

(305182, 47)

In [29]:
# Draw five random rows to eyeball the cleaning result.
preview_cols = ['Country', 'name_of_facility_visited', 'clean_name',
                'clean_name_final', 'type_of_facility_visited']
res.loc[:, preview_cols].sample(n=5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,type_of_facility_visited
281255,DEMOCRATIC REPUBLIC OF THE CONGO,Bankoko,Bankoko,Bankoko,CSI
233841,NIGERIA,OTHER,OTHER,Other,PHC_CENTER
243285,NIGERIA,Bulala patient medicine vendor,Bulala patient medicine vendor,Bulala Patient,PATENT_MED_VENDORS
273438,DEMOCRATIC REPUBLIC OF THE CONGO,Boso mbifa,Boso mbifa,Boso Mbifa,CSI
6116,CAMEROON,HD Poli,HD Poli,Poli,DISTRICT_HOSP


## `extract_type`

Extract facility type information by removing `clean_name_final` from `clean_name`.

In [30]:
# Extract the type information as the words of `clean_name` that no longer
# appear in `clean_name_final` (compared in upper case, original word order).
extract_types = []

# zip over the two columns instead of iterrows(): same values, far cheaper
for raw_name, final_name in zip(res['clean_name'], res['clean_name_final']):
    raw_upper = raw_name.upper()
    final_upper = final_name.upper()

    if raw_upper == final_upper:
        # nothing was removed, so there is no type information
        extract_types.append(np.nan)
        continue

    removed = OrderedSet(raw_upper.split()).difference(OrderedSet(final_upper.split()))
    extract_types.append(' '.join(removed).strip())

res['extract_type'] = extract_types
# Drop stand-alone particles (de/do/da/du), tidy the whitespace and turn empty
# strings into NaN. regex=True is explicit because pandas >= 2.0 treats
# `str.replace` patterns as literal text by default.
res['extract_type'] = res['extract_type'].str.strip()\
.str.replace('^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$', 
            ' ', case=False, regex=True)\
.str.replace("  ", " ", regex=False)\
.str.strip()\
.replace('',np.nan)

In [31]:
# Draw five random rows to eyeball the extracted type.
preview_cols = ['Country', 'name_of_facility_visited', 'clean_name',
                'clean_name_final', 'extract_type', 'type_of_facility_visited']
res.loc[:, preview_cols].sample(n=5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,extract_type,type_of_facility_visited
26125,CONGO,Csi soeur martin,Csi soeur martin,Soeur Martin,CSI,FAITH_BASED
57573,MALI,CSCOM de Lafiabougou,CSCOM de Lafiabougou,Lafiabougou,CSCOM,CSI
224765,NIGERIA,PHCC Watinane,PHCC Watinane,Watinane,PHCC,PHC_CENTER
123107,NIGERIA,HC KAKIDARE,HC KAKIDARE,Kakidare,HC,PHC_CENTER
25716,CONGO,CSI de Dongou,CSI de Dongou,Dongou,CSI,CSI


In [32]:
# Share of rows for which no type could be extracted, in percent.
na_share = res['extract_type'].isna().sum() / res.shape[0] * 100
print("Percentage of NA in extract type column:", round(na_share, 1))

Percentage of NA in extract type column: 16.8


### Examining NA values in `extract_type`

Randomly sample 10 rows from each country and examine the relevant columns.

In [33]:
# Rows for which no type information could be extracted.
na = res[res['extract_type'].isna()]
print("Number of NA values:", len(na))

Number of NA values: 51405


In [34]:
# Sample up to 10 NA rows per country (fixed seed) for manual inspection.
inspect_cols = ['Country', 'name_of_facility_visited', 'clean_name',
                'clean_name_final', 'extract_type', 'type_of_facility_visited']
# start from an empty frame so the result is a DataFrame even without rows
sampled_parts = [pd.DataFrame()]
for c in na['Country'].unique():
    # relevant columns for that country only
    country_rows = na.loc[na['Country'] == c, inspect_cols]
    if len(country_rows) > 10:
        sampled_parts.append(country_rows.sample(10, random_state=0))
    else:
        # 10 rows or fewer: take the whole subset
        sampled_parts.append(country_rows)
na_sampled = pd.concat(sampled_parts)

In [35]:
# How many countries have NA rows, and how many rows were sampled in total.
n_countries = na['Country'].nunique()
n_sampled = len(na_sampled)
print("Number of unique countries:", n_countries)
print("Number of points sampled:", n_sampled)

Number of unique countries: 42
Number of points sampled: 402


In [36]:
# Output folder for cleaned data; `saveDir` is reused by the final export cell.
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Cleaned"
filename="na_sampled_1109.csv"
path = os.path.join(saveDir, filename)
# write the sampled NA rows (index included) for manual examination
na_sampled.to_csv(path)

The sampled rows with NA in `extract_type` are then manually examined to check whether additional rows could be added to the type dictionary. 

In [37]:
# Additions to the type dictionary after examining sampled NA rows.
# Types are spelled with "Centre": the facility names they are matched against
# have every "center" rewritten to "centre" during pre-cleaning, so a type
# spelled "Center" could never match.
type_dict_add = pd.DataFrame([["Equatorial Guinea", "Centre de Sante", "CS", np.nan],
                              ["Guinea Bissau", "Centro Materno Infantil", "CMI", np.nan],
                              ["Lesotho", "Hospital", "HOSP", np.nan],
                              ["Madagascar", "Basic Health Centre I", "CSB1", np.nan],
                              ["Madagascar", "Basic Health Centre II", "CSB2", np.nan],
                              ["Madagascar", "District Hospital", "CHD1", np.nan],
                              ['Mauritania', "Centre de Sante", "CS", np.nan],
                              ['Mauritania', "Poste de Sante", "PS", np.nan],                   
                              ["Mauritius", "Social Welfare Centre", "SWC", np.nan],                            
                              ["Senegal", "Centre de Sante", "CS", np.nan],
                              ["South Africa", "Hospital", "HOSP", np.nan]],
                            columns = ['Country', 'Type', 'Abbreviation', 'count'])

In [38]:
# append the manually identified rows to the type dictionary
type_dict = pd.concat([type_dict,type_dict_add])

Use the additions to the type dictionary to create `clean_name_final` and `extract_type` for rows previously with NA in `extract_type`.

In [39]:
# Second cleaning pass over the rows without an extracted type, using only the
# rows newly added to the type dictionary (`type_dict_add`).
# All patterns are passed with an explicit `regex=True` (pandas >= 2.0 treats
# `str.replace` patterns as literal text by default).
df_grouped = na.groupby('Country')
add_type_keywords_dict = {}   # country -> additional type keywords
add_abb_keywords_dict = {}    # country -> additional abbreviation patterns
na_frames = []

# countries that received new dictionary rows (computed once, not per group)
updated_countries = set(type_dict_add['Country'].str.upper())

# stand-alone particles (de/do/da/du) left behind once a type has been removed
particle_pattern = '^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$'

for group_name, df_group in df_grouped:

    if group_name not in updated_countries:
        # nothing new for this country: keep its rows unchanged
        na_frames.append(df_group)
        continue

    # work on a copy so that the assignment below never touches `na`
    df_group = df_group.copy()

    # obtain the additional type dictionary rows for that country
    tmp = type_dict_add[type_dict_add['Country'].str.upper() == group_name]

    # facility types for that country, title-cased like the names they are
    # matched against
    type_keywords = set()
    for t in tmp['Type']:
        t = t.title()
        type_keywords.add(t)
        # add individual words as well, skipping words that contain
        # punctuation or have length <= 3 (e.g. de, (major)); same threshold
        # as in the first cleaning pass
        for w in t.replace('/', ' ').split(' '):
            if w.isalpha() and len(w) > 3:
                type_keywords.add(w)

    # longest keywords first, so full type names win over single words
    type_keywords = sorted(type_keywords, key=lambda s: -len(s))
    add_type_keywords_dict[group_name] = type_keywords

    # abbreviations for that country: e.g. for CS, match names that start
    # with, end with, consist of, or contain ' CS ' in between
    abb_keywords = []
    for abbrev in set(tmp['Abbreviation']):
        abbrev = abbrev.title()
        abb_keywords.extend(['^' + abbrev + r'\s', r'\s' + abbrev + r'\s',
                             r'\s' + abbrev + '$', '^' + abbrev + '$'])
    abb_keywords = sorted(abb_keywords, key=lambda s: -len(s))
    # store under the *additional* dictionary; writing to `abb_keywords_dict`
    # would overwrite the first-pass patterns of this country
    add_abb_keywords_dict[group_name] = abb_keywords

    df_group['clean_name_final'] = (
        df_group['clean_name'].str.title()
        .str.replace('|'.join(type_keywords), '', regex=True)
        .str.replace('|'.join(abb_keywords), ' ', regex=True)
        .str.strip()
        .str.replace(particle_pattern, ' ', case=False, regex=True)
        .str.strip()
        # replace double whitespaces with a single space
        .str.replace("  ", " ", regex=False)
        .str.title()
    )
    na_frames.append(df_group)

# concatenate once instead of growing `na_res` inside the loop
na_res = pd.concat(na_frames) if na_frames else pd.DataFrame()

In [40]:
# Re-derive `extract_type` for the previously NA rows: the words of
# `clean_name` that no longer appear in `clean_name_final` (compared in upper
# case, original word order).
extract_types = []

# zip over the two columns instead of iterrows(): same values, far cheaper
for raw_name, final_name in zip(na_res['clean_name'], na_res['clean_name_final']):
    raw_upper = raw_name.upper()
    final_upper = final_name.upper()

    if raw_upper == final_upper:
        # nothing was removed, so there is still no type information
        extract_types.append(np.nan)
        continue

    removed = OrderedSet(raw_upper.split()).difference(OrderedSet(final_upper.split()))
    extract_types.append(' '.join(removed).strip())

na_res['extract_type'] = extract_types
# Drop stand-alone particles (de/do/da/du), tidy the whitespace and turn empty
# strings into NaN. regex=True is explicit because pandas >= 2.0 treats
# `str.replace` patterns as literal text by default.
na_res['extract_type'] = na_res['extract_type'].str.strip()\
.str.replace('^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$', 
              ' ', case=False, regex=True)\
.str.replace("  ", " ", regex=False)\
.str.strip()\
.replace('',np.nan)

In [41]:
# Compare the number of rows without a type before and after the second pass.
na_before = na.shape[0]
na_after = na_res['extract_type'].isna().sum()
print("Number of NA values before:", na_before)
print("Number of NA values after:", na_after)

Number of NA values before: 51405
Number of NA values after: 51013


In [42]:
# Draw five random rows among those where the second pass managed to extract
# type information.
preview_cols = ['Country', 'name_of_facility_visited', 'clean_name',
                'clean_name_final', 'extract_type', 'type_of_facility_visited']
na_res.loc[na_res['extract_type'].notna(), preview_cols].sample(5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,extract_type,type_of_facility_visited
47617,MADAGASCAR,CHD1 FANDANA,CHD1 FANDANA,Fandana,CHD1,DISTRICT_HOSP
46516,MADAGASCAR,CHD1 SAKARAHA,CHD1 SAKARAHA,Sakaraha,CHD1,DISTRICT_HOSP
61461,MAURITANIA,PS MBEDIA SAKHA,PS MBEDIA SAKHA,Mbedia Sakha,PS,CSI
27122,EQUATORIAL GUINEA,CS Moca,CS Moca,Moca,CS,CSI
61551,MAURITANIA,CS TAYARETT,CS TAYARETT,Tayarett,CS,CSI


In [43]:
# Rejoin: the re-processed (previously NA) rows first, followed by the rows
# that already had an extracted type.
already_typed = res[res['extract_type'].notna()]
res = pd.concat([na_res, already_typed])

Examine randomly sampled rows of top 10 countries with the highest number of NAs / highest percentage of NAs in `extract_type` to make further additions.

In [44]:
# Per country: number of rows without an extracted type (#NA), total number of
# rows (N) and their ratio, ordered by descending #NA.
na_flags = res['extract_type'].isna()
per_country = na_flags.groupby(res['Country'])
na_summ = pd.DataFrame({'#NA': per_country.sum(),
                        'N': per_country.size()}).reset_index()
na_summ['pertcentage_NA'] = na_summ['#NA'] / na_summ['N']
na_summ = na_summ.sort_values(['#NA'], ascending=False).reset_index(drop=True)
na_summ.iloc[:10, :]

Unnamed: 0,Country,#NA,N,pertcentage_NA
0,NIGERIA,19132,206552,0.092626
1,CHAD,5512,9900,0.556768
2,DEMOCRATIC REPUBLIC OF THE CONGO,4088,13508,0.302635
3,MADAGASCAR,3566,3657,0.975116
4,SOUTH SUDAN,2542,5279,0.481531
5,MALI,2058,10489,0.196206
6,LIBERIA,1712,3877,0.441579
7,MALAWI,1583,2152,0.735595
8,GABON,1445,3501,0.412739
9,NIGER,1288,2010,0.640796


In [45]:
# Countries in the first ten rows of the summary (ordered by number of NAs).
top_10_na_num = na_summ.head(10)['Country'].unique()
print("Top 10 countries with highest number of NAs in extract_type:")
print(top_10_na_num)

Top 10 countries with highest number of NAs in extract_type:
['NIGERIA' 'CHAD' 'DEMOCRATIC REPUBLIC OF THE CONGO' 'MADAGASCAR'
 'SOUTH SUDAN' 'MALI' 'LIBERIA' 'MALAWI' 'GABON' 'NIGER']


In [46]:
# ten countries with the highest share of NAs in extract_type
na_summ.sort_values(['pertcentage_NA'], ascending=False).head(10)

Unnamed: 0,Country,#NA,N,pertcentage_NA
3,MADAGASCAR,3566,3657,0.975116
34,SEYCHELLES,45,60,0.75
7,MALAWI,1583,2152,0.735595
9,NIGER,1288,2010,0.640796
1,CHAD,5512,9900,0.556768
30,RWANDA,101,185,0.545946
26,EQUATORIAL GUINEA,134,246,0.544715
4,SOUTH SUDAN,2542,5279,0.481531
12,CENTRAL AFRICAN REPUBLIC,790,1729,0.456912
6,LIBERIA,1712,3877,0.441579


In [47]:
# Countries in the ten rows with the highest share of NAs.
by_share = na_summ.sort_values(['pertcentage_NA'], ascending=False)
top_10_na_perc = by_share.head(10)['Country'].unique()
print("Top 10 countries with highest percentage of NAs in extract_type:")
print(top_10_na_perc)

Top 10 countries with highest percentage of NAs in extract_type:
['MADAGASCAR' 'SEYCHELLES' 'MALAWI' 'NIGER' 'CHAD' 'RWANDA'
 'EQUATORIAL GUINEA' 'SOUTH SUDAN' 'CENTRAL AFRICAN REPUBLIC' 'LIBERIA']


In [48]:
# New additions, as (country, type, abbreviation) records.
_new_rows = [
    ('Gabon', 'Clinic', 'CLINIC'),
    ('Gabon', 'Polyclinic', 'PCLINIC'),
    ('Equatorial Guinea', 'General Hospital', 'HG'),
]
type_dict_add = pd.DataFrame(_new_rows, columns=['Country', 'Type', 'Abbreviation'])
# these rows were added by hand, so no facility count is available
type_dict_add['count'] = np.nan
type_dict = pd.concat([type_dict, type_dict_add])

## `sub_type`

Use `extract_type` to map the type information extracted from the name column to one of the types in the type dictionary.

In [49]:
# Map each extracted type to the closest entry of the country's type
# dictionary (full type names and abbreviations), scored with fuzz.ratio.
# `sub_type` is always a full type name: a matched abbreviation is translated
# to the first type listed for it.
sub_type_frames = []

for country_name in res['Country'].unique():
    # copy so that the two new columns never touch `res`
    df_group = res[res['Country'] == country_name].copy()
    tmp = type_dict[type_dict['Country'].str.upper() == country_name]
    types = list(tmp['Type'])
    abbrevs = list(tmp['Abbreviation'])
    # candidate list and abbreviation lookup are built once per country
    choices = types + abbrevs
    abbrev_to_type = {}
    for type_name, abbrev in zip(types, abbrevs):
        # keep the first type listed for an abbreviation
        abbrev_to_type.setdefault(abbrev, type_name)

    # the same extracted string recurs on many rows; match each one only once
    matched = {}
    sub_types = []
    scores = []

    for extracted in df_group['extract_type']:
        # no extracted type, or no dictionary entries to match against
        if not isinstance(extracted, str) or not choices:
            sub_types.append(np.nan)
            scores.append(np.nan)
            continue
        if extracted not in matched:
            match, score = process.extractOne(extracted, choices,
                                              scorer=fuzz.ratio)
            matched[extracted] = (abbrev_to_type.get(match, match), score)
        sub_type, score = matched[extracted]
        sub_types.append(sub_type)
        scores.append(score)

    df_group['sub_type'] = sub_types
    df_group['score'] = scores
    sub_type_frames.append(df_group)

# concatenate once instead of growing the result inside the loop
res_sub_type = pd.concat(sub_type_frames) if sub_type_frames else pd.DataFrame()

In [50]:
# (rows, columns) of the result after adding `sub_type` and `score`
res_sub_type.shape

(305182, 50)

In [51]:
# Draw five random rows to eyeball the matched sub type and its score.
preview_cols = ['Country', 'name_of_facility_visited', 'clean_name', 'clean_name_final',
                'extract_type', 'type_of_facility_visited', 'sub_type', 'score']
res_sub_type.loc[:, preview_cols].sample(n=5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,extract_type,type_of_facility_visited,sub_type,score
246378,NIGERIA,PHC Wuro Cheudo,PHC Wuro Cheudo,Wuro Cheudo,PHC,PHC_CENTER,Primary Health Centre,100.0
71705,NIGERIA,Gasamu PHCC,Gasamu PHCC,Gasamu,PHCC,PHC_CENTER,Primary Health Care Centre,100.0
278382,DEMOCRATIC REPUBLIC OF THE CONGO,CS Saint André,CS Saint Andre,Saint Andre,CS,FAITH_BASED,Centre de Sante,100.0
196891,NIGERIA,Alikolo m kawu ( AVADAR community informants),Alikolo m kawu AVADAR community informants,Alikolo M Kawu Avadar Community Informants,,OTHER_NON_ORTHORDOX_HC,,
185399,NIGERIA,UNICEF CLINIC WATERBOARD,UNICEF CLINIC WATERBOARD,Unicef Waterboard,CLINIC,PHC_CENTER,Clinic,100.0


In [52]:
# distribution of the match score over all rows that have a score
print("Summary statistics of match score")
res_sub_type['score'].describe()

Summary statistics of match score


count    254169.000000
mean         97.153197
std           9.137719
min          24.000000
25%         100.000000
50%         100.000000
75%         100.000000
max         100.000000
Name: score, dtype: float64

Examine randomly sampled rows of countries where the match score is below average and the number of points is greater than 1000. Then make further additions.

In [53]:
# Mean match score and number of scored rows per country, largest countries
# first (ties broken by ascending mean score).
score_summ = (
    res_sub_type.groupby('Country')['score']
    .agg(avg_score='mean', count='count')
    .sort_values(['count', 'avg_score'], ascending=[False, True])
)

# countries scoring below the overall mean that have more than 1000 scored rows
overall_mean = res_sub_type['score'].mean()
below_average = score_summ['avg_score'] < overall_mean
large_enough = score_summ['count'] > 1000
score_summ[below_average & large_enough]

Unnamed: 0_level_0,avg_score,count
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
MALI,97.02135,8431
CAMEROON,95.174937,6791
CHAD,96.281222,4388
ZAMBIA,91.974367,3199
SOUTH SUDAN,95.211545,2737
ANGOLA,81.618975,2635
TANZANIA,96.026305,2395
GABON,82.118677,2056
UGANDA,94.740081,1739
CONGO,91.204918,1220


In [54]:
# Further additions to the type dictionary, as
# [country, type, abbreviation, count] rows; count is unknown for rows added
# by hand. ("University" was misspelled "Universty", so the type could never
# match a facility name.)
type_dict_additions = [['Mali', 'Centre de Sante', 'CS', np.nan],
                       ['Mali', 'Centre de Kinesitherapie', 'CK', np.nan],
                       ['Cameroon', 'Hospital', 'HOSP', np.nan],                      
                       ["Zambia", "Hospital Affiliated Health Centre", "HAHC", np.nan],
                       ["Zambia", "Referral Hospital", "RH", np.nan],
                       ["Zambia", "District Hospital", "DH", np.nan],
                       ["Zambia", "General Hospital", "GH", np.nan],
                       ["Zambia", "Mission Hospital", "MH", np.nan],
                       ["Zambia", "University Teaching Hospital", "UTH", np.nan],
                       ["Zambia", "Hospital", "HOSP", np.nan], 
                       ['South Sudan', 'Hospital', 'HOSP', np.nan],
                       ["Gabon", "Hospital", "HOSP", np.nan],
                       ["Gabon", "Centre de Sante", "CS", np.nan],
                       ["Congo", "Poste de Sante", "PS", np.nan]]
# Turn the additions into a frame, append them to the type dictionary and
# restore the ordering by country and type.
dict_columns = ['Country', 'Type', 'Abbreviation', 'count']
dict_additions = pd.DataFrame(type_dict_additions, columns=dict_columns)
type_dict = pd.concat([type_dict, dict_additions]).sort_values(by=['Country', 'Type'])

In [55]:
# For each country, compare the number of distinct facility types with the
# number of distinct abbreviations; countries where the two differ are shown.
tmp = (
    type_dict.groupby('Country')[['Type', 'Abbreviation']]
    .nunique()
    .rename(columns={'Type': 'unique_type', 'Abbreviation': 'unique_abbrev'})
)
tmp.loc[tmp['unique_type'] != tmp['unique_abbrev']]

Unnamed: 0_level_0,unique_type,unique_abbrev
Country,Unnamed: 1_level_1,Unnamed: 2_level_1


In [56]:
# Export the final type dictionary next to the other cleaned outputs.
# The path is built with os.path.join, like the earlier CSV export, instead
# of concatenating the folder with a hand-written "//" separator.
type_dict.to_csv(os.path.join(saveDir, "type_dict_1109.csv"), index=False)