This notebook builds a new or augments an existing spelling dictionary to correct possible misspellings of facility type keywords in a facility name column. 

The output columns include `Country`, `Word`, `Misspelling`, `Frequency`, `Score`.

- `Country`: country name.
- `Word`: the correct facility type keyword.
- `Misspelling`: misspelling.
- `Frequency`: frequency of the misspelling's appearance.
- `Score`: similarity score between `Word` and `Misspelling`, scales from 0 to 100.

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
from symspellpy import SymSpell
from itertools import islice
from fuzzywuzzy import fuzz
from ordered_set import OrderedSet
from fuzzywuzzy import process
import unidecode

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('mode.chained_assignment', None)

In [4]:
# data import
# import dataset as df
dataDir = r"C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\GRID3\Healthsites"
priority_countries = ['South Sudan', 'Mozambique', 'Namibia', 'Nigeria', 'Zambia',
                      'Sierra Leone', 'Ghana',  'Burkina Faso', 'Ethiopia', 'Somalia',
                     'Rwanda', 'Kenya', 'Zimbabwe', 'Democratic Republic of the Congo']

dfs = []
for i in range(len(priority_countries)):
    country = priority_countries[i]
    filename = country + '-node.shp'
    path = os.path.join(dataDir, country, filename)
    df = gpd.read_file(path)
    df['country'] = country
    dfs.append(df)

df = pd.concat(dfs, axis=0)
df.reset_index(drop=True, inplace=True)
df.reset_index(inplace=True)

# import type dictionary as type_dict
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa"
type_dict = pd.read_csv(dataDir + "//type_dict_1210.csv")

# import existing spelling dictionary as old_spelling_dict (optional if APPEND is False)
old_spelling_dict = pd.read_csv('C:\\Users\\DUANYUEYUN\\Documents\\GRID3\\Health facilities\\Data\\Africa\\Spelling dict\\spelling_dict_1210.csv')

In [5]:
# import dataset as df

# import type dictionary as type_dict

# import existing spelling dictionary as old_spelling_dict (optional if APPEND is False)

In [6]:
# facility name column
FACILITY_NAME = 'name'
# column name for pre-cleaned facility name
CLEAN_NAME = 'clean_name'
# country column
COUNTRY_COL = 'country'
# path to save the results
SAVE_PATH = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Spelling dict\spelling_dict_add_1213.csv"
# True if appending the results to an existing spelling dictionary
APPEND = False

In [7]:
# check if there's any country that does not exist in type dictionary
for c in df[COUNTRY_COL].unique():
    if c.upper() not in type_dict['Country'].str.upper().unique():
        print(c)

In [8]:
print("Country names in the type dictionary:")
print(type_dict['Country'].unique())

Country names in the type dictionary:
['Angola' 'Benin' 'Botswana' 'Burkina Faso' 'Burundi' 'Cameroon'
 'Cape Verde' 'Central African Republic' 'Chad' 'Comoros' 'Congo'
 "Cote d'Ivoire" 'Democratic Republic of the Congo' 'Djibouti'
 'Equatorial Guinea' 'Eritrea' 'Ethiopia' 'Gabon' 'Gambia' 'Ghana'
 'Guinea' 'Guinea Bissau' 'Kenya' 'Lesotho' 'Liberia' 'Madagascar'
 'Malawi' 'Mali' 'Mauritania' 'Mauritius' 'Mozambique' 'Namibia' 'Niger'
 'Nigeria' 'Rwanda' 'Sao Tome and Principe' 'Senegal' 'Seychelles'
 'Sierra Leone' 'Somalia' 'South Africa' 'South Sudan' 'Sudan' 'Tanzania'
 'Togo' 'Uganda' 'Zambia' 'Zanzibar' 'Zimbabwe' 'eSwatini']


# Pre-cleaning on facility name

In [9]:
def preclean(df, facility_name, clean_name):
    """Performs pre-cleaning on facility name, including 
    removing punctuations and """
    
    # replace NAs with empty string ''
    df[facility_name] = df[facility_name].fillna('')
    
    df[clean_name] = df[facility_name].str.strip()\
            .str.replace("  ", " ")\
            .str.replace('.', ' ')\
            .str.replace(':', ' ')\
            .str.replace("'", ' ')\
            .str.replace('"', ' ')\
            .str.replace('[', ' ')\
            .str.replace(']', ' ')\
            .str.replace('[-_,/\(\);]', ' ')\
            .str.replace('&', ' and ')\
            .str.replace("  ", " ")\
            .str.strip()\
            .str.replace('center', 'centre', case=False)\
            .str.replace('Polyclinique', 'Polyclinic', case=False)\
            .str.replace('Clinique', 'Clinic', case=False)\
            .str.replace('Geral', 'General', case=False)\
            .str.replace('Dispensaire', 'Dispensary', case=False)\
            .str.replace('Hopital', 'Hospital', case=False)\
            .str.replace('Hospitais', 'Hospital', case=False)\
            .str.replace("Urbain", "Urban", case=False)\
            .str.replace("Distrital", "District", case=False)\
            .str.replace('^hosp | hosp | hosp$|^hosp$', ' Hospital ', case=False)\
            .str.replace("  ", " ")\
            .str.strip()
    
    # replace NAs in clean_name with empty string ''
    df[clean_name] = df[clean_name].fillna('')
    
    # change emptry string in facility_name back to NA
    df[facility_name] = df[facility_name].replace('', np.nan)

    # remove accent marks
    df[clean_name] = [unidecode.unidecode(n) for n in df[clean_name]]

In [10]:
def generate_misspellings(df, type_dict, country_name, clean_name, country_col, 
                          skip_spellings=[], min_length=6):
    
    """
    skip_spellings: list of spellings that should not be considered as misspellings and should be skipped.
    min_length: minimum length of type keywords to check for possible misspellings.
    """
    # obtain type dictionary for the country
    type_dict_ctr = type_dict[type_dict['Country'].str.upper()==country_name.upper()]
    # obtain country-specific type keywords
    type_keywords = ' '.join(list(type_dict_ctr['Type'].str.lower())).split()
    
    # convert from list to set to remove repeating words, then convert to list again
    type_keywords_all = list(set(type_keywords))
    # keep only keywords with the minimum length
    type_keywords_to_check = [word for word in type_keywords_all if len(word)>=min_length]
    
    # obtain dataset for the country
    df_ctr = df[df[country_col].str.upper()==country_name.upper()]
    # obtain a list of words that appear in precleaned names
    names = ' '.join(list(df_ctr[~pd.isna(df_ctr[clean_name])][clean_name].str.lower())).split()

    columns = ['Country', 'Word', 'Misspelling', 'Frequency', 'Score']
    results = pd.DataFrame()
    
    for word in type_keywords_to_check:
        # keep just words that start with the same letter as the type keyword
        # and have length at least half of the length of the type keyword
        # also remove the words that already appear in type keywords
        start_char = word[0] # first letter
        min_len = len(word)//2 # minimum length requirement
        names_word = [name for name in names if name.startswith(start_char) 
                      and len(name)>min_len and name not in type_keywords_all]

        # write the relevant words to a text file
        filename = word+".txt"
        file1 = open(filename,"w")
        file1.write(' '.join(names_word))
        file1.close() 

        # generate word frequency dictionary
        sym_spell = SymSpell()
        sym_spell.create_dictionary(filename)
        freq_dict = sym_spell.words
        # remove the text file
        os.remove(filename)

        # compute similarity score with respect to the original word
        
        threshold = (len(word)-1) / len(word) # score threshold
        for spelling, frequency in freq_dict.items():
            if spelling in skip_spellings:
                continue
            ratio = fuzz.ratio(spelling, word)
            if ratio/100>=threshold:
                new_row = pd.DataFrame([[country_name, word, spelling, frequency, ratio]], columns=columns)
                results = pd.concat([results, new_row])
    #results = results[results['Score']<100]
    if results.shape[0]>0:
        results['Country'] = results['Country'].str.upper()
    # reset and drop index
    results.reset_index(inplace=True, drop=True)
    return results

1. Using type dictionary, find the list of facility type keywords to check for misspellings.

    - `min_length`: minimum length of keywords required, default=6.


2. For each type keyword, obtain a list of word candidates that might be misspellings and then transform the list into a frequency dictionary.

    - words that appear in precleaned names.
    - words that start with the same first letter as the type keyword. 
    - words with length that is at least half of the length of the type keyword.
    - words that do not appear in type keywords.


3. For each word candidate, compute similarity score with the target type keyword.

If the similarity score is greater than a threshold, add the word as a misspelling.

In [11]:
# pre-cleaning on facility names
preclean(df, facility_name = FACILITY_NAME, clean_name = CLEAN_NAME)

In [12]:
spelling_dict = pd.DataFrame()

for country_name in df[COUNTRY_COL].unique():
    print(country_name)
    # generate misspellings for that country
    country_results = generate_misspellings(df, type_dict, country_name, clean_name=CLEAN_NAME,
                                           country_col=COUNTRY_COL) 
    # merge country results to all results
    spelling_dict = pd.concat([spelling_dict, country_results])
# reset and drop index
spelling_dict.reset_index(inplace=True, drop=True)

South Sudan
Mozambique
Namibia
Nigeria
Zambia
Sierra Leone
Ghana
Burkina Faso
Ethiopia
Somalia
Rwanda
Kenya
Zimbabwe
Democratic Republic of the Congo


In [13]:
print(spelling_dict.shape)

(39, 5)


In [14]:
spelling_dict.head()

Unnamed: 0,Country,Word,Misspelling,Frequency,Score
0,MOZAMBIQUE,centro,centros,2,92
1,MOZAMBIQUE,centro,centrode,2,86
2,NIGERIA,patent,patient,1,92
3,NIGERIA,university,univeraity,1,90
4,NIGERIA,federal,federeal,1,93


In [15]:
if APPEND:
    # add new rows to existing spelling dictionary
    new_spelling_dict = pd.concat([old_spelling_dict, spelling_dict])
    new_spelling_dict = new_spelling_dict.groupby(['Country', 'Word', 'Misspelling'])\
    .agg(Frequency=('Frequency', 'sum'), Score=('Score', 'mean'))
    new_spelling_dict.reset_index(inplace=True)
    print("Number of new rows added:", new_spelling_dict.shape[0]-old_spelling_dict.shape[0])
    new_spelling_dict.to_csv(SAVE_PATH, index=False)
else:
    # if not append, just write the results into a .csv file
    spelling_dict.to_csv(SAVE_PATH, index=False)

Number of new rows added: 29
