This notebook can be used to detect potential facility types in the facility name that cannot be extracted by the General Name Cleaning & Type Mapping Notebook. The overall approach is to filter the most frequent words that appear in the `CLEAN_NAME_FINAL` column and to identify potential types by manually inspecting sampled rows that contain those words. The assumption is that facility type keywords will appear frequently in the `CLEAN_NAME_FINAL` column, which is the output after removing type information using the existing type dictionary.

In [1]:
import os
import tempfile
from itertools import islice

import numpy as np
import pandas as pd
from symspellpy import SymSpell

In [2]:
# do not truncate columns or rows when a DataFrame is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# switch off pandas' chained-assignment warning for the whole session
pd.set_option('mode.chained_assignment', None)

In [3]:
# data import
# directory holding the input CSV files; set the GRID3_DATA_DIR environment
# variable to point somewhere else without editing the notebook
dataDir = os.environ.get(
    "GRID3_DATA_DIR",
    r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa",
)
# import dataset as df
path = os.path.join(dataDir, "healthsites_cleaned.csv")
df = pd.read_csv(path)
# import existing type dictionary as type_dict
type_dict = pd.read_csv(os.path.join(dataDir, "type_dict_1210.csv"))

In [4]:
# name of the column holding the raw facility name
FACILITY_NAME = 'name'
# name of the column holding the country
COUNTRY = 'country'
# name of the column holding the final clean name after removing type information
CLEAN_NAME_FINAL = 'clean_name_final' 
# name of the column holding the type information extracted
EXTRACT_TYPE = 'type_extract' 
# columns shown for each sampled row during manual inspection
cols = [COUNTRY, FACILITY_NAME, CLEAN_NAME_FINAL]
# column names of the type dictionary, in the order used when new rows are appended
type_dict_cols = ['Country', 'Type', 'Abbreviation', 'count']

In [5]:
# obtain rows with NA in extract_type
# keep only the rows for which no type could be extracted
no_type_extracted = df[EXTRACT_TYPE].isna()
sample = df.loc[no_type_extracted]

In [6]:
# number of rows that have no extracted type
print("Number of data points:", sample.shape[0])

Number of data points: 3068


In [7]:
print("Distribution of data points by country:")
# rows without an extracted type, counted per country (largest first)
ctr_counts = sample[COUNTRY].value_counts()
ctr_counts

Distribution of data points by country:


Ghana                               752
Democratic Republic of the Congo    494
Ethiopia                            431
Kenya                               401
Burkina Faso                        312
Sierra Leone                        173
Nigeria                             124
Mozambique                           92
Namibia                              92
Zimbabwe                             74
Rwanda                               45
Zambia                               44
Somalia                              21
South Sudan                          13
Name: country, dtype: int64

In [8]:
def get_countries(ctr_counts, threshold):
    """List the countries whose number of data points is strictly above `threshold`.

    `ctr_counts` is a Series of counts indexed by country; the returned list
    keeps the order in which the countries appear in that Series.
    """
    above_threshold = ctr_counts > threshold
    selected = ctr_counts.loc[above_threshold]
    return list(selected.index)

In [9]:
def gen_word_freq_dict(df, country_col, country_name, clean_name_final, freq_threshold):
    """Return a word-frequency dictionary for one country's cleaned facility names.

    The cleaned names of every row belonging to `country_name` (compared
    case-insensitively) are lower-cased, joined into one text and counted
    with SymSpell's `create_dictionary`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `country_col` and `clean_name_final`.
    country_col : str
        Name of the country column.
    country_name : str
        Country to build the dictionary for.
    clean_name_final : str
        Name of the column holding the cleaned facility names.
    freq_threshold : int
        Only words whose count is strictly greater than this value are kept.

    Returns
    -------
    dict
        Maps word -> count, ordered by decreasing count.
    """
    # rows of the requested country that have a non-missing cleaned name
    in_country = df[country_col].str.upper() == country_name.upper()
    has_name = df[clean_name_final].notna()
    names = df.loc[in_country & has_name, clean_name_final].astype(str).str.lower()

    # SymSpell reads its corpus from a file. Use a private temporary file (not a
    # file named after the country in the working directory), write it as UTF-8
    # so non-ASCII names do not depend on the platform default encoding, and
    # always delete it, even if building the dictionary fails.
    corpus = tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    )
    try:
        with corpus:
            corpus.write(' '.join(names))
        # the file is closed here so that SymSpell can reopen it by name
        sym_spell = SymSpell()
        sym_spell.create_dictionary(corpus.name, encoding="utf-8")
        word_counts = sym_spell.words
    finally:
        os.remove(corpus.name)

    # sort in decreasing frequency, then keep words above the frequency threshold
    ranked = sorted(word_counts.items(), key=lambda item: -item[1])
    return {word: count for word, count in ranked if count > freq_threshold}

In [10]:
def sample_rows(df, country_col, country_name, words, sample_size, clean_name_final, cols,
                random_state=None):
    """For each word, sample rows of one country whose cleaned name contains the word.

    Words are processed in the given order. A row is reported under the first
    word it contains and is then excluded from the search for later words.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `country_col`, `clean_name_final` and `cols`.
    country_col : str
        Name of the country column.
    country_name : str
        Country to sample from (compared case-insensitively).
    words : iterable of str
        Keywords to look for. Each one is matched as a literal,
        case-insensitive substring of the cleaned name.
    sample_size : int
        Maximum number of rows returned per word.
    clean_name_final : str
        Name of the column holding the cleaned facility names.
    cols : list of str
        Columns to keep in the output.
    random_state : optional
        Forwarded to `DataFrame.sample`; pass a value to make the sampling
        reproducible. The default (None) keeps the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with `cols` plus a `keyword` column, or an empty
        DataFrame when `words` is empty.
    """
    ctr_df = df[df[country_col].str.upper() == country_name.upper()]

    sampled = []
    for word in words:
        # regex=False: the keyword is plain text, so characters such as '.' or '('
        # must not be interpreted as a pattern; the mask is computed once and
        # reused for both selecting and excluding rows
        matches = ctr_df[clean_name_final].str.contains(
            word, case=False, na=False, regex=False
        )
        hits = ctr_df.loc[matches, cols]
        if hits.shape[0] > sample_size:
            hits = hits.sample(sample_size, random_state=random_state)
        # assign() returns a new frame, so no slice of the input is written to
        sampled.append(hits.assign(keyword=word))
        # rows already attributed to this word are not offered to later words
        ctr_df = ctr_df[~matches]

    if not sampled:
        return pd.DataFrame()
    return pd.concat(sampled)

In [23]:
def add_rows(additions, type_dict, type_dict_cols):
    """Return a copy of the type dictionary with new rows appended and sorted.

    Parameters
    ----------
    additions : dict
        Keys are countries and values are lists of
        (facility type, abbreviation) pairs,
        e.g. {'Ghana':[('Pharmacy', 'PHARMACY'), ('Chemist', 'CHEMIST')]}
    type_dict : pandas.DataFrame
        Existing type dictionary; it is not modified.
    type_dict_cols : list of str
        The four column names of the type dictionary, in the order
        country, type, abbreviation, count.

    Returns
    -------
    pandas.DataFrame
        `type_dict` plus the new rows (their count is NaN), sorted by the
        country, type and abbreviation columns.
    """
    # one row per (country, type, abbreviation); the count of a new type is unknown
    rows = [
        [country, new_type[0], new_type[1], np.nan]
        for country, new_types in additions.items()
        for new_type in new_types
    ]

    new_rows = pd.DataFrame(rows, columns=type_dict_cols)
    print("Number of new rows added:", new_rows.shape[0])

    # Add new rows to type dictionary
    combined = pd.concat([type_dict, new_rows])
    # sort by the columns the rows were filled into positionally, rather than by
    # hard-coded names, so a dictionary with differently named columns also works
    return combined.sort_values(by=list(type_dict_cols[:3]))

In [12]:
# only analyse countries with more than 100 rows lacking an extracted type
countries = get_countries(ctr_counts, threshold=100)
print(countries)

['Ghana', 'Democratic Republic of the Congo', 'Ethiopia', 'Kenya', 'Burkina Faso', 'Sierra Leone', 'Nigeria']


In [13]:
# keep only words that occur more than this many times in a country's clean names
FREQ_THRESHOLD = 20
# maximum number of rows sampled per keyword for manual inspection
SAMPLE_SIZE = 10
# DataFrame.sample draws from NumPy's global random state when no random_state
# is passed, so seed it to get the same sampled rows on every run
np.random.seed(0)

# collect one frame per country and concatenate once at the end
ctr_frames = []
for country in countries:
    keyword_dict = gen_word_freq_dict(df=sample, country_col=COUNTRY,
                                      country_name=country, clean_name_final=CLEAN_NAME_FINAL,
                                      freq_threshold=FREQ_THRESHOLD)

    # keywords in decreasing order of frequency
    keywords = list(keyword_dict.keys())

    ctr_results = sample_rows(df=sample, country_col=COUNTRY, country_name=country,
                              words=keywords, sample_size=SAMPLE_SIZE,
                              clean_name_final=CLEAN_NAME_FINAL, cols=cols)
    ctr_frames.append(ctr_results)

# pd.concat rejects an empty list, so fall back to an empty frame
results = pd.concat(ctr_frames) if ctr_frames else pd.DataFrame()

In [14]:
# (number of sampled rows, number of columns) across all analysed countries
results.shape

(150, 4)

In [30]:
# preview the sampled rows; the keyword column holds the word each row was sampled for
results.head()

Unnamed: 0,country,name,clean_name_final,keyword
3435,Ghana,Ashgin Pharmacy,Ashgin Pharmacy,pharmacy
3185,Ghana,Adepaa Pharmacy,Adepaa Pharmacy,pharmacy
3514,Ghana,Manu Yaa pharmacy,Manu Yaa Pharmacy,pharmacy
3836,Ghana,Jinlet Pharmacy,Jinlet Pharmacy,pharmacy
3187,Ghana,Biolink Pharmacy,Biolink Pharmacy,pharmacy


In [16]:
# write additions after manual inspection
additions = {'Ghana':[('Pharmacy', 'PHARMACY'), ('Chemist', 'CHEMIST')]}

In [24]:
# extended type dictionary; the original type_dict is left unchanged
new_type_dict = add_rows(additions, type_dict, type_dict_cols)

Number of new rows added: 2


In [25]:
# example of keyword_dict: words occurring more than 20 times in Ghana's clean names,
# mapped to their counts in decreasing order
keyword_dict = gen_word_freq_dict(df=sample, country_col=COUNTRY, 
                                    country_name='Ghana', clean_name_final=CLEAN_NAME_FINAL,
                                 freq_threshold=20)

print(keyword_dict)

{'pharmacy': 320, 'limited': 40, 'chemist': 31}
