### Define the Automatic Type Extraction function

In [5]:
# Copy this code cell to your notebook or Python file; the only third-party dependency is `unidecode` (pip install Unidecode)
# Python 3.5+

import re
import unidecode
from collections import Counter 

def deaccent(input_string):
    """Return ``input_string`` passed through ``unidecode.unidecode``.

    Used to replace accented / non-ASCII characters with an ASCII
    transliteration before further cleaning.
    """
    return unidecode.unidecode(input_string)

def remove_special_characters_and_shrink_whitespace(input_string):
    """Strip non-alphanumeric characters, then collapse whitespace runs.

    Every character that is not an ASCII letter, an ASCII digit or whitespace
    is deleted first; afterwards each run of whitespace becomes one space.
    Leading/trailing whitespace is collapsed to a single space, not removed.
    """
    # Pass 1: delete everything except A-Z, a-z, 0-9 and whitespace
    alphanumeric_only = re.sub(r'[^A-Za-z0-9\s]', '', input_string)
    # Pass 2: squeeze each whitespace run (spaces, tabs, newlines) into ' '
    return re.sub(r'\s+', ' ', alphanumeric_only)

def clean_string(input_string):
    """Normalise a name: lowercase, de-accent, drop specials, shrink whitespace.

    The steps run in that order, each feeding the next.
    """
    lowered = input_string.lower()
    without_accents = deaccent(lowered)
    return remove_special_characters_and_shrink_whitespace(without_accents)

def get_unique_ngram(list_of_ordered_tokens, n):
    """Return the set of distinct n-grams found in an ordered token list.

    Each n-gram is a tuple of ``n`` consecutive tokens. The result is empty
    when the list holds fewer than ``n`` tokens or when ``n`` is not positive.
    """
    # Build n views of the sequence, the k-th one shifted left by k tokens.
    # zip() stops at the shortest view, so only complete windows are emitted.
    shifted_views = (list_of_ordered_tokens[offset:] for offset in range(n))
    return set(zip(*shifted_views))

def reduce_nested_ngram_freq(x, ngram_freq, mutable_ngram_freq):
    """Discount shorter ngrams that co-occur with a longer matched ngram in ``x``.

    Parameters:
        x: a cleaned name (tokens separated by single spaces).
        ngram_freq: iterable of ``(n, ngram, count)`` triples. The caller is
            expected to pass it ordered with longer ngrams first, so the
            longest match for a name is met before its shorter sub-phrases.
        mutable_ngram_freq: dict ``ngram -> count`` that is modified in place.

    Returns nothing; the only effect is decrementing entries of
    ``mutable_ngram_freq``.
    """
    # Pad with spaces so that a phrase only matches on whole-token boundaries
    padded_name = ' ' + x + ' '
    longest_matched_n = 0  # size of the longest ngram matched in this name so far

    for size, phrase, occurrences in ngram_freq:
        # Phrases seen in a single name only are never considered
        if occurrences <= 1:
            continue
        if ' ' + phrase + ' ' not in padded_name:
            continue

        if size < longest_matched_n:
            # A longer phrase already matched this name, so this shorter one is
            # treated as nested inside it and loses one (non-independent) count
            mutable_ngram_freq[phrase] -= 1
        else:
            # Same length or longer than anything matched so far: just record it
            longest_matched_n = size
                
def extract_types(input_list, thres = None, maximum_expected_number_of_types = 20, return_proportion = False):
    """Extract the most common phrases ("types") from a list of names.

    Parameters:
        input_list: iterable of raw name strings; each one is normalised with
            ``clean_string`` before counting.
        thres: optional proportion of the names. When given, only phrases whose
            consolidated count is strictly greater than
            ``max(1, int(thres * number_of_names))`` are kept.
        maximum_expected_number_of_types: used only when ``thres`` is None; at
            most this many phrases are returned (fewer when counts tie at the
            cut-off, because the cut-off comparison is strict).
        return_proportion: when True, each result is
            ``(ngram, proportion, count)`` with the proportion rounded to three
            decimals; otherwise each result is ``(ngram, count)``.

    Returns:
        A list of result tuples sorted by descending consolidated count.
    """
    name_list = [clean_string(name) for name in input_list]
    number_of_names_in_this_list = len(name_list)

    phrase_counter = Counter()
    # For each name in the name list, tokenize it and count the ngrams in the name,
    # and add the frequency of ngrams to the phrase counter
    for name in name_list:
        # split() with no argument yields no tokens for a blank name, so an
        # empty name can no longer be counted as an empty-string "type"
        tokens_in_name = name.split()
        for n in range(1, min(len(tokens_in_name), 10) + 1): # Record frequency of ngrams up to 10 tokens long
            # Unique ngrams per name: every phrase is counted at most once per name
            phrase_counter.update(get_unique_ngram(tokens_in_name, n))

    # Convert each (token_sequence, count) pair to (n, ngram, count), where n is the "n" in ngram,
    # keeping only phrases that occur in at least 0.1% (rounded) of the names
    minimum_raw_count = round(number_of_names_in_this_list * 0.001)
    ngram_freq = [
        (len(token_sequence), ' '.join(token_sequence), count)
        for token_sequence, count in phrase_counter.most_common()
        if count >= minimum_raw_count
    ]

    # Sort in descending order of n, so that longer ngrams come earlier in the list
    # (stable sort: ngrams of equal length keep their most_common() order)
    ngram_freq = sorted(ngram_freq, key = lambda entry: -entry[0])

    # Mutable ngram -> count mapping that the consolidation step adjusts in place
    mutable_ngram_freq = {ngram : count for n, ngram, count in ngram_freq}

    # For every name, reduce the count of the ngrams that are part of a longer matched ngram
    for name in name_list:
        reduce_nested_ngram_freq(name, ngram_freq, mutable_ngram_freq)

    sorted_consolidated_ngram_freq = sorted(mutable_ngram_freq.items(), key = lambda t: -t[-1])

    # If thres is not provided, return the top k types found, with k = maximum_expected_number_of_types
    if thres is None:
        if len(sorted_consolidated_ngram_freq) > maximum_expected_number_of_types:
            # Cut-off is the count of the (k+1)-th entry (index k). Together with the
            # strict comparison below this keeps at most k entries, and index k always
            # exists here (the previous k+1 index overflowed for exactly k+1 entries).
            minimum_count = sorted_consolidated_ngram_freq[maximum_expected_number_of_types][-1]
        else:
            minimum_count = 1
    # If thres is provided, derive the minimum count from the threshold and the number of names
    else:
        minimum_count = max(1, int(thres*number_of_names_in_this_list))
    sorted_consolidated_ngram_freq = [item for item in sorted_consolidated_ngram_freq if item[-1] > minimum_count]

    # If return_proportion is true, return the results in the format (ngram, proportion, count)
    if return_proportion:
        sorted_consolidated_ngram_freq = [
            (ngram, round(count/number_of_names_in_this_list, 3), count)
            for ngram, count in sorted_consolidated_ngram_freq
        ]

    return sorted_consolidated_ngram_freq


### Extract Common Phrases from Nature Dataset 

In [2]:
import pandas as pd
import numpy as np
from tabulate import tabulate
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

# Read in the dataset
df = pd.read_csv('sub-saharan_health_facilities.csv')
# Change column names to be more readable
df = df.rename(columns = {'Facility n':'name','Facility t':'given_type'})
# Keep the untouched facility name next to the cleaned one
df['orig_name'] = df['name'].copy()
# Drop entries whose name field is missing or empty.
# Missing cells are not strings (float NaN has no .strip()), so any non-string
# value is mapped to NaN explicitly instead of crashing before dropna runs.
df['name'] = df['name'].apply(lambda x: x if isinstance(x, str) and x.strip() != '' else np.nan)
df = df.dropna(subset=['name']).copy()
df['name'] = df['name'].apply(clean_string)
# clean_string calls .lower(), so a missing (non-string) given_type is left as is
df['given_type'] = df['given_type'].apply(lambda x: clean_string(x) if isinstance(x, str) else x)
# Inspect the first 5 rows
df.head()

Unnamed: 0,Country,Admin1,name,given_type,Ownership,Lat,Long,LL source,coordinates,orig_name
0,Angola,Bengo,hospital barra do dande,hospital,Govt.,-8.656,13.4919,Google Earth,"[-8.656, 13.4919]",Hospital Barra Do Dande
1,Angola,Bengo,hospital dos dembos,hospital,Govt.,-8.5026,14.5862,Google Earth,"[-8.5026, 14.5862]",Hospital Dos Dembos
2,Angola,Bengo,hospital municipal de ambriz,municipal hospital,Govt.,-7.8522,13.1307,Google Earth,"[-7.8522, 13.1307]",Hospital Municipal de Ambriz
3,Angola,Bengo,hospital municipal de bula atumba,municipal hospital,Govt.,-8.6742,14.7925,Google Earth,"[-8.6742, 14.7925]",Hospital Municipal de Bula Atumba
4,Angola,Bengo,hospital municipal de dande,municipal hospital,Govt.,-8.5835,13.6569,Google Earth,"[-8.5835, 13.6569]",Hospital Municipal de Dande


### Demonstrate the Results from Common Bottom-up Approach

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Work on a reproducible 10% sample of the cleaned names
name_list = df.sample(frac=0.1,random_state=0).loc[:, 'name'].tolist()
number_of_sampled_names = len(name_list)

# Count every 1- to 5-gram in the sampled names
count_vectorizer = CountVectorizer(ngram_range=(1, 5))
ngram_sparse_matrix = count_vectorizer.fit_transform(name_list)
vocabulary = count_vectorizer.vocabulary_

# Sum the columns on the sparse matrix itself: .toarray() would first allocate a
# dense (names x vocabulary) array only to reduce it to one row of totals
ngram_counts = np.asarray(ngram_sparse_matrix.sum(axis=0)).ravel()

# (ngram, proportion, count) sorted by descending count; the proportion is taken
# over the actual number of sampled names rather than the approximation len(df)*0.1
ngram_freq_with_sklearn = sorted(
    [(k, round(ngram_counts[i]/number_of_sampled_names, 3), ngram_counts[i]) for k, i in vocabulary.items()],
    key = lambda x: -x[-1],
)
print(tabulate(ngram_freq_with_sklearn[:18], headers=['Type', 'Proportion', 'Count'], tablefmt='fancy_grid'))

╒═══════════════════════╤══════════════╤═════════╕
│ Type                  │   Proportion │   Count │
╞═══════════════════════╪══════════════╪═════════╡
│ centre                │        0.429 │    4235 │
├───────────────────────┼──────────────┼─────────┤
│ health                │        0.412 │    4065 │
├───────────────────────┼──────────────┼─────────┤
│ de                    │        0.306 │    3024 │
├───────────────────────┼──────────────┼─────────┤
│ sante                 │        0.253 │    2494 │
├───────────────────────┼──────────────┼─────────┤
│ de sante              │        0.252 │    2490 │
├───────────────────────┼──────────────┼─────────┤
│ health centre         │        0.224 │    2213 │
├───────────────────────┼──────────────┼─────────┤
│ centre de             │        0.181 │    1792 │
├───────────────────────┼──────────────┼─────────┤
│ centre de sante       │        0.181 │    1790 │
├───────────────────────┼──────────────┼─────────┤
│ clinic                │      

### Run the proposed function on the Nature Dataset

In [4]:
# Column titles shared by every per-country table
table_headers = ['Type', 'Proportion', 'Count']

# Maps each country name to the list of (type, proportion, count) found for it
country_and_common_facility_types = {}

# Countries in order of first appearance in the dataset
list_of_countries = df['Country'].unique().tolist()

for country_name in list_of_countries:

    print('Country: ',country_name,'\n')

    # Cleaned facility names of the rows that belong to this country
    rows_of_this_country = df['Country'] == country_name
    country_part = df.loc[rows_of_this_country, 'name'].tolist()

    # Run the type extraction on this country only and remember the result
    common_facility_types_in_this_country = extract_types(country_part, return_proportion = True)
    country_and_common_facility_types[country_name] = common_facility_types_in_this_country

    print(tabulate(common_facility_types_in_this_country, headers=table_headers, tablefmt='fancy_grid'))

    print('\n-------------------------------------------------------------------------------------\n')

Country:  Angola 

╒═════════════════════════════╤══════════════╤═════════╕
│ Type                        │   Proportion │   Count │
╞═════════════════════════════╪══════════════╪═════════╡
│ health post                 │        0.59  │     929 │
├─────────────────────────────┼──────────────┼─────────┤
│ health centre               │        0.121 │     191 │
├─────────────────────────────┼──────────────┼─────────┤
│ municipal hospital          │        0.047 │      74 │
├─────────────────────────────┼──────────────┼─────────┤
│ centro materno infantil     │        0.026 │      41 │
├─────────────────────────────┼──────────────┼─────────┤
│ hospital                    │        0.018 │      28 │
├─────────────────────────────┼──────────────┼─────────┤
│ hospital municipal de       │        0.014 │      22 │
├─────────────────────────────┼──────────────┼─────────┤
│ sede health centre          │        0.008 │      12 │
├─────────────────────────────┼──────────────┼─────────┤
│ 1 health p

╒═══════════════════════════╤══════════════╤═════════╕
│ Type                      │   Proportion │   Count │
╞═══════════════════════════╪══════════════╪═════════╡
│ centre de sante           │        0.705 │     469 │
├───────────────────────────┼──────────────┼─────────┤
│ hopital de district       │        0.069 │      46 │
├───────────────────────────┼──────────────┼─────────┤
│ i centre de sante         │        0.015 │      10 │
├───────────────────────────┼──────────────┼─────────┤
│ ii centre de sante        │        0.011 │       7 │
├───────────────────────────┼──────────────┼─────────┤
│ mubuga centre de sante    │        0.006 │       4 │
├───────────────────────────┼──────────────┼─────────┤
│ nyarunazi centre de sante │        0.006 │       4 │
├───────────────────────────┼──────────────┼─────────┤
│ kivoga centre de sante    │        0.006 │       4 │
├───────────────────────────┼──────────────┼─────────┤
│ rugazi centre de sante    │        0.005 │       3 │
├─────────

╒═══════════════════════════════════════╤══════════════╤═════════╕
│ Type                                  │   Proportion │   Count │
╞═══════════════════════════════════════╪══════════════╪═════════╡
│ centre de sante rural                 │        0.694 │    1244 │
├───────────────────────────────────────┼──────────────┼─────────┤
│ centre de sante urbain                │        0.15  │     269 │
├───────────────────────────────────────┼──────────────┼─────────┤
│ hopital general                       │        0.039 │      70 │
├───────────────────────────────────────┼──────────────┼─────────┤
│ hospitalier regional                  │        0.011 │      19 │
├───────────────────────────────────────┼──────────────┼─────────┤
│ centre medicosocial                   │        0.009 │      17 │
├───────────────────────────────────────┼──────────────┼─────────┤
│ 2 centre de sante rural               │        0.006 │      11 │
├───────────────────────────────────────┼──────────────┼──────

╒═════════════════════════╤══════════════╤═════════╕
│ Type                    │   Proportion │   Count │
╞═════════════════════════╪══════════════╪═════════╡
│ clinic                  │        0.483 │    2520 │
├─────────────────────────┼──────────────┼─────────┤
│ health post             │        0.176 │     917 │
├─────────────────────────┼──────────────┼─────────┤
│ health centre           │        0.138 │     720 │
├─────────────────────────┼──────────────┼─────────┤
│ health station          │        0.026 │     138 │
├─────────────────────────┼──────────────┼─────────┤
│ hospital                │        0.024 │     125 │
├─────────────────────────┼──────────────┼─────────┤
│ arada clinic            │        0.009 │      49 │
├─────────────────────────┼──────────────┼─────────┤
│ bole clinic             │        0.008 │      44 │
├─────────────────────────┼──────────────┼─────────┤
│ kolfe keraniyo clinic   │        0.008 │      43 │
├─────────────────────────┼──────────────┼────

╒═════════════════════════════╤══════════════╤═════════╕
│ Type                        │   Proportion │   Count │
╞═════════════════════════════╪══════════════╪═════════╡
│ poste de sante              │        0.604 │    1055 │
├─────────────────────────────┼──────────────┼─────────┤
│ centre de sante             │        0.228 │     398 │
├─────────────────────────────┼──────────────┼─────────┤
│ hopital prefectoral         │        0.014 │      25 │
├─────────────────────────────┼──────────────┼─────────┤
│ hopital regional            │        0.004 │       7 │
├─────────────────────────────┼──────────────┼─────────┤
│ dar es salam poste de sante │        0.003 │       6 │
├─────────────────────────────┼──────────────┼─────────┤
│ balandougou poste de sante  │        0.003 │       5 │
├─────────────────────────────┼──────────────┼─────────┤
│ hamdallaye poste de sante   │        0.003 │       5 │
├─────────────────────────────┼──────────────┼─────────┤
│ missira poste de sante      │

╒══════════════════════════════════╤══════════════╤═════════╕
│ Type                             │   Proportion │   Count │
╞══════════════════════════════════╪══════════════╪═════════╡
│ community health centre          │        0.767 │    1133 │
├──────────────────────────────────┼──────────────┼─────────┤
│ referral health centre           │        0.041 │      61 │
├──────────────────────────────────┼──────────────┼─────────┤
│ clinique                         │        0.04  │      59 │
├──────────────────────────────────┼──────────────┼─────────┤
│ central community health centre  │        0.024 │      35 │
├──────────────────────────────────┼──────────────┼─────────┤
│ medicale clinique                │        0.009 │      14 │
├──────────────────────────────────┼──────────────┼─────────┤
│ hopital                          │        0.007 │      11 │
├──────────────────────────────────┼──────────────┼─────────┤
│ polyclinique                     │        0.006 │       9 │
├───────

╒════════════════════════════════╤══════════════╤═════════╕
│ Type                           │   Proportion │   Count │
╞════════════════════════════════╪══════════════╪═════════╡
│ health hut                     │        0.605 │    1747 │
├────────────────────────────────┼──────────────┼─────────┤
│ integrated health centre       │        0.276 │     797 │
├────────────────────────────────┼──────────────┼─────────┤
│ hospital                       │        0.014 │      41 │
├────────────────────────────────┼──────────────┼─────────┤
│ koira health hut               │        0.006 │      16 │
├────────────────────────────────┼──────────────┼─────────┤
│ beri health hut                │        0.005 │      15 │
├────────────────────────────────┼──────────────┼─────────┤
│ peulh health hut               │        0.004 │      12 │
├────────────────────────────────┼──────────────┼─────────┤
│ saboua health hut              │        0.004 │      11 │
├────────────────────────────────┼──────

╒════════════════════════════╤══════════════╤═════════╕
│ Type                       │   Proportion │   Count │
╞════════════════════════════╪══════════════╪═════════╡
│ poste de sante             │        0.656 │     884 │
├────────────────────────────┼──────────────┼─────────┤
│ centre de sante            │        0.063 │      85 │
├────────────────────────────┼──────────────┼─────────┤
│ hopital regional           │        0.01  │      13 │
├────────────────────────────┼──────────────┼─────────┤
│ ii poste de sante          │        0.007 │      10 │
├────────────────────────────┼──────────────┼─────────┤
│ sud poste de sante         │        0.006 │       8 │
├────────────────────────────┼──────────────┼─────────┤
│ darou salam poste de sante │        0.005 │       7 │
├────────────────────────────┼──────────────┼─────────┤
│ i poste de sante           │        0.005 │       7 │
├────────────────────────────┼──────────────┼─────────┤
│ hopital                    │        0.005 │   

╒═══════════════════╤══════════════╤═════════╕
│ Type              │   Proportion │   Count │
╞═══════════════════╪══════════════╪═════════╡
│ clinic            │        0.689 │    2965 │
├───────────────────┼──────────────┼─────────┤
│ hospital          │        0.073 │     313 │
├───────────────────┼──────────────┼─────────┤
│ chc               │        0.065 │     279 │
├───────────────────┼──────────────┼─────────┤
│ satellite clinic  │        0.046 │     200 │
├───────────────────┼──────────────┼─────────┤
│ gateway clinic    │        0.029 │     125 │
├───────────────────┼──────────────┼─────────┤
│ park clinic       │        0.008 │      35 │
├───────────────────┼──────────────┼─────────┤
│ street clinic     │        0.008 │      34 │
├───────────────────┼──────────────┼─────────┤
│ health post       │        0.008 │      33 │
├───────────────────┼──────────────┼─────────┤
│ st                │        0.005 │      22 │
├───────────────────┼──────────────┼─────────┤
│ 2 clinic   

╒══════════════════════════════════╤══════════════╤═════════╕
│ Type                             │   Proportion │   Count │
╞══════════════════════════════════╪══════════════╪═════════╡
│ health centre ii                 │        0.557 │    2114 │
├──────────────────────────────────┼──────────────┼─────────┤
│ health centre iii                │        0.314 │    1192 │
├──────────────────────────────────┼──────────────┼─────────┤
│ health centre iv                 │        0.05  │     188 │
├──────────────────────────────────┼──────────────┼─────────┤
│ hospital                         │        0.029 │     109 │
├──────────────────────────────────┼──────────────┼─────────┤
│ medical centre health centre ii  │        0.004 │      16 │
├──────────────────────────────────┼──────────────┼─────────┤
│ ngo health centre ii             │        0.004 │      14 │
├──────────────────────────────────┼──────────────┼─────────┤
│ medical centre health centre iii │        0.003 │      13 │
├───────

In [None]:
# Format the results from the common phrases finding process to a DataFrame.
# One row per (country, type). Rows are built directly from the result tuples, so a
# country whose result list is empty simply contributes no rows (exploding an empty
# list would yield a NaN that cannot be unpacked into the three result columns).
result_df = pd.DataFrame(
    [
        (country, *type_entry)
        for country, type_entries in country_and_common_facility_types.items()
        for type_entry in type_entries
    ],
    columns=['Country', 'Type', 'Proportion within Country', 'Count'],
)
# Within each country, list the most common types first
result_df = result_df.sort_values(['Country','Proportion within Country'], ascending=[True, False]).reset_index(drop=True)
result_df.head(20)

During the Google Search process, take note of the following special abbreviations:

- RCH : Reproductive Child Health
- GFPA : Gambia Family Planning Association
- AIC : Africa Inland Church
- SDA : Seventh Day Adventist
- PHC : Primary Health Care
- FOSACOM : Formation Sanitaire Communautaire
- OPD : Out-Patient Department
- SRCS : Somali Red Crescent Society
- CHC : Community Health Center
- RC : Red Cross
- ACK : Anglican Church of Kenya
- UVS : Unites Villageoises de Sante