In [69]:
import pandas as pd
import numpy as np
import re
import unidecode
from collections import Counter 

In [2]:
# Raise pandas' display limits so wide frames and long facility names are not truncated
for _option_name, _option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 100),
    ('display.max_colwidth', None),
):
    pd.set_option(_option_name, _option_value)

### Extract Common Phrases from Nature Dataset 

In [3]:
# Read in the dataset.
# The path is relative to the notebook (same '../B_semi_data/' convention used when the
# results are written out with to_csv) instead of a machine-specific absolute path, so
# the notebook can be re-run from any checkout of the project.
df = pd.read_csv('../B_semi_data/sub-saharan_health_facilities.csv')
df.head()

Unnamed: 0,Country,Admin1,Facility n,Facility t,Ownership,Lat,Long,LL source,coordinates
0,Angola,Bengo,Hospital Barra Do Dande,Hospital,Govt.,-8.656,13.4919,Google Earth,"[-8.656, 13.4919]"
1,Angola,Bengo,Hospital Dos Dembos,Hospital,Govt.,-8.5026,14.5862,Google Earth,"[-8.5026, 14.5862]"
2,Angola,Bengo,Hospital Municipal de Ambriz,Municipal Hospital,Govt.,-7.8522,13.1307,Google Earth,"[-7.8522, 13.1307]"
3,Angola,Bengo,Hospital Municipal de Bula Atumba,Municipal Hospital,Govt.,-8.6742,14.7925,Google Earth,"[-8.6742, 14.7925]"
4,Angola,Bengo,Hospital Municipal de Dande,Municipal Hospital,Govt.,-8.5835,13.6569,Google Earth,"[-8.5835, 13.6569]"


In [4]:
def deaccent(accented_string):
    """Return `accented_string` after passing it through `unidecode.unidecode`."""
    transliterated = unidecode.unidecode(accented_string)
    return transliterated

def remove_special_characters_and_shrink_whitespace(input_string):
    """Delete every character that is not an ASCII letter, a digit or whitespace,
    then collapse each run of whitespace into one space.

    Leading/trailing whitespace is shrunk to a single space, not stripped.
    """
    without_special_characters = re.sub(r'[^A-Za-z0-9\s]', '', input_string)
    return re.sub(r'\s+', ' ', without_special_characters)

# Change column names to be more readable
df = df.rename(columns = {'Facility n':'name','Facility t':'Type'})
# Drop entries whose name field is missing or empty.
# A missing name is a float NaN, which has no .strip(); anything that is not a
# non-blank string is therefore mapped to NaN instead of raising AttributeError.
df['name'] = df['name'].apply(lambda x: x if isinstance(x, str) and x.strip() != '' else np.nan)
df = df.dropna(subset=['name']).copy()
# De-accent the name field
df['deaccented_name'] = df['name'].apply(deaccent)
# Remove the special characters in the name field and reduce multiple whitespace to single one
df['special_char_removed_name'] = df['deaccented_name'].apply(remove_special_characters_and_shrink_whitespace)
# Make the name field lowercase
df['lowered_cleaned_name'] = df['special_char_removed_name'].apply(str.lower)

In [5]:
def get_unique_ngram(list_of_ordered_tokens, n):
    """Return the set of distinct n-grams (tuples of `n` consecutive tokens) found in
    `list_of_ordered_tokens`; the set is empty when there are fewer than `n` tokens."""
    # Shift the token sequence by 0 .. n-1 positions; zip stops at the shortest
    # shifted view, so every emitted tuple holds exactly n consecutive tokens.
    shifted_views = [list_of_ordered_tokens[offset:] for offset in range(n)]
    return set(zip(*shifted_views))

def reduce_nested_ngram_freq(x):
    """Discount ngrams in name `x` that are shorter than a longer ngram already matched.

    Reads the module-level `ngram_freq` list of (n, ngram, count) entries, which the
    caller sorts so that longer ngrams come first, and decrements the module-level
    `mutable_ngram_freq[ngram]` in place. Returns nothing.
    """
    longest_n_so_far = 0  # length of the longest ngram matched in this name so far

    for n, ngram, count in ngram_freq:
        # Only ngrams that occurred in more than one name are considered
        if count <= 1:
            continue
        # Whole-token match: both sides are padded with spaces
        if ' '+ngram+' ' not in ' '+x+' ':
            continue
        if n < longest_n_so_far:
            # Shorter than a previously matched ngram: presumably nested inside it,
            # so this occurrence is not counted as an independent one
            mutable_ngram_freq[ngram] -= 1
        else:
            longest_n_so_far = n


In [6]:
# Distinct country names present in the dataset
list_of_countries = df['Country'].unique().tolist()

# Maps each country name to a list of (ngram, proportion_in_country, count) tuples:
# the phrases frequent enough in that country's facility names to count as facility types
country_and_common_facility_types = {}

for country_name in list_of_countries:

    print('Country: ',country_name,'\n')

    # Rows of the dataset that belong to this country
    country_part = df.loc[df['Country'] == country_name].copy()

    # Number of health facilities in this country (denominator of the proportions below)
    total_facility_count_in_this_country = len(country_part)

    # Cleaned, lower-cased facility names of this country
    name_list = country_part['lowered_cleaned_name'].tolist()

    #------------------------------------------------------------------------------------------#
    # Count, for every ngram of 1 to 10 tokens, how many names contain it.
    # get_unique_ngram returns a set, so an ngram is counted at most once per name.
    phrase_counter = Counter()
    for name in name_list:
        tokens_in_name = name.strip().split(" ")
        for n in range(1, 11):
            phrase_counter.update(get_unique_ngram(tokens_in_name, n))

    # Convert each (token_sequence, count) pair into (n, ngram, count), keeping only
    # ngrams whose count reaches round(0.1% of the number of names)
    ngram_freq = [
        (len(token_sequence), ' '.join(token_sequence), count)
        for token_sequence, count in phrase_counter.most_common()
        if count >= round(len(name_list)*0.001)
    ]
    #------------------------------------------------------------------------------------------#

    # Longer ngrams first; the sort is stable, so the most_common() order is kept
    # among ngrams of equal length
    ngram_freq.sort(key=lambda entry: entry[0], reverse=True)

    # Mutable ngram -> count table that reduce_nested_ngram_freq decrements in place
    mutable_ngram_freq = {ngram : count for n, ngram, count in ngram_freq}

    # Discount, name by name, the ngrams that are shorter than a longer matched ngram
    for name in name_list:
        reduce_nested_ngram_freq(name)

    common_facility_types_in_this_country = []
    minimum_proportion = 0.005
    for ngram, count in mutable_ngram_freq.items():
        # In some cases, there is only one instance of a special kind of health facility in a country.
        # The criteria here will not capture that, thus it needs to be rediscovered in post processing.
        proportion_in_country = round(count/total_facility_count_in_this_country, 3)
        if count <= 1 or proportion_in_country <= minimum_proportion:
            continue
        common_facility_types_in_this_country.append((ngram, proportion_in_country, count))

    # Highest proportion first (stable, so ties keep their insertion order)
    common_facility_types_in_this_country.sort(key=lambda entry: entry[1], reverse=True)

    country_and_common_facility_types[country_name] = common_facility_types_in_this_country

    print(common_facility_types_in_this_country)

    print('----------------------------------\n')

Country:  Angola 

[('health post', 0.59, 929), ('health centre', 0.121, 191), ('municipal hospital', 0.047, 74), ('centro materno infantil', 0.026, 41), ('hospital', 0.018, 28), ('hospital municipal de', 0.014, 22), ('sede health centre', 0.008, 12)]
----------------------------------

Country:  Benin 

[('centre de sante darrondissement', 0.366, 300), ('centre de sante', 0.295, 242), ('centre communal de sante', 0.161, 132), ('dispensaire', 0.031, 25), ('centre medical', 0.027, 22), ('hopital de zone', 0.026, 21), ('hopital', 0.022, 18), ('uvs unites de sante de village', 0.011, 9), ('centre de sante central', 0.011, 9), ('centre de sante de sousprefecture', 0.01, 8), ('centre hospitalier departemental', 0.006, 5)]
----------------------------------

Country:  Botswana 

[('health post', 0.513, 320), ('clinic', 0.357, 223), ('primary hospital', 0.027, 17), ('district hospital', 0.011, 7)]
----------------------------------

Country:  Burkina Faso 

[('centre de sante et de promotion 

[('poste de sante', 0.604, 1055), ('centre de sante', 0.228, 398), ('hopital prefectoral', 0.014, 25)]
----------------------------------

Country:  Guinea Bissau 

[('hospital regional de', 0.375, 3), ('hospital', 0.375, 3), ('hospital de', 0.25, 2), ('de', 0.25, 2)]
----------------------------------

Country:  Kenya 

[('dispensary', 0.637, 3915), ('health centre', 0.154, 947), ('clinic', 0.027, 167), ('district hospital', 0.019, 118), ('subdistrict hospital', 0.017, 105), ('catholic dispensary', 0.016, 100), ('hospital', 0.013, 82), ('st', 0.011, 66), ('ack dispensary', 0.008, 52), ('community dispensary', 0.007, 46), ('aic dispensary', 0.007, 40), ('mission dispensary', 0.006, 37), ('mission hospital', 0.006, 36)]
----------------------------------

Country:  Lesotho 

[('health centre', 0.769, 90), ('hospital', 0.154, 18), ('st theresa health centre', 0.026, 3), ('sda health centre', 0.017, 2), ('filter clinic', 0.017, 2), ('st josephs', 0.017, 2), ('st james', 0.017, 2)]
-------

[('dispensary', 0.815, 5138), ('health centre', 0.106, 669), ('hospital', 0.013, 81), ('district hospital', 0.01, 63), ('rc dispensary', 0.008, 50), ('designated district hospital', 0.006, 36), ('mission dispensary', 0.006, 37), ('st', 0.006, 38)]
----------------------------------

Country:  Togo 

[('unite de soins peripherique', 0.541, 112), ('centre medicosocial', 0.246, 51), ('centre hospitalier prefectoral', 0.145, 30), ('centre hospitalier regional', 0.024, 5), ('tomegbe unite de soins peripherique', 0.01, 2), ('tove unite de soins peripherique', 0.01, 2), ('cope unite de soins peripherique', 0.01, 2), ('university centre hospitalier universitaire', 0.01, 2), ('sokode centre hospitalier', 0.01, 2), ('atakpame centre hospitalier', 0.01, 2)]
----------------------------------

Country:  Uganda 

[('health centre ii', 0.557, 2114), ('health centre iii', 0.314, 1192), ('health centre iv', 0.05, 188), ('hospital', 0.029, 109)]
----------------------------------

Country:  Zambia 

[(

In [7]:
# Format the results from the common phrases finding process to a DataFrame.
# One flat (country, type, proportion, count) record is built per extracted phrase.
# Exploding the dict and re-expanding the tuples fails when a country has no qualifying
# phrase: its empty list explodes to NaN, which can neither be expanded into three
# columns nor lower-cased. Such a country simply contributes no rows here.
result_df = pd.DataFrame(
    [
        (country, facility_type, proportion, count)
        for country, facility_types in country_and_common_facility_types.items()
        for facility_type, proportion, count in facility_types
    ],
    columns=['Country', 'Type', 'Proportion within Country', 'Count'],
)
result_df['Country'] = result_df['Country'].str.lower()
result_df['Type'] = result_df['Type'].str.lower()
# Countries alphabetically; within a country the most common type first
result_df = result_df.sort_values(['Country','Proportion within Country'], ascending=[True, False]).reset_index(drop=True)

In [8]:
# Preview the first ten rows of the sorted result table
result_df.head(10)

Unnamed: 0,Country,Type,Proportion within Country,Count
0,angola,health post,0.59,929
1,angola,health centre,0.121,191
2,angola,municipal hospital,0.047,74
3,angola,centro materno infantil,0.026,41
4,angola,hospital,0.018,28
5,angola,hospital municipal de,0.014,22
6,angola,sede health centre,0.008,12
7,benin,centre de sante darrondissement,0.366,300
8,benin,centre de sante,0.295,242
9,benin,centre communal de sante,0.161,132


### Computer-aided correction

In [12]:
# From the result above, we can see that the common phrases include tokens that might not
# describe a health facility type, but rather the place where the facility is located.

# One solution is to check whether the first token of the matched type is a place in that
# country, or whether the token has some special meaning...

result_df['first_token'] = result_df['Type'].map(lambda facility_type: facility_type.split()[0])

# To narrow down what needs checking, first list the tokens that are obviously part of a facility type.
# Candidates can be listed with:
# print(result_df['first_token'].value_counts()[result_df['first_token'].value_counts()>1].index.tolist())
tokens_that_are_part_of_facility_type = [
    'health', 'centre', 'hospital', 'clinic', 'hopital', 'district', 'postos', 'st',
    'community', 'centro', 'primary', 'ii', 'mission', 'rural', 'poste', '2',
    'dispensaire', '1', 'dispensary', 'general', 'central', 'hospitais', 'government',
    'provincial', 'i', 'basic', 'mini', 'town', 'polyclinique', 'regional', 'clinique',
    'hospitalier', 'urbain', 'public', 'de', 'referral', 'new', 'maternal', 'catholique',
]

# Rows whose first token is not in the list above, i.e. types that might not be valid facility types.
potential_places_as_prefix = result_df.loc[
    ~result_df['first_token'].isin(tokens_that_are_part_of_facility_type)
].copy()

In [None]:
from scraper_functions import *

In [None]:
# Automate the process of searching these token on Google, record our decision when seeing the search results

def ask_google(list_of_search_keyword, input_guide):
    """Open a Google search for each keyword and collect the operator's typed answers.

    For every keyword, the search page is opened through `go_to_page(driver, url)`,
    then `input(input_guide)` is read and a blank line printed. Returns the list of
    answers in keyword order.
    """
    def _ask_one(search_keyword):
        go_to_page(driver, 'https://www.google.com/search?q='+search_keyword)
        reply = input(input_guide)
        print('\n')
        return reply

    return [_ask_one(search_keyword) for search_keyword in list_of_search_keyword]

# One search query per candidate row, of the form '<country>+<first token>'
list_of_search_keyword = [
    country + '+' + first_token
    for country, first_token in zip(
        potential_places_as_prefix['Country'], potential_places_as_prefix['first_token']
    )
]

# Interactive step: one typed answer is recorded per query
answers = ask_google(
    list_of_search_keyword,
    'Is this a place or place type or facility or unable to tell? (p/t/f/u)',
)
potential_places_as_prefix['category_of_first_token'] = answers

In [13]:
# Answers recorded during a previous interactive run, one character per row of
# potential_places_as_prefix: p = place, t = place type, f = facility, u = unable to tell
potential_places_as_prefix['category_of_first_token'] = list('ttfpppfttutuffuptuppupppufpppppuppppuuuutptpppptppppffpufupppppppufftfpfptuffffffuffftfttftftffuppppufpfffpffffppppppppppfpppppppppppfpppptpffpfffuutupupptfuufpuupppuupfuuupppfffppufppufppppp')

While working through the Google searches, the following special abbreviations were noted:

- RCH : Reproductive Child Health
- GFPA : Gambia Family Planning Association
- AIC : Africa Inland Church
- SDA : Seventh Day Adventist
- PHC : Primary Health Care
- FOSACOM : Formation Sanitaire Communautaire
- OPD : Out-Patient Department
- SRCS : Somali Red Crescent Society
- CHC : Community Health Center
- RC : Red Cross
- ACK : Anglican Church of Kenya
- UVS : Unites Villageoises de Sante

In [14]:
# # Inspect the results from our Google Search process
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 'p']
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 't']
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 'f']
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 'u']

In [15]:
# Build a first-token -> category lookup from the reviewed rows (a token that appears in
# several rows keeps the category of its last row), then apply it to the main result
# dataframe; tokens absent from the lookup default to 'f'.
token_to_category_mapping = dict(
    zip(
        potential_places_as_prefix['first_token'],
        potential_places_as_prefix['category_of_first_token'],
    )
)
result_df['category_of_first_token'] = result_df['first_token'].map(
    lambda token: token_to_category_mapping.get(token, 'f')
)
result_df['category_of_first_token'].value_counts()

f    341
p     90
u     33
t     19
Name: category_of_first_token, dtype: int64

In [16]:
# Identify the facility types that are common enough to be considered new facility types.
# Candidates can be listed with:
# print(result_df.loc[result_df['category_of_first_token']=='t','Type'].tolist())
new_facility_types = [
    'municipal hospital',
    'sede health centre',
    'vaz postos sanitarios',
    'pedro postos sanitarios',
    'est centre de sante',
    'gare centre de sante integre',
    'adi health station',
    'sud health centre',
    'area health centre',
    'sud poste de sante',
    'county hospital',
]
# Whitelisted types are re-labelled 'f'; purely numeric types are labelled 'u'
result_df.loc[result_df['Type'].isin(new_facility_types), 'category_of_first_token'] = 'f'
result_df.loc[result_df['Type'].apply(lambda facility_type: facility_type.isnumeric()), 'category_of_first_token'] = 'u'
# Keep only rows not labelled p / t / u, then drop the two helper columns
result_df = (
    result_df.loc[~result_df['category_of_first_token'].isin(['p', 't', 'u'])]
    .reset_index(drop=True)
    .drop(columns=['first_token', 'category_of_first_token'])
)

In [17]:
# Write the filtered country/type table to CSV without the row index
result_df.to_csv('../B_semi_data/type_dict_tim.csv',index=False)

In [18]:
# Composite '<country> - <type>' key, used to compare this table with other type dictionaries
result_df['Country-Type'] = result_df['Country'].str.cat(result_df['Type'], sep=' - ')

### Compare extracted types with type_dict_augmented_1130

In [19]:
# Load the reference type dictionary, lower-case its Country and Type columns and
# order it by (Country, Type)
type_dict_1130 = (
    pd.read_csv('../github_downloaded_datasets/type_dict_augmented_1130.csv')
    .assign(
        Country=lambda frame: frame['Country'].map(str.lower),
        Type=lambda frame: frame['Type'].map(str.lower),
    )
    .sort_values(['Country','Type'])
    .reset_index(drop=True)
)
# Same composite '<country> - <type>' key format as built on result_df
type_dict_1130['Country-Type'] = type_dict_1130['Country'] + ' - ' + type_dict_1130['Type']

In [20]:
# Sanity check: both tables must contain exactly the same distinct Country values
assert(sorted(result_df.Country.unique().tolist()) == sorted(type_dict_1130.Country.unique().tolist()))

In [23]:
# Extracted (country, type) pairs that also appear in type_dict_1130
agreed_country_types = (
    result_df
    .loc[result_df['Country-Type'].isin(type_dict_1130['Country-Type'].tolist())]
    .reset_index(drop=True)
)

agreed_country_types

Unnamed: 0,Country,Type,Proportion within Country,Count,Country-Type
0,angola,municipal hospital,0.047,74,angola - municipal hospital
1,angola,centro materno infantil,0.026,41,angola - centro materno infantil
2,angola,hospital,0.018,28,angola - hospital
3,benin,centre de sante,0.295,242,benin - centre de sante
4,benin,centre medical,0.027,22,benin - centre medical
...,...,...,...,...,...
163,zimbabwe,clinic,0.445,550,zimbabwe - clinic
164,zimbabwe,rural health clinic,0.400,495,zimbabwe - rural health clinic
165,zimbabwe,rural hospital,0.084,104,zimbabwe - rural hospital
166,zimbabwe,district hospital,0.038,47,zimbabwe - district hospital


In [24]:
# (country, type) pairs of type_dict_1130 that the extraction did not produce
unique_to_type_dict_1130 = (
    type_dict_1130
    .loc[~type_dict_1130['Country-Type'].isin(result_df['Country-Type'].tolist())]
    .reset_index(drop=True)
)

unique_to_type_dict_1130

Unnamed: 0,Country,Type,Abbreviation,count,Country-Type
0,angola,central hospital,CH,3.0,angola - central hospital
1,angola,centro de saude,CS,231.0,angola - centro de saude
2,angola,centro sanatorio materno infantil,CSMI,3.0,angola - centro sanatorio materno infantil
3,angola,general hospital,GH,3.0,angola - general hospital
4,angola,posto de saude,PS,1152.0,angola - posto de saude
...,...,...,...,...,...
243,zambia,universty teaching hospital,UTH,,zambia - universty teaching hospital
244,zanzibar,primary health care unit plus,PHCUP,28.0,zanzibar - primary health care unit plus
245,zanzibar,tertiary hospital,TH,1.0,zanzibar - tertiary hospital
246,zimbabwe,central hospital,CH,6.0,zimbabwe - central hospital


In [25]:
# df['Country'] keeps the dataset's original capitalisation (e.g. 'Angola'); only the
# result tables were lower-cased. The country is therefore compared case-insensitively,
# otherwise this filter can never match and always returns an empty frame.
df[(df.Country.str.lower()=='angola')&(df.lowered_cleaned_name.str.contains('centro de saude'))]

Unnamed: 0,Country,Admin1,name,Type,Ownership,Lat,Long,LL source,coordinates,deaccented_name,special_char_removed_name,lowered_cleaned_name


In [26]:
# Extracted (country, type) pairs that are absent from type_dict_1130
unique_to_result_df = (
    result_df
    .loc[~result_df['Country-Type'].isin(type_dict_1130['Country-Type'].tolist())]
    .reset_index(drop=True)
)

unique_to_result_df

Unnamed: 0,Country,Type,Proportion within Country,Count,Country-Type
0,angola,health post,0.590,929,angola - health post
1,angola,health centre,0.121,191,angola - health centre
2,angola,hospital municipal de,0.014,22,angola - hospital municipal de
3,angola,sede health centre,0.008,12,angola - sede health centre
4,benin,centre de sante darrondissement,0.366,300,benin - centre de sante darrondissement
...,...,...,...,...,...
176,zambia,district level 1 hospital,0.022,28,zambia - district level 1 hospital
177,zambia,mission level 1 hospital,0.019,24,zambia - mission level 1 hospital
178,zambia,general level 2 hospital,0.006,8,zambia - general level 2 hospital
179,zambia,urban clinic clinic,0.006,8,zambia - urban clinic clinic


### Compare extracted types with the types that come with the Nature dataset

In [27]:
# Lower-cased country name -> list of the facility types extracted for that country
country_to_facility_types_mapping = result_df.groupby('Country')['Type'].agg(list).to_dict()

In [29]:
def get_matched_type_and_pure_name(country_name, name):
    """Find the extracted facility type contained in `name` and strip it from the name.

    Parameters
    ----------
    country_name : key into the module-level `country_to_facility_types_mapping`.
    name : facility name to inspect (a string).

    Returns
    -------
    (matched_type, pure_name) : the facility type with the most tokens that occurs in
        `name` as a whole-token phrase, and `name` with every whole-token occurrence
        of that type removed and whitespace normalised. Both are NaN when no type
        matches or when the country has no entry in the mapping.
    """
    # A country without extracted types has no entry in the mapping; treat as "no match"
    facility_types = country_to_facility_types_mapping.get(country_name, [])

    # Padding with spaces turns substring search into whole-token search
    padded_name = ' ' + name + ' '

    # Types with more tokens are tried first so the most specific type wins
    for facility_type in sorted(facility_types, key=lambda candidate: -len(candidate.split())):
        padded_type = ' ' + facility_type + ' '
        if padded_type not in padded_name:
            continue
        # Remove the type on token boundaries only, mirroring how it was matched, so
        # that e.g. 'clinic' is not cut out of 'polyclinic'. Loop because adjacent
        # occurrences share a separating space and need more than one pass.
        remainder = padded_name
        while padded_type in remainder:
            remainder = remainder.replace(padded_type, ' ')
        return (facility_type, ' '.join(remainder.split()))

    return (np.nan, np.nan)

In [47]:
# Normalise the dataset's own type column: lower-case, then de-accent
df['Type'] = df['Type'].map(str.lower).map(deaccent)

# For every facility, look up the extracted type contained in its cleaned name and the
# name with that type removed (the mapping is keyed by lower-cased country name)
df[['matched_type','pure_name']] = pd.DataFrame(
    [
        get_matched_type_and_pure_name(country.lower(), cleaned_name)
        for country, cleaned_name in zip(df['Country'], df['lowered_cleaned_name'])
    ],
    index=df.index,
)

In [48]:
# Share of facilities for which some extracted type was found in the name
print('\n', np.round(df.matched_type.notnull().mean()*100,2), '% of the nature dataset is covered by the extracted types.', sep='')

# Share of facilities whose provided type equals the extracted type
print('\nFor ', np.round(sum(df.Type==df.matched_type)/len(df)*100,2), '% of the nature dataset, provided type and extracted type are exactly the same.', sep='')

# Facilities where both types are present but disagree
different_type = df.loc[
    df.Type.notnull() & df.matched_type.notnull() & (df.Type!=df.matched_type),
    ['Country','name','Type','matched_type'],
]

print('\nFor ', np.round(len(different_type)/len(df)*100,2), '% of the nature dataset, the provided type and extracted type are different. Below are some examples:\n', sep='')

# One example per distinct (provided type, extracted type) pair, at most 20 rows
(
    different_type
    .drop_duplicates(subset=['Type','matched_type'])
    .rename(columns={'Type':'original_type'})
    .head(20)
)


98.57% of the nature dataset is covered by the extracted types.

For 84.09% of the nature dataset, provided type and extracted type are exactly the same.

For 14.48% of the nature dataset, the provided type and extracted type are different. Below are some examples:



Unnamed: 0,Country,name,original_type,matched_type
2,Angola,Hospital Municipal de Ambriz,municipal hospital,hospital municipal de
8,Angola,Hospital Provincial de Bengo,provincial hospital,hospital
9,Angola,27 de Marto Health Post,posto de saude,health post
10,Angola,Abrigo Anjo da Guarda Health Centre,centro de saude,health centre
29,Angola,Bela Vistal Centro Materno Infantil,centro de saude,centro materno infantil
35,Angola,Benguela/Lobito Hospital,central hospital,hospital
105,Angola,Dombe Grande Communal Hospital,municipal hospital,hospital
366,Angola,Boa Entrada Regional Hospital,regional hospital,hospital
447,Angola,Hospital Geral 17 de Setembro,general hospital,hospital
526,Angola,Sede Health Centre,centro de saude,sede health centre
