In [1]:
import pandas as pd
import numpy as np
import re
import unidecode
from collections import Counter 

import reverse_geocoder
import pycountry

In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

### Extract Common Phrases from Nature Dataset 

In [3]:
# Read in the dataset   # $ updated
df = pd.read_csv('/Users/timsmac/Desktop/GRID3/B_semi_data/healthsites_world_african_part.csv',low_memory=False)
df.head()

Unnamed: 0,osm_id,amenity,healthcare,name,operator,source,speciality,operator_ty,contact_num,operational,opening_hou,beds,staff_docto,staff_nurse,health_amen,dispensing,wheelchair,emergency,insurance,water_sourc,electricity,is_in_healt,is_in_healt_1,url,addr_housen,addr_street,addr_postco,addr_city,changeset_i,changeset_v,changeset_t,changeset_u,coordinates,within_africa
0,1433615056,pharmacy,,dis-chem,,,,,,,,,,,,,,,,,,,,,,,,,9308758,1,2011-09-15 16:26:54,Chris-Jan,"[-26.267759, 28.051121]",True
1,1433904505,pharmacy,,,,,,,,,,,,,,yes,,,,,,,,,,,,,9311970,1,2011-09-15 21:34:03,Javier Sanchez,"[28.427018, -16.298639]",True
2,1434317784,pharmacy,,clicks,,,,,,,,,,,,,,,,,,,,,,,,,9316364,1,2011-09-16 13:55:27,Chris-Jan,"[-26.179756, 28.117323]",True
3,1434394492,pharmacy,,clicks,,,,,,,,,,,,,,,,,,,,,,,,,9316364,1,2011-09-16 14:43:25,Chris-Jan,"[-26.188383, 28.123669]",True
4,1435184422,pharmacy,,Kenema Pharmacy,,,,,,,,,,,,yes,,,,,,,,,,,,,11114428,2,2012-03-27 09:38:52,AddisMap-Surafel,"[9.014315, 38.757292]",True


In [4]:
def deaccent(accented_string):
    return unidecode.unidecode(accented_string)

def remove_special_characters_and_shrink_whitespace(input_string):
    output_string = re.sub(r'\s+',' ',re.sub(r'[^A-Za-z0-9\s]','',input_string))
    return output_string

In [5]:
# $ updated

df.coordinates = df.coordinates.apply(eval)

def get_country_given_coordinates(coord): 
    coord = [tuple(li) for li in coord]
    results = reverse_geocoder.search(coord)
    return [pycountry.countries.get(alpha_2 = res['cc']).name for res in results]

In [6]:
df['Country'] = get_country_given_coordinates(df.coordinates.tolist())

Loading formatted geocoded file...


In [7]:
df.Country = df.Country.apply(deaccent).apply(str.lower)

In [8]:
african_countries = pd.read_csv('type_dict_tim.csv')['Country'].unique().tolist()

In [9]:
df = df[df.Country.isin(african_countries)].reset_index(drop=True)

In [27]:
df = df.sort_values('Country').reset_index(drop=True)

In [10]:
# Change column names to be more readable
df = df.rename(columns = {'name':'name','amenity':'Type'}) # $ updated
# Drop entries whose name field is missing or empty
df['name'] = df['name'].apply(lambda x: np.nan if isinstance(x,str) and x.strip() == '' else x)  # $ updated
df = df.dropna(subset=['name']).copy()
# De-accent the name field
df['deaccented_name'] = df['name'].apply(deaccent)
# Remove the special characters in the name field and reduce multiple whitespace to single one
df['special_char_removed_name'] = df['deaccented_name'].apply(remove_special_characters_and_shrink_whitespace)
# Make the name field lowercase
df['lowered_cleaned_name'] = df['special_char_removed_name'].apply(str.lower)

In [11]:
def get_unique_ngram(list_of_ordered_tokens, n):
    # Get the unique ngrams in a string, given a list of ordered tokens in that string
    return set(zip(*[list_of_ordered_tokens[i:i-(n-1)] if i<n-1 else list_of_ordered_tokens[n-1:] for i in range(0,n)]))

def reduce_nested_ngram_freq(x):    
    max_matched_n = 0 # Length of the longest ngram(s) we have matched for this name
    # Note that we have ordered the ngram_freq such that longer ngrams come first, so we can expect to match longest ngram asap
    
    for n, ngram, count in ngram_freq:
        # If the current ngram occurred in more than one name AND the ngram is in the current name
        if (count > 1) and (' '+ngram+' ' in ' '+x+' '):
            # If the current ngram is at least the same or even longer than the max n we matched, 
            # just update the max n and move on
            if n >= max_matched_n:
                max_matched_n = n                
            else:
            # Else if the current ngram is shorter than longest ngram we have matched in this name,
            # this probably suggests the current ngram is part of the longest ngram we matched before,
            # thus we need to decrease the frequency count for this shorter ngram as it is not an independent count
                mutable_ngram_freq[ngram] -= 1


In [37]:
# Get the unique countries in the dataset
list_of_countries = df['Country'].unique().tolist()

# Build a dictionary where entry key is the name of the country, 
# entry value is the list of health facility types in that country
country_and_common_facility_types = {}

# Iterate through all the unique countries
for country_name in list_of_countries:
    
    print('Country: ',country_name,'\n')

    # Get the part of the data that belongs to this country 
    country_part = df[df['Country']== country_name].copy()
    
    # Get the total number of health facilities in this country 
    total_facility_count_in_this_country = len(country_part)

    # Get the list of names of the facilities in this country
    name_list = country_part.lowered_cleaned_name.tolist()

    #------------------------------------------------------------------------------------------#
    phrase_counter = Counter()
    # For each name in the name list, tokenize it and count the ngrams in the name, 
    # and add the frequency of ngrams to the phrase counter
    for name in name_list:
        tokens_in_name = name.strip().split(" ")
        for n in range(1,10+1): # Record frequency of ngrams up to 10 tokens' long
            phrase_counter.update( get_unique_ngram(tokens_in_name, n) )
    
    # For each pair of (token_sequence, count) in the phrase_counter, convert it to (n, ngram, count), where n is the "n" in ngram
    ngram_freq = [(len(token_sequence),' '.join(token_sequence), count) for token_sequence, count in phrase_counter.most_common() if count >= round(len(name_list)*0.001)]
    #------------------------------------------------------------------------------------------#
    
    # Sort the ngram_freq list in the reverse order of n, such that longer ngrams is earlier in the list
    ngram_freq = sorted(ngram_freq, key = lambda entry: -entry[0])
  
    # Create a mutable copy of the ngram_freq list
    mutable_ngram_freq = {ngram : count for n, ngram, count in ngram_freq}

    # Iterate through all the names in the name list, use the reduce the frequency count for the ngrams that are part of longer ngram
    for name in name_list:
        reduce_nested_ngram_freq(name)
    
    common_facility_types_in_this_country = []
    minimum_proportion = 0.009
    for ngram, count in mutable_ngram_freq.items():
        # In some cases, there is only one instance of a special kind of health facility in a country. 
        # The criteria here will not capture that, thus it needs to be rediscovered in post processing.
        proportion_in_country = round(count/total_facility_count_in_this_country, 3)
        if count > 1 and proportion_in_country > minimum_proportion: 
            common_facility_types_in_this_country.append((ngram, proportion_in_country, count))

    # Order the list of most frequently occuring facility types by order of their proportion in country, highest comes first
    common_facility_types_in_this_country = sorted(common_facility_types_in_this_country, key = lambda x: -x[1])

    country_and_common_facility_types[country_name] = common_facility_types_in_this_country
    
    print(common_facility_types_in_this_country)
    
    print('----------------------------------\n')

Country:  angola 

[('farmacia', 0.451, 134), ('centro medico', 0.054, 16), ('clinica', 0.047, 14), ('consultorio medico', 0.024, 7), ('ii', 0.017, 5), ('centro de saude', 0.01, 3), ('deposito de medicamentos', 0.01, 3), ('hospital', 0.01, 3), ('mecofarma', 0.01, 3), ('das', 0.01, 3)]
----------------------------------

Country:  benin 

[('pharmacie', 0.196, 84), ('clinique', 0.056, 24), ('centre de sante', 0.049, 21), ('centre de sante de', 0.042, 18), ('ong', 0.028, 12), ('centre medicosocial', 0.023, 10), ('de', 0.023, 10), ('pharmacie de', 0.021, 9), ('centre', 0.016, 7), ('sante', 0.016, 7), ('clinique saint', 0.014, 6), ('pharmacie le', 0.014, 6), ('saint', 0.014, 6), ('centre medical', 0.012, 5), ('cabinet dentaire', 0.012, 5), ('clinique st', 0.012, 5), ('benin', 0.012, 5)]
----------------------------------

Country:  botswana 

[('clinic', 0.438, 42), ('health post', 0.104, 10), ('hospital', 0.104, 10), ('pharmacy', 0.062, 6), ('health clinic', 0.042, 4), ('medical centre', 

[('dispensary', 0.14, 129), ('chemist', 0.139, 128), ('medical clinic', 0.07, 65), ('clinic', 0.064, 59), ('pharmacy', 0.04, 37), ('health centre', 0.034, 31), ('medical centre', 0.023, 21), ('health center', 0.021, 19), ('hospital', 0.021, 19), ('medical center', 0.014, 13), ('centre', 0.012, 11), ('medical', 0.011, 10), ('medical services', 0.01, 9), ('health', 0.01, 9)]
----------------------------------

Country:  lesotho 

[('clinic', 0.31, 13), ('pharmacy', 0.262, 11), ('health centre', 0.143, 6), ('care pharmacy', 0.048, 2), ('centre', 0.048, 2), ('dentist', 0.048, 2), ('doctor', 0.048, 2), ('chemist', 0.048, 2), ('seventh', 0.048, 2), ('adventist', 0.048, 2)]
----------------------------------

Country:  liberia 

[('clinic', 0.368, 270), ('medicine store', 0.106, 78), ('community clinic', 0.049, 36), ('medical clinic', 0.035, 26), ('health center', 0.035, 26), ('town clinic', 0.03, 22), ('hospital', 0.025, 18), ('medical center', 0.016, 12), ('pharmacy', 0.014, 10), ('pharmaci

[('clicks', 0.176, 158), ('pharmacy', 0.12, 108), ('clinic', 0.098, 88), ('dr', 0.05, 45), ('dischem', 0.048, 43), ('hospital', 0.024, 22), ('link', 0.02, 18), ('medical centre', 0.018, 16), ('apteek', 0.017, 15), ('dental', 0.013, 12), ('de', 0.012, 11), ('centre', 0.011, 10), ('clicks pharmacy', 0.01, 9), ('health', 0.01, 9), ('dentist', 0.01, 9)]
----------------------------------

Country:  south sudan 

[('clinic', 0.174, 8), ('phcc', 0.174, 8), ('phcu', 0.152, 7), ('hospital', 0.109, 5), ('pharmacy', 0.087, 4), ('feeding centre', 0.065, 3), ('phu', 0.065, 3), ('primary healthcare', 0.043, 2), ('msf', 0.043, 2)]
----------------------------------

Country:  sudan 

[('sydly', 0.432, 426), ('ltby', 0.026, 26), ('mrkz', 0.024, 24), ('mstshfy', 0.022, 22), ('lhdyth', 0.018, 18), ('sydly d', 0.017, 17), ('2', 0.014, 14), ('sydly bw', 0.012, 12), ('ltb lsnn', 0.011, 11), ('mstshf', 0.01, 10)]
----------------------------------

Country:  togo 

[('pharmacie', 0.243, 83), ('cms', 0.044,

In [42]:
result_df = result_df.dropna(subset=['Type']).reset_index(drop=True)

In [43]:
# Format the results from the common phrases finding process to a DataFrame
result_df = pd.DataFrame(country_and_common_facility_types.items(),columns=['Country','Type'])
result_df = result_df.explode('Type')
result_df[['Type','Proportion within Country','Count']] = pd.DataFrame(result_df['Type'].tolist(), index = result_df.index)
result_df.Country = result_df.Country.apply(str.lower)
result_df.Type = result_df.Type.apply(str).apply(str.lower) # $ update
result_df = result_df.sort_values(['Country','Proportion within Country'], ascending=[True, False]).reset_index(drop=True)

In [45]:
result_df.head(20)

Unnamed: 0,Country,Type,Proportion within Country,Count
0,angola,farmacia,0.451,134.0
1,angola,centro medico,0.054,16.0
2,angola,clinica,0.047,14.0
3,angola,consultorio medico,0.024,7.0
4,angola,ii,0.017,5.0
5,angola,centro de saude,0.01,3.0
6,angola,deposito de medicamentos,0.01,3.0
7,angola,hospital,0.01,3.0
8,angola,mecofarma,0.01,3.0
9,angola,das,0.01,3.0


### Computer aided correction

In [None]:
# From the result above, we can see that common phrases include tokens that might not 
# be description health facility type, but rather places where the facilities are at

# One solution is to check if the first token in the matched type is a place in that country
# or if the token has some speical meaning...

result_df['first_token'] = result_df['Type'].apply(lambda x: x.split()[0])

# To narrow the scope we need to check, first identify some tokens that are obviously part of facility type:
tokens_that_are_part_of_facility_type = ['health', 'centre', 'hospital', 'clinic', 'hopital', 'district', 'postos', 'st', 'community', 'centro', 'primary', 'ii', 'mission', 'rural', 'poste', '2', 'dispensaire', '1', 'dispensary', 'general', 'central', 'hospitais', 'government', 'provincial', 'i', 'basic', 'mini', 'town', 'polyclinique', 'regional', 'clinique', 'hospitalier', 'urbain', 'public', 'de', 'referral', 'new', 'maternal', 'catholique'] # print(result_df['first_token'].value_counts()[result_df['first_token'].value_counts()>1].index.tolist())

# The dataframe below contains the facility types that might not be a valid facility type.
potential_places_as_prefix = result_df[~result_df['first_token'].isin(tokens_that_are_part_of_facility_type)].copy()

In [None]:
from scraper_functions import *

In [None]:
# Automate the process of searching these token on Google, record our decision when seeing the search results

def ask_google(list_of_search_keyword, input_guide):
    answers = []
    for search_keyword in list_of_search_keyword:
        go_to_page(driver, 'https://www.google.com/search?q='+search_keyword)
        answers.append(input(input_guide))
        print('\n')
    return answers

list_of_search_keyword = (potential_places_as_prefix['Country'] + '+' + potential_places_as_prefix['first_token']).to_list()

answers = ask_google(list_of_search_keyword, 'Is this a place or place type or facility or unable to tell? (p/t/f/u)')
potential_places_as_prefix['category_of_first_token'] = answers

In [None]:
# # The recorded answers from previous run:
potential_places_as_prefix['category_of_first_token'] = list('ttfpppfttutuffuptuppupppufpppppuppppuuuutptpppptppppffpufupppppppufftfpfptuffffffuffftfttftftffuppppufpfffpffffppppppppppfpppppppppppfpppptpffpfffuutupupptfuufpuupppuupfuuupppfffppufppufppppp')

During the Google Search process, take notes of the special abbreviation:

- RCH : Reproductive Child Health
- GFPA : Gambia Family Planning Association
- AIC : Africa Inland Church
- SDA : Seventh Day Adventist
- PHC : Primary Health Care
- FOSACOM : Formation Sanitaire Communautaire
- OPD : Out-Patient Department
- SRCS : Somali Red Crescent Society
- CHC : Community Health Center
- RC : Red Cross
- ACK : Anglican Church of Kenya
- UVS : Unites Villageoises de Sante

In [None]:
# # Inspect the results from our Google Search process
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 'p']
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 't']
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 'f']
# potential_places_as_prefix[potential_places_as_prefix.category_of_first_token == 'u']

In [None]:
# Map the category we recorded back to the main result dataframe
token_to_category_mapping = potential_places_as_prefix.set_index('first_token')['category_of_first_token'].to_dict()
result_df['category_of_first_token'] = result_df['first_token'].apply(lambda x: token_to_category_mapping.get(x, 'f'))
result_df['category_of_first_token'].value_counts()

In [None]:
# Identify the facility types that are common enough to be considered new facility types
new_facility_types = ['municipal hospital', 'sede health centre', 'vaz postos sanitarios', 'pedro postos sanitarios', 'est centre de sante', 'gare centre de sante integre', 'adi health station', 'sud health centre', 'area health centre', 'sud poste de sante', 'county hospital'] # print(result_df.loc[result_df['category_of_first_token']=='t','Type'].tolist())
result_df.loc[result_df['Type'].isin(new_facility_types), 'category_of_first_token'] = 'f'
result_df.loc[result_df['Type'].apply(lambda x: x.isnumeric()), 'category_of_first_token'] = 'u'
result_df = result_df[~result_df['category_of_first_token'].isin(['p', 't', 'u'])].reset_index(drop=True).drop(['first_token','category_of_first_token'], axis=1)

In [None]:
result_df.to_csv('../B_semi_data/type_dict_tim.csv',index=False)

In [None]:
result_df['Country-Type'] = result_df['Country'] + ' - ' +result_df['Type']

### Compare extracted types with type_dict_augmented_1130

In [None]:
type_dict_1130 = pd.read_csv('../github_downloaded_datasets/type_dict_augmented_1130.csv')
type_dict_1130.Country = type_dict_1130.Country.apply(str.lower)
type_dict_1130.Type = type_dict_1130.Type.apply(str.lower)
type_dict_1130 = type_dict_1130.sort_values(['Country','Type']).reset_index(drop=True)
type_dict_1130['Country-Type'] = type_dict_1130['Country'] + ' - ' +type_dict_1130['Type']

In [None]:
assert(sorted(result_df.Country.unique().tolist()) == sorted(type_dict_1130.Country.unique().tolist()))

In [None]:
agreed_country_types = result_df[result_df['Country-Type'].isin(type_dict_1130['Country-Type'].tolist())].reset_index(drop=True)

agreed_country_types

In [None]:
unique_to_type_dict_1130 = type_dict_1130[~type_dict_1130['Country-Type'].isin(result_df['Country-Type'].tolist())].reset_index(drop=True)

unique_to_type_dict_1130

In [None]:
df[(df.Country=='angola')&(df.lowered_cleaned_name.str.contains('centro de saude'))]

In [None]:
unique_to_result_df = result_df[~result_df['Country-Type'].isin(type_dict_1130['Country-Type'].tolist())].reset_index(drop=True)

unique_to_result_df

### Compare extracted types with the types that comes with nature dataset

In [None]:
country_to_facility_types_mapping = result_df.groupby('Country').agg({'Type':list}).to_dict()['Type']

In [None]:
def get_matched_type_and_pure_name(country_name, name):

    facility_types = country_to_facility_types_mapping[country_name]

    facility_types = sorted(facility_types, key = lambda x: -len(x.split()))

    matched_type = np.nan
    for facility_type in facility_types:
        if ' '+facility_type+' ' in ' '+name+' ':
            matched_type = facility_type
            break

    if isinstance(matched_type,str):
        pure_name = name.replace(matched_type,'').strip()
    else:
        pure_name = np.nan
        
    return (matched_type,pure_name)

In [None]:
df['Type'] = df['Type'].apply(str.lower).apply(deaccent)

df[['matched_type','pure_name']] = pd.DataFrame(df[['Country','lowered_cleaned_name']].apply(lambda row: get_matched_type_and_pure_name(row['Country'].lower(), row['lowered_cleaned_name']), axis=1).tolist(), index= df.index)

In [None]:
print('\n'+str(np.round(df.matched_type.notnull().mean()*100,2))+'% of the nature dataset is covered by the extracted types.')

print('\nFor '+str(np.round(sum(df.Type==df.matched_type)/len(df)*100,2))+'% of the nature dataset, provided type and extracted type are exactly the same.')

different_type = df.loc[(df.Type!=df.matched_type) & df.Type.notnull() & df.matched_type.notnull(), ['Country','name','Type','matched_type']]

print('\nFor '+str(np.round(len(different_type)/len(df)*100,2))+'% of the nature dataset, the provided type and extracted type are different. Below are some examples:\n')

different_type.drop_duplicates(subset=['Type','matched_type']).rename(columns={'Type':'original_type'})[:20]