### Import the Automatic Type Extraction function

In [None]:
from extract_types import *
from util import *

### Prepare the WHO Sub-Saharan Dataset

In [None]:
import pandas as pd
import numpy as np
from tabulate import tabulate
# Load the raw facility list and give the two columns used downstream short names.
df = pd.read_csv('data/sub-saharan_health_facilities.csv')
df = df.rename(columns = {'Facility n':'name','Facility t':'given_type'})

# Keep an untouched copy of the name so cleaned values can be traced back.
df['orig_name'] = df['name'].copy()

# Treat missing and whitespace-only names as NaN so they are dropped below.
# pandas reads empty CSV cells as float NaN, so calling .strip() on every
# value unconditionally would raise AttributeError on the first missing name.
df['name'] = df['name'].apply(
    lambda x: np.nan if pd.isna(x) or (isinstance(x, str) and x.strip() == '') else x
)
df = df.dropna(subset=['name']).copy()

# clean_string is not defined in this cell; presumably it is provided by the
# star imports in the first cell (extract_types / util).
# NOTE(review): given_type is not filtered for missing values before cleaning;
# confirm clean_string tolerates NaN.
df['name'] = df['name'].apply(clean_string)
df['given_type'] = df['given_type'].apply(clean_string)

### Demonstrate the Results from the Common Bottom-up Approach

In [None]:
# Baseline: count every 1- to 5-word n-gram in a reproducible sample of names.
SAMPLE_FRACTION = 0.1
name_list = df.sample(frac=SAMPLE_FRACTION, random_state=0).loc[:, 'name'].tolist()
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1, 5))
ngram_sparse_matrix = count_vectorizer.fit_transform(name_list)
vocabulary = count_vectorizer.vocabulary_

# Sum the columns on the sparse matrix itself. Converting to a dense array
# first allocates a (names x n-grams) matrix, which is needlessly large for
# 1- to 5-grams. The sparse sum returns a 1 x n matrix, so flatten it to 1-D.
ngram_counts = np.asarray(ngram_sparse_matrix.sum(axis=0)).ravel()

# Use the real number of sampled names as the denominator; len(df) * fraction
# is only an approximation because sample() rounds to a whole number of rows.
sample_size = len(name_list)

# One (n-gram, proportion, count) row per vocabulary entry, most frequent first.
ngram_freq_with_sklearn = sorted(
    [(k, round(ngram_counts[i] / sample_size, 3), ngram_counts[i]) for k, i in vocabulary.items()],
    key=lambda x: -x[-1],
)
print(tabulate(ngram_freq_with_sklearn[:18], headers=['Type', 'Proportion', 'Count'], tablefmt='fancy_grid'))

### Run the proposed function on the Nature Dataset

In [None]:
# Get the unique countries in the dataset. Missing values are dropped first:
# NaN never compares equal to anything, so a NaN "country" would select no
# rows below and pass an empty list to extract_types.
list_of_countries = df['Country'].dropna().unique().tolist()

# Build a dictionary where entry key is the name of the country,
# entry value is the list of health facility types in that country
country_and_common_facility_types = {}

# Iterate through all the unique countries
for country_name in list_of_countries:

    print('Country: ',country_name,'\n')

    # Get the facility names that belong to this country
    country_part = df.loc[df['Country']== country_name, 'name'].tolist()

    # extract_types is defined outside this cell. Its result is tabulated under
    # the headers Type / Proportion / Count, so each row is presumably a
    # (type, proportion, count) triple -- confirm against extract_types.
    common_facility_types_in_this_country = extract_types(country_part, return_proportion = True)

    country_and_common_facility_types[country_name] = common_facility_types_in_this_country

    print(tabulate(common_facility_types_in_this_country, headers=['Type', 'Proportion', 'Count'], tablefmt='fancy_grid'))

    print('\n-------------------------------------------------------------------------------------\n')

In [None]:
# Format the results from the common phrases finding process to a DataFrame.
# Each dictionary value is expected to be a list of (type, proportion, count)
# rows; flatten them into one record per (country, type) pair.
# Building the records directly, rather than explode() followed by unpacking,
# skips countries whose list is empty. explode() would emit a NaN placeholder
# row for them, and that breaks the tuple-unpacking step.
result_records = [
    (country, *type_row)
    for country, type_rows in country_and_common_facility_types.items()
    for type_row in type_rows
]
result_df = pd.DataFrame(
    result_records,
    columns=['Country', 'Type', 'Proportion within Country', 'Count'],
)

# Alphabetical by country, most common type first within each country.
result_df = result_df.sort_values(['Country','Proportion within Country'], ascending=[True, False]).reset_index(drop=True)

During the Google Search process, take note of the following special abbreviations:

- RCH : Reproductive Child Health
- GFPA : Gambia Family Planning Association
- AIC : Africa Inland Church
- SDA : Seventh Day Adventist
- PHC : Primary Health Care
- FOSACOM : Formation Sanitaire Communautaire
- OPD : Out-Patient Department
- SRCS : Somali Red Crescent Society
- CHC : Community Health Center
- RC : Red Cross
- ACK : Anglican Church of Kenya
- UVS : Unités Villageoises de Santé