In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
from symspellpy import SymSpell
import time
from itertools import islice
from fuzzywuzzy import fuzz
from ordered_set import OrderedSet
from fuzzywuzzy import process

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('mode.chained_assignment', None)

In [3]:
# data import
dataDir = r"C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\WHO_ISS"
df_iss = gpd.read_file(dataDir+"\\WHO_ISS.gdb", driver='FileGDB', 
                       layer= 'ISS_1020')
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Cleaned"
filename = "clean_names_types_1117.csv"
path = os.path.join(dataDir, filename)
new_cols = pd.read_csv(path)

In [4]:
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\Cleaned"
type_dict = pd.read_csv(dataDir + "//type_dict_1109.csv")

In [5]:
# get index
df_iss.reset_index(inplace=True)
# merge with new columns including 'clean_name_final'
df_iss = df_iss.merge(new_cols, on='index')

In [6]:
# obtain rows with NA in extract_type and clean_name_final has 2 words or more
sample = df_iss[(pd.isna(df_iss['extract_type']))&(df_iss['clean_name_final'].str.count(' ')>0)]

In [7]:
print("Number of data points:", sample.shape[0])

Number of data points: 24708


In [8]:
print("Distribution of data points by country:")
s = sample['Country'].value_counts()
s

Distribution of data points by country:


NIGERIA                             14123
DEMOCRATIC REPUBLIC OF THE CONGO     1792
CHAD                                 1652
MALI                                 1208
LIBERIA                               616
SOUTH SUDAN                           615
MADAGASCAR                            599
CAMEROON                              596
GABON                                 588
NIGER                                 390
CONGO                                 382
CENTRAL AFRICAN REPUBLIC              308
ANGOLA                                256
COTE D'IVOIRE                         215
SENEGAL                               122
BENIN                                 108
EQUATORIAL GUINEA                     107
MALAWI                                103
SOUTH AFRICA                          103
GHANA                                  96
ETHIOPIA                               87
UGANDA                                 83
GUINEA BISSAU                          83
GUINEA                            

In [9]:
# Divide countries into 4 tiers using the number of data points
above_10k = s[s>10000].index.to_list()
btw_1k_10k = s[(s>=1000)&(s<10000)].index.to_list()
btw_100_1k = s[(s>=100)&(s<1000)].index.to_list()
under_100 = s[s<100].index.to_list()

In [10]:
# ignore countries with fewer than 100 data points
countries = above_10k + btw_1k_10k + btw_100_1k

In [11]:
# Write cleaned names into txt file
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\type dict"
for country in countries:
    tmp = sample[sample['Country']==country]
    filename = country+".txt"
    file1 = open(os.path.join(saveDir, filename),"w")
    file1.write(' '.join(list(tmp['clean_name_final'].str.lower()))) 
    file1.close() 

In [12]:
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\type dict"
# generate word frequency dictionary
freq_dict = {}
for country in countries:
    sym_spell = SymSpell()
    filename = country+".txt"
    corpus_path = os.path.join(saveDir, filename)
    sym_spell.create_dictionary(corpus_path)

    d = sym_spell.words
    # sort in decreasing frequency
    sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: -item[1])}
    freq_dict[country] = sorted_d

In [15]:
# Example word frequency dictionary
print(list(islice(freq_dict["GABON"].items(),5)))

[('infirmerie', 121), ('cabinet', 84), ('07', 82), ('chr', 79), ('bele', 74)]


Methodology:

- Select top 10 or top 20 most frequent words with at least some frequency.
- Hand pick words that might contain type information.
- Randomly sample rows where facility name contains the words picked in the previous step.
- Add new rows to type dictionary if applicable.

# Between 100 and 10k

Examine top 10 words with highest frequency and frequency > 20

In [13]:
cols = ['Country', 'name_of_facility_visited', 'clean_name_final', 'type_of_facility_visited']

In [14]:
len(btw_100_1k)

15

In [21]:
country = btw_100_1k[0]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: LIBERIA
Top 10 words:
town 85
st 40
community 32
medical 26
j 25
g 22
francis 21


In [77]:
words = ['community', 'medical']

In [78]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
189,Liberia,Clinic,CLINIC,661.0
190,Liberia,General Hospital,CH,
191,Liberia,Health Centre,HC,41.0
192,Liberia,Hospital,HOSP,29.0
193,Liberia,Mission Hospital,MH,8.0
194,Liberia,National Referral Hospital,NRH,1.0


In [79]:
# sample 10 rows where facility name contains that word
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
43442,LIBERIA,Cinta Town Community,Cinta Town Community,PHC_CENTER
41461,LIBERIA,New Community,New Community,PHC_CENTER
42044,LIBERIA,Flumpa community,Flumpa Community,PHC_CENTER
44862,LIBERIA,Yeamei community,Yeamei Community,PHC_CENTER
41833,LIBERIA,Weala Methodist community,Weala Methodist Community,PHC_CENTER
44236,LIBERIA,Kaneh Community,Kaneh Community,PHC_CENTER
42240,LIBERIA,Yarnwullif community,Yarnwullif Community,PHC_CENTER
43545,LIBERIA,Yeamei community,Yeamei Community,PHC_CENTER
43432,LIBERIA,Flumpa community,Flumpa Community,PHC_CENTER
42312,LIBERIA,Weala community,Weala Community,PHC_CENTER


In [83]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
44359,LIBERIA,Faith Medical,Faith Medical,MCH_CARE_CENTER
41311,LIBERIA,Lukambeh medical,Lukambeh Medical,PHC_CENTER
44571,LIBERIA,Afro medical,Afro Medical,PHC_CENTER
44696,LIBERIA,Faith Medical,Faith Medical,MCH_CARE_CENTER
41473,LIBERIA,Zobo Medical,Zobo Medical,PHC_CENTER
41256,LIBERIA,Dutch Liberian Medical,Dutch Liberian Medical,PHC_CENTER
43012,LIBERIA,All Grace Medical,All Grace Medical,PHC_CENTER
41247,LIBERIA,Anatah Medical,Anatah Medical,PHC_CENTER
41990,LIBERIA,12th Street Medical,12Th Street Medical,PHC_CENTER
44275,LIBERIA,Patience medical,Patience Medical,PHC_CENTER


In [22]:
country = btw_100_1k[1]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: SOUTH SUDAN
Top 10 words:
poc 164
clinic 151
sector 124
iom 101
malakal 97
imc 73
aa 59
2 55
1 46
mayen 45


In [23]:
words = ['poc', 'clinic', 'iom', 'imc']

In [24]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
333,South Sudan,County Hospital,CH,28.0
334,South Sudan,Hospital,HOSP,
335,South Sudan,Primary Health Care Centre,PHCC,332.0
336,South Sudan,Primary Health Care Unit,PHCU,1375.0
337,South Sudan,State Hospital,SH,9.0
338,South Sudan,Teaching Hospital,TH,3.0


In [25]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
292604,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC
292031,SOUTH SUDAN,POC AA IMC clinic,Poc Aa Imc Clinic,PHCC
293017,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC
293753,SOUTH SUDAN,Malakal POC Sector 1 IMC,Malakal Poc Sector 1 Imc,PHCC
293135,SOUTH SUDAN,Malakal POC Sector 1 IMC,Malakal Poc Sector 1 Imc,PHCC
293013,SOUTH SUDAN,POC AA IMC clinic,Poc Aa Imc Clinic,PHCC
292505,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC
293300,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC
293495,SOUTH SUDAN,Malakal POC Sector 2 IOM,Malakal Poc Sector 2 Iom,PHCC
293227,SOUTH SUDAN,POC AA IMC clinic,Poc Aa Imc Clinic,PHCC


In [26]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
292557,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC
293055,SOUTH SUDAN,Mandeng private clinic,Mandeng Private Clinic,PRIVATE_FACILITY
292787,SOUTH SUDAN,Bahr El Ghazal Clinic,Bahr El Ghazal Clinic,PRIVATE_FACILITY
292558,SOUTH SUDAN,Bahr El Ghazal Clinic,Bahr El Ghazal Clinic,PRIVATE_FACILITY
293266,SOUTH SUDAN,POC AA IMC clinic,Poc Aa Imc Clinic,PHCC
291090,SOUTH SUDAN,Tuochriak Mobile Clinic,Tuochriak Mobile Clinic,PHCU
292520,SOUTH SUDAN,Rom clinic,Rom Clinic,PRIVATE_FACILITY
291687,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC
292291,SOUTH SUDAN,Abiemnom Life Clinic,Abiemnom Life Clinic,PRIVATE_FACILITY
293410,SOUTH SUDAN,Poc AA IOM Clinic,Poc Aa Iom Clinic,PHCC


In [27]:
country = btw_100_1k[2]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: MADAGASCAR
Top 10 words:
sud 63
i 46
ville 45
tsiroanomandidy 37
tanambao 29
nord 26
anosibe 26
an 26
ala 26
soanierana 25


In [28]:
words = ['sud']

In [29]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
195,Madagascar,Basic Health Center I,CSB1,
196,Madagascar,Basic Health Center II,CSB2,
197,Madagascar,District Hospital,CHD1,
198,Madagascar,Health Centre,HC,1642.0
199,Madagascar,Health Post,HP,910.0
200,Madagascar,Hospital,HOSP,125.0


In [30]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
46907,MADAGASCAR,ANDILANA SUD,Andilana Sud,Basic_Health_Center_II
47529,MADAGASCAR,MAROVOAY SUD,Marovoay Sud,Basic_Health_Center_II
47855,MADAGASCAR,ANKAZOABO SUD,Ankazoabo Sud,Basic_Health_Center_II
47776,MADAGASCAR,BETIOKY SUD,Betioky Sud,Basic_Health_Center_II
47532,MADAGASCAR,MAROVOAY SUD,Marovoay Sud,Basic_Health_Center_II
46714,MADAGASCAR,MANAMBOTRA SUD,Manambotra Sud,Basic_Health_Center_II
47961,MADAGASCAR,AMBOASARY SUD,Amboasary Sud,Basic_Health_Center_II
48563,MADAGASCAR,SAHANIVOTRY SUD,Sahanivotry Sud,Basic_Health_Center_II
48478,MADAGASCAR,CHRDI AMBOASARY SUD,Chrdi Amboasary Sud,DISTRICT_HOSP
45498,MADAGASCAR,ANDREBAKELY SUD,Andrebakely Sud,Basic_Health_Center_II


In [64]:
country = btw_100_1k[3]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: CAMEROON
Top 10 words:
cm 96
csc 75
csp 62
cabinet 51
soins 43
la 33
fondation 32
csiu 28
ebolowa 28
catholique 23


In [65]:
words = ['cm', 'csc', 'csp', 'cabinet', 'csiu']

In [66]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
42,Cameroon,Centre Medical d'Arrondissement,CMA,223.0
43,Cameroon,Centre de Sante,CS,
44,Cameroon,Centre de Sante Integre,CSI,2241.0
45,Cameroon,Clinic,CLINIC,3.0
46,Cameroon,Dispensary,DISP,48.0
47,Cameroon,Health Centre,HC,362.0
48,Cameroon,Hospital,HOSP,
49,Cameroon,Hospital Centraux,HOSPC,2.0
50,Cameroon,Hospital General,HG,3.0
51,Cameroon,Hospital Regional,HR,14.0


In [67]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
7160,CAMEROON,CM SOSUCAM1,Cm Sosucam1,PRIVATE_FACILITY
9514,CAMEROON,Cm Sosucam2,Cm Sosucam2,PRIVATE_FACILITY
7268,CAMEROON,CMS/CNPS,Cms Cnps,PRIVATE_FACILITY
9283,CAMEROON,CM SOSUCAM1,Cm Sosucam1,PRIVATE_FACILITY
9175,CAMEROON,CME Fondation Chantal Biya,Cme Fondation Chantal Biya,TEACHING_HOSP
7000,CAMEROON,Cm Escadron 14,Cm Escadron 14,MILITARY
7712,CAMEROON,Cm Baptiste,Cm Baptiste,PRIVATE_FACILITY
12137,CAMEROON,CM La Passerelle,Cm La Passerelle,CMA
8018,CAMEROON,CM Notre Dame de la Merci,Cm Notre Dame La Merci,PRIVATE_FACILITY
11976,CAMEROON,CM SOSUCAM1,Cm Sosucam1,PRIVATE_FACILITY


In [68]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
7818,CAMEROON,CSC de Dimako,Csc Dimako,CSI
11545,CAMEROON,Csc Ndelele,Csc Ndelele,CSI
6101,CAMEROON,CSC de Letta,Csc Letta,FAITH_BASED
7904,CAMEROON,CSC Abong-Mbang,Csc Abong Mbang,FAITH_BASED
8131,CAMEROON,CSC de Borongo,Csc Borongo,CSI
10835,CAMEROON,CSC de Guili,Csc Guili,CSI
11698,CAMEROON,CSC MVOLYE,Csc Mvolye,FAITH_BASED
12368,CAMEROON,CSC elat-Minkom,Csc Elat Minkom,FAITH_BASED
11296,CAMEROON,CSC Yokadouma,Csc Yokadouma,CSI
8904,CAMEROON,CSC elat-Minkom,Csc Elat Minkom,FAITH_BASED


In [69]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
5482,CAMEROON,CSP Catholique Mayo-Darle (cnls),Csp Catholique Mayo Darle Cnls,FAITH_BASED
7877,CAMEROON,CSP Bessaye,Csp Bessaye,PRIVATE_FACILITY
12602,CAMEROON,CSP Baptiste Nyamboya,Csp Baptiste Nyamboya,FAITH_BASED
11891,CAMEROON,CSPP Dang,Cspp Dang,FAITH_BASED
6087,CAMEROON,CSP le bien,Csp Le Bien,PRIVATE_FACILITY
7710,CAMEROON,CSP Bessaye,Csp Bessaye,PRIVATE_FACILITY
6246,CAMEROON,CSP Islamique,Csp Islamique,CSI
7770,CAMEROON,CSP la Compassion (CSP Ndokayo),Csp La Compassion Csp Ndokayo,PRIVATE_FACILITY
11964,CAMEROON,CSP Afanetouana,Csp Afanetouana,PRIVATE_FACILITY
11199,CAMEROON,CSP Baptiste Nyamboya,Csp Baptiste Nyamboya,FAITH_BASED


In [70]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[3], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
6128,CAMEROON,Cabinet de Soins le Bon secours,Cabinet Soins Le Bon Secours,PRIVATE_FACILITY
5911,CAMEROON,Cabinet de soin de Bon Samaritain,Cabinet Soin Bon Samaritain,PRIVATE_FACILITY
12348,CAMEROON,Cabinet de Soins Acacias,Cabinet Soins Acacias,PRIVATE_FACILITY
9191,CAMEROON,Cabinet de soins St Etienne (cnls),Cabinet Soins St Etienne Cnls,PRIVATE_FACILITY
12676,CAMEROON,Cabinet Medico-sanitaire LA BENEDICTION,Cabinet Medico Sanitaire La Benediction,PRIVATE_FACILITY
5134,CAMEROON,Cabinet de Soins Grace Divine,Cabinet Soins Grace Divine,CSI
6664,CAMEROON,Cabinet De Soins La Grace,Cabinet Soins La Grace,PRIVATE_FACILITY
12339,CAMEROON,Cabinet de Soins Notre Dame du Rosaire,Cabinet Soins Notre Dame Rosaire,FAITH_BASED
12351,CAMEROON,Cabinet de Soins Notre Dame du Rosaire,Cabinet Soins Notre Dame Rosaire,FAITH_BASED
11469,CAMEROON,Cabinet Soins La Solidarite,Cabinet Soins La Solidarite,PRIVATE_FACILITY


In [71]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[4], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
5613,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
8619,CAMEROON,CSIU N°2 Ebolowa,Csiu Ndeg2 Ebolowa,CSI
10713,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
10188,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
5727,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
10131,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
10186,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
5645,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
5679,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI
5639,CAMEROON,CSIU N°1 Ebolowa,Csiu Ndeg1 Ebolowa,CSI


In [72]:
country = btw_100_1k[4]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: GABON
Top 10 words:
infirmerie 121
cabinet 84
07 82
chr 79
bele 74
port 72
gentil 72
la 49
lycee 40
base 32


In [73]:
words = ['infirmerie', 'cabinet', 'chr']

In [74]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
132,Gabon,Centre Hospitalier Urban,CHU,1.0
133,Gabon,Centre de Sante,CS,
134,Gabon,Centre de Sante Urban,CSU,4.0
135,Gabon,Clinic,CLINIC,
136,Gabon,Dispensary,DISP,387.0
137,Gabon,Health Centre,HC,92.0
138,Gabon,Hospital,HOSP,
139,Gabon,Hospital Cooperation,HOSPC,3.0
140,Gabon,Medical Centre,MC,43.0
141,Gabon,Polyclinic,PCLINIC,


In [75]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
30492,GABON,Infirmerie Lycée General Nazaire Boulingui,Infirmerie Lycee General Nazaire Boulingui,CSI
31688,GABON,Infirmerie de la CEB,Infirmerie La Ceb,PRIVATE_FACILITY
30596,GABON,Infirmerie de la gare,Infirmerie La Gare,PRIVATE_FACILITY
32140,GABON,Infirmerie de la gare,Infirmerie La Gare,PRIVATE_FACILITY
32988,GABON,Infirmerie de la CEB,Infirmerie La Ceb,PRIVATE_FACILITY
31173,GABON,Infirmerie de la CEB,Infirmerie La Ceb,PRIVATE_FACILITY
32352,GABON,Infirmerie de la prison centrale de Tchibanga,Infirmerie La Prison Centrale Tchibanga,CSI
31761,GABON,Infirmerie 7eme region,Infirmerie 7Eme Region,MILITARY
31481,GABON,Infirmerie Lycée Horizon de La Mission Catholique,Infirmerie Lycee Horizon La Mission Catholique,CSI
31714,GABON,Infirmerie du lycée publique François,Infirmerie Lycee Publique Francois,PRIVATE_FACILITY


In [77]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
31125,GABON,Cabinet de kinésithérapie Mbouassing,Cabinet Kinesitherapie Mbouassing,KINESITHERAPIE
31777,GABON,Cabinet Sainte Paulette,Cabinet Sainte Paulette,PRIVATE_FACILITY
31774,GABON,Cabinet de soins Misercirde,Cabinet Soins Misercirde,PRIVATE_FACILITY
31574,GABON,Cabinet Sainte Paulette,Cabinet Sainte Paulette,PRIVATE_FACILITY
32244,GABON,Cabinet Mbolo,Cabinet Mbolo,PRIVATE_FACILITY
31396,GABON,Cabinet Sainte Paulette,Cabinet Sainte Paulette,PRIVATE_FACILITY
32129,GABON,Cabinet de Groupe,Cabinet Groupe,PRIVATE_FACILITY
31512,GABON,Cabinet de Groupe,Cabinet Groupe,PRIVATE_FACILITY
32333,GABON,Cabinet de soins miséricorde,Cabinet Soins Misericorde,PRIVATE_FACILITY
30602,GABON,Cabinet espoir,Cabinet Espoir,PRIVATE_FACILITY


In [78]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
33242,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33594,GABON,CHRPM de Koulamoutou,Chrpm Koulamoutou,STATE_HOSP
32872,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33473,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33235,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33258,GABON,SMI CHRGRL,Smi Chrgrl,STATE_HOSP
33525,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33545,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33476,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP
33579,GABON,CHR Port-Gentil,Chr Port Gentil,STATE_HOSP


In [79]:
country = btw_100_1k[5]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: NIGER
Top 10 words:
n 46
case 40
sante 34


In [80]:
words = ['case', 'sante']

In [81]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
250,Niger,Centre Hospitalier Universitaire,CHU,1.0
251,Niger,Health Hut,HH,2009.0
252,Niger,Hospital,HOSP,40.0
253,Niger,Integrated Health Centre,CSI,836.0


In [82]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
64103,NIGER,CASE DE SANTÉ,Case Sante,CSI
63773,NIGER,Case de Sante de Boulongou Yaskou,Case Sante Boulongou Yaskou,CSI
64967,NIGER,Case de Santé ANIELE,Case Sante Aniele,CSI
64982,NIGER,case de santé Abalama,Case Sante Abalama,CSI
65260,NIGER,Case de oudi peulh,Case Oudi Peulh,CSI
63847,NIGER,Case de santé d' Oudi peulh,Case Sante D Oudi Peulh,OTHER_NON_ORTHORDOX_HC
65311,NIGER,Case de santé,Case Sante,OTHER_NON_ORTHORDOX_HC
63825,NIGER,CASE DE SANTÉ DE BONEGRAL,Case Sante Bonegral,CSI
63794,NIGER,CASE DE SANTÉ DE N'GAGALA,Case Sante N Gagala,CSI
65205,NIGER,CASE DE SANTE LAZARET,Case Sante Lazaret,CSI


In [83]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
63671,NIGER,CASE DE SANTE KORTINIRGA,Case Sante Kortinirga,CSI
65455,NIGER,CASE DE SANTÉ KAWA,Case Sante Kawa,CSI
65205,NIGER,CASE DE SANTE LAZARET,Case Sante Lazaret,CSI
64967,NIGER,Case de Santé ANIELE,Case Sante Aniele,CSI
63816,NIGER,CASE DE SANTE BONEGRAL,Case Sante Bonegral,CSI
64947,NIGER,Case de Santé ANIELE,Case Sante Aniele,CSI
64982,NIGER,case de santé Abalama,Case Sante Abalama,CSI
63793,NIGER,CASE DE SANTÉ DE BIRZOWEYA,Case Sante Birzoweya,CSI
63847,NIGER,Case de santé d' Oudi peulh,Case Sante D Oudi Peulh,OTHER_NON_ORTHORDOX_HC
64103,NIGER,CASE DE SANTÉ,Case Sante,CSI


In [84]:
country = btw_100_1k[6]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: CONGO
Top 10 words:
cabinet 67
cms 52
soins 48
clinic 31
la 24


In [85]:
words = ['cabinet', 'cms', 'clinic']

In [86]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
80,Congo,Centre de Sante Integre,CSI,302.0
81,Congo,Hospital,HOSP,2.0
82,Congo,Hospital Comboutique,HC,9.0
83,Congo,Hospital General,HG,4.0
84,Congo,Poste de Sante,PS,
85,Congo,University Hospital,UH,1.0
86,Congo,l'Hospital de Base,HB,10.0


In [87]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
24818,CONGO,CABINET DE SOINS MAKALÉ,Cabinet Soins Makale,PRIVATE_FACILITY
25007,CONGO,Cabinet le Berger,Cabinet Le Berger,PRIVATE_FACILITY
24932,CONGO,Cabinet de soins oeil SUNDI LOUTETE Bouansa,Cabinet Soins Oeil Sundi Loutete Bouansa,PRIVATE_FACILITY
25078,CONGO,Cabinet de soins la gloire,Cabinet Soins La Gloire,PRIVATE_FACILITY
24675,CONGO,Cabinet de soins Louis Merieux,Cabinet Soins Louis Merieux,PRIVATE_FACILITY
25540,CONGO,Cabinet De Soins La Grace,Cabinet Soins La Grace,PRIVATE_FACILITY
25253,CONGO,CABINET DE SOINS JAPOLEON KINDELE,Cabinet Soins Japoleon Kindele,PRIVATE_FACILITY
24598,CONGO,Cabinet de soins Caba,Cabinet Soins Caba,PRIVATE_FACILITY
25012,CONGO,Cabinet Maman Nsadissa,Cabinet Maman Nsadissa,PRIVATE_FACILITY
25011,CONGO,Cabinet kiniadi,Cabinet Kiniadi,PRIVATE_FACILITY


In [88]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
25965,CONGO,Cms municipal,Cms Municipal,CSI
25820,CONGO,Cms municipal,Cms Municipal,CSI
25283,CONGO,Cms Asv patience,Cms Asv Patience,CMA
25084,CONGO,Cms islamique,Cms Islamique,CMA
25008,CONGO,Cms A.P.B,Cms A P B,CMA
25332,CONGO,CMS MOKABI,Cms Mokabi,PRIVATE_FACILITY
25221,CONGO,Cms soeur Martin ouenze,Cms Soeur Martin Ouenze,CMA
24726,CONGO,CMS seour Martin 2,Cms Seour Martin 2,CSI
25248,CONGO,CMS Soeur Martin,Cms Soeur Martin,CSI
24697,CONGO,CMS Elikia,Cms Elikia,CMA


In [89]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
25562,CONGO,Clinique Moungondo,Clinic Moungondo,PRIVATE_FACILITY
26009,CONGO,Clinique les oliviers,Clinic Les Oliviers,PRIVATE_FACILITY
25547,CONGO,CLINIQUE MÉDICALE MOUNGONDO NKAYI,Clinic Medicale Moungondo Nkayi,PRIVATE_FACILITY
25116,CONGO,Shaty clinique EL-shaddaï,Shaty Clinic El Shaddai,CMA
25023,CONGO,Shaty clinique EL-shaddaï,Shaty Clinic El Shaddai,CMA
25255,CONGO,Clinique Albert LEYONO,Clinic Albert Leyono,DISTRICT_HOSP
25315,CONGO,Clinique Élikia,Clinic Elikia,PRIVATE_FACILITY
24845,CONGO,Clinic leyono,Clinic Leyono,DISTRICT_HOSP
25382,CONGO,Clinique Moungondo,Clinic Moungondo,PRIVATE_FACILITY
24609,CONGO,Clinique securex,Clinic Securex,PRIVATE_FACILITY


In [90]:
country = btw_100_1k[7]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: CENTRAL AFRICAN REPUBLIC
Top 10 words:
csu 80


In [92]:
words = ['csu']

In [91]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
57,Central African Republic,Centre de Sante,CS,2.0
58,Central African Republic,"Centre de Sante ""A""",CSA,43.0
59,Central African Republic,"Centre de Sante ""B""",CSB,44.0
60,Central African Republic,"Centre de Sante ""C""",CSC,57.0
61,Central African Republic,"Centre de Sante ""D""",CSD,2.0
62,Central African Republic,"Centre de Sante ""E""",CSE,61.0
63,Central African Republic,Hospital Centraux,HC,3.0
64,Central African Republic,Hospital District,HD,
65,Central African Republic,Hospital Prefectoraux,HP,12.0
66,Central African Republic,Hospital Regional Universitaire,HRU,5.0


In [93]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
13965,CENTRAL AFRICAN REPUBLIC,CSU MALIMAKA,Csu Malimaka,CSI
13818,CENTRAL AFRICAN REPUBLIC,CSU DE KOUALLO,Csu Kouallo,CSI
13968,CENTRAL AFRICAN REPUBLIC,CSU CASTOR,Csu Castor,CSI
13971,CENTRAL AFRICAN REPUBLIC,Csu castors,Csu Castors,CSI
13418,CENTRAL AFRICAN REPUBLIC,CSU TOKOYO,Csu Tokoyo,CSI
14328,CENTRAL AFRICAN REPUBLIC,CSU GOBONGO,Csu Gobongo,CSI
13247,CENTRAL AFRICAN REPUBLIC,CSU YAPELE,Csu Yapele,CSI
14270,CENTRAL AFRICAN REPUBLIC,CSU DE Lakouanga,Csu Lakouanga,CSI
13957,CENTRAL AFRICAN REPUBLIC,CSU PETEVO,Csu Petevo,CSI
14274,CENTRAL AFRICAN REPUBLIC,CSU PETEVO,Csu Petevo,CSI


In [95]:
country = btw_100_1k[8]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: ANGOLA
Top 10 words:
comuna 31
sede 23


In [96]:
words = ['comuna', 'sede']

In [97]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
0,Angola,Central Hospital,CH,3.0
1,Angola,Centro Materno Infantil,CMI,39.0
2,Angola,Centro Sanatorio Materno Infantil,CSMI,3.0
3,Angola,Centro de Saude,CS,231.0
4,Angola,General Hospital,GH,3.0
5,Angola,Hospital,HOSP,32.0
6,Angola,Municipal Hospital,MH,100.0
7,Angola,Posto de Saude,PS,1152.0
8,Angola,Provincial Hospital,PH,11.0
9,Angola,Regional Hospital,RH,1.0


In [98]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
1228,ANGOLA,Comuna Kaholo,Comuna Kaholo,OTHER_NON_ORTHORDOX_HC
2937,ANGOLA,COMUNA Sede Luvango.,Comuna Sede Luvango,CSI
1652,ANGOLA,Comuna sede,Comuna Sede,CSI
2943,ANGOLA,COMUNA Capelongo,Comuna Capelongo,DISTRICT_HOSP
2877,ANGOLA,Comuna Hoque,Comuna Hoque,CSI
2152,ANGOLA,Comuna da Huila,Comuna Huila,CSI
2878,ANGOLA,Comuna Hoque,Comuna Hoque,CSI
2170,ANGOLA,Comuna sede vila da Paula,Comuna Sede Vila Paula,CSI
995,ANGOLA,Comuna de Luvaka,Comuna Luvaka,CSI
2966,ANGOLA,Comuna sede,Comuna Sede,CSI


In [99]:
country = btw_100_1k[9]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: COTE D'IVOIRE
Top 10 words:
dr 45
fsu 29
pmi 26
com 21


In [100]:
words = ['fsu', 'pmi', 'com']

In [101]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
87,Cote d'Ivoire,Centre Medico-Social,CMS,33.0
88,Cote d'Ivoire,Centre de Sante Rural,CSR,1330.0
89,Cote d'Ivoire,Centre de Sante Urban,CSU,329.0
90,Cote d'Ivoire,Hospital General,HG,77.0
91,Cote d'Ivoire,Hospitalier Regional,HR,19.0
92,Cote d'Ivoire,Hospitalier Universitaire,HU,4.0


In [102]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
26585,COTE D'IVOIRE,FSU TANGUELAN,Fsu Tanguelan,CSI
26358,COTE D'IVOIRE,FSU COM GESCO,Fsu Com Gesco,CSI
26877,COTE D'IVOIRE,FSU COM KOUTE,Fsu Com Koute,CSI
26883,COTE D'IVOIRE,FSU COM NIANGON SUD,Fsu Com Niangon Sud,CSI
27021,COTE D'IVOIRE,FSU COM VRIDI CANAL,Fsu Com Vridi Canal,CSI
27006,COTE D'IVOIRE,Fsu com Abobo Baoulé,Fsu Com Abobo Baoule,CSI
26941,COTE D'IVOIRE,FSUCOM KOUTE,Fsucom Koute,CSI
26873,COTE D'IVOIRE,FSU COM TOIT ROUGE,Fsu Com Toit Rouge,CSI
26580,COTE D'IVOIRE,FSU 22O LOGTS,Fsu 22O Logts,DISTRICT_HOSP
26955,COTE D'IVOIRE,FSU 220 logements,Fsu 220 Logements,CSI


In [103]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
26198,COTE D'IVOIRE,PMI SINFRA,Pmi Sinfra,CSI
26339,COTE D'IVOIRE,PMI Touba,Pmi Touba,STATE_HOSP
26855,COTE D'IVOIRE,PMI GAGNOA,Pmi Gagnoa,CSI
26862,COTE D'IVOIRE,PMI OUMÉ,Pmi Oume,CSI
26404,COTE D'IVOIRE,PMI DALOA,Pmi Daloa,CSI
26481,COTE D'IVOIRE,PMI Bardot,Pmi Bardot,CSI
26394,COTE D'IVOIRE,PMI ABOISSO,Pmi Aboisso,CSI
26531,COTE D'IVOIRE,PMI Ferke,Pmi Ferke,DISTRICT_HOSP
26317,COTE D'IVOIRE,PMI Korhogo,Pmi Korhogo,CSI
26694,COTE D'IVOIRE,PMI ABENGOUROU,Pmi Abengourou,CSI


In [105]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
26875,COTE D'IVOIRE,FSU COM KOWEÏT,Fsu Com Koweit,CSI
27014,COTE D'IVOIRE,Fsu com adioposoume,Fsu Com Adioposoume,CSI
26872,COTE D'IVOIRE,FSU COM OUASSAKARA,Fsu Com Ouassakara,CSI
26237,COTE D'IVOIRE,FSUCOM TOI ROUGE,Fsucom Toi Rouge,CSI
26987,COTE D'IVOIRE,FSU COM ANONKOUA KOUTE,Fsu Com Anonkoua Koute,FAITH_BASED
26494,COTE D'IVOIRE,FSU Com SAGBE,Fsu Com Sagbe,PRIVATE_FACILITY
26941,COTE D'IVOIRE,FSUCOM KOUTE,Fsucom Koute,CSI
26883,COTE D'IVOIRE,FSU COM NIANGON SUD,Fsu Com Niangon Sud,CSI
26989,COTE D'IVOIRE,FSU COM ANONKOUA KOUTE,Fsu Com Anonkoua Koute,FAITH_BASED
26676,COTE D'IVOIRE,FSU COM Abobo Sagbé,Fsu Com Abobo Sagbe,PRIVATE_FACILITY


In [106]:
country = btw_100_1k[10]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: SENEGAL
Top 10 words:


In [110]:
country = btw_100_1k[11]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: BENIN
Top 10 words:
cabinet 25


In [111]:
words = ['cabinet']

In [112]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
10,Benin,Centre Hospitalier Departemental,CHD,5.0
11,Benin,Centre Medical,CM,24.0
12,Benin,Centre Medico-Social,CMS,3.0
13,Benin,Centre National Hospitalier Universitaire,CNHU,2.0
14,Benin,Centre de Sante,CS,
15,Benin,Centre de Sante Central,CSC,11.0
16,Benin,Centre de Sante d'Arrondissement,CSA,306.0
17,Benin,Centre de Sante de Sous-Prefecture,CSSP,8.0
18,Benin,Centro de Sante de Circonscription Urbane,CSCU,1.0
19,Benin,Clinic,CLINIC,4.0


In [113]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
2996,BENIN,Cabinet de soins privé aafiana,Cabinet Soins Prive Aafiana,PRIVATE_FACILITY
2993,BENIN,CABINET DE SOINS LA GLOIRE/ONG ENTRAIDE,Cabinet Soins La Gloire Ong Entraide,PRIVATE_FACILITY
3281,BENIN,Cabinet HOZIANA,Cabinet Hoziana,PRIVATE_FACILITY
3286,BENIN,Cabinet de Soins ENTRAIDE,Cabinet Soins Entraide,PRIVATE_FACILITY
3295,BENIN,Cabinet de soins la tolérance,Cabinet Soins La Tolerance,PRIVATE_FACILITY
3268,BENIN,CABINET DE SOINS INFIRMIERS DE KOKOBE,Cabinet Soins Infirmiers Kokobe,OTHER_NON_ORTHORDOX_HC
3301,BENIN,CABINET DE SOINS BEKAKOUA,Cabinet Soins Bekakoua,PRIVATE_FACILITY
3025,BENIN,Cabinet de soins Suaaba,Cabinet Soins Suaaba,PRIVATE_FACILITY
2987,BENIN,Cabinet de soins infirmiers la gloire,Cabinet Soins Infirmiers La Gloire,PRIVATE_FACILITY
3296,BENIN,Cabinet de soins somperou,Cabinet Soins Somperou,PRIVATE_FACILITY


In [114]:
country = btw_100_1k[12]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: EQUATORIAL GUINEA
Top 10 words:


In [115]:
country = btw_100_1k[13]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: SOUTH AFRICA
Top 10 words:
paediatric 35
ward 24


In [116]:
words = ["paediatric", "ward"]

In [117]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
321,South Africa,Clinic,CLINIC,3435.0
322,South Africa,Community Health Centre,CHC,284.0
323,South Africa,Community Health Centre (After hours),CHCA,9.0
324,South Africa,Community Health Centre/Clinic,CHCC,8.0
325,South Africa,District Hospital,DH,254.0
326,South Africa,Health Post,HP,34.0
327,South Africa,Hospital,HOSP,
328,South Africa,Medical Centre,MC,1.0
329,South Africa,National Central Hospital,NCH,9.0
330,South Africa,Provincial Tertiary Hospital,PTH,17.0


In [120]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
289590,SOUTH AFRICA,Paediatric wards,Paediatric Wards,DISTRICT_HOSPITAL
289566,SOUTH AFRICA,Paediatric ward,Paediatric Ward,DISTRICT_HOSPITAL
289665,SOUTH AFRICA,Paediatric ward,Paediatric Ward,DISTRICT_HOSPITAL
289606,SOUTH AFRICA,Paediatric wards,Paediatric Wards,DISTRICT_HOSPITAL
289580,SOUTH AFRICA,Paediatric ward,Paediatric Ward,DISTRICT_HOSPITAL
289521,SOUTH AFRICA,Paediatric OPD,Paediatric Opd,DISTRICT_HOSPITAL
289189,SOUTH AFRICA,"Paediatric ward,",Paediatric Ward,DISTRICT_HOSPITAL
289801,SOUTH AFRICA,Paediatric wards,Paediatric Wards,STATE_HOSPITAL
289671,SOUTH AFRICA,Paediatric ward,Paediatric Ward,DISTRICT_HOSPITAL
289188,SOUTH AFRICA,Paediatric unit,Paediatric Unit,PRIVATE_HOSPITAL


In [121]:
country = btw_100_1k[14]
print("Country name:", country)
print("Top 10 words:")
for k,v in islice(freq_dict[country].items(), 10):
    if v>20:
        print(k,v)

Country name: MALAWI
Top 10 words:
st 26


# Between 1k and 10k

Top 20, frequency > 50

In [123]:
len(btw_1k_10k)

3

In [125]:
country = btw_1k_10k[0]
print("Country name:", country)
print("Top 20 words:")
for k,v in islice(freq_dict[country].items(), 20):
    if v>50:
        print(k,v)

Country name: DEMOCRATIC REPUBLIC OF THE CONGO
Top 20 words:
saint 106
la 103
2 96
st 66
1 59


In [132]:
country = btw_1k_10k[1]
print("Country name:", country)
print("Top 20 words:")
for k,v in islice(freq_dict[country].items(), 20):
    if v>50:
        print(k,v)

Country name: CHAD
Top 20 words:
urban 328
2 93
nord 75
est 62
1 60


In [133]:
words = ['urban', 'nord', 'est']

In [134]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
68,Chad,Centre de Sante,CS,
69,Chad,Health Centre,HC,1205.0
70,Chad,Hospital de District,HD,70.0
71,Chad,Hospital de Nationaux,HN,1.0
72,Chad,Hospital de Regional,HR,7.0


In [136]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
15162,CHAD,Moussoro urbain,Moussoro Urban,DISTRICT_HOSP
24388,CHAD,Bousso Urbain,Bousso Urban,DISTRICT_HOSP
23200,CHAD,Mani urbain,Mani Urban,OTHER_NON_ORTHORDOX_HC
15453,CHAD,Dafra urbain,Dafra Urban,OTHER_NON_ORTHORDOX_HC
19488,CHAD,Baktchoro urbain,Baktchoro Urban,OTHER_NON_ORTHORDOX_HC
18747,CHAD,Guereda Urbain,Guereda Urban,OTHER_NON_ORTHORDOX_HC
23912,CHAD,Bousso Urbain,Bousso Urban,DISTRICT_HOSP
18146,CHAD,Bongor Urbain 2,Bongor Urban 2,OTHER_NON_ORTHORDOX_HC
18866,CHAD,Moussoro urbain,Moussoro Urban,DISTRICT_HOSP
21721,CHAD,Pala-Urbain,Pala Urban,OTHER_NON_ORTHORDOX_HC


In [137]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
17781,CHAD,Kyabe Urbain Nord,Kyabe Urban Nord,OTHER_NON_ORTHORDOX_HC
21648,CHAD,ATI Nord,Ati Nord,DISTRICT_HOSP
20691,CHAD,AMTIMAN NORD,Amtiman Nord,OTHER_NON_ORTHORDOX_HC
19135,CHAD,AMTIMAN NORD,Amtiman Nord,OTHER_NON_ORTHORDOX_HC
18371,CHAD,Goundi Nord,Goundi Nord,FAITH_BASED
22869,CHAD,Diguel Nord,Diguel Nord,FAITH_BASED
23555,CHAD,Koumra nord,Koumra Nord,OTHER_NON_ORTHORDOX_HC
24030,CHAD,Diguel Nord,Diguel Nord,OTHER_NON_ORTHORDOX_HC
14880,CHAD,ATI Nord,Ati Nord,DISTRICT_HOSP
15464,CHAD,AMTIMAN NORD,Amtiman Nord,OTHER_NON_ORTHORDOX_HC


In [138]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
24308,CHAD,Biltine EST,Biltine Est,OTHER_NON_ORTHORDOX_HC
17120,CHAD,Mongo ouest,Mongo Ouest,OTHER_NON_ORTHORDOX_HC
19493,CHAD,Biltine EST,Biltine Est,OTHER_NON_ORTHORDOX_HC
23561,CHAD,ATI Est,Ati Est,OTHER_NON_ORTHORDOX_HC
15041,CHAD,Bainamar est,Bainamar Est,DISTRICT_HOSP
24263,CHAD,Biltine EST,Biltine Est,OTHER_NON_ORTHORDOX_HC
22268,CHAD,Diguel Est,Diguel Est,FAITH_BASED
19846,CHAD,Wallia Est,Wallia Est,FAITH_BASED
18473,CHAD,Goundi Est,Goundi Est,FAITH_BASED
21679,CHAD,Diguel Est,Diguel Est,OTHER_NON_ORTHORDOX_HC


In [139]:
country = btw_1k_10k[2]
print("Country name:", country)
print("Top 20 words:")
for k,v in islice(freq_dict[country].items(), 20):
    if v>50:
        print(k,v)

Country name: MALI
Top 20 words:
cabinet 655
medical 485
traditherapeute 79
keneya 58


In [140]:
words = ['cabinet', 'medical', 'traditherapeute']

In [141]:
type_dict[type_dict['Country'].str.upper()==country]

Unnamed: 0,Country,Type,Abbreviation,count
209,Mali,Centre de Kinesitherapie,CK,
210,Mali,Centre de Sante,CS,
211,Mali,Clinic,CLINIC,94.0
212,Mali,Community Health Centre,CSCOM,1294.0
213,Mali,Hospital,HOSP,14.0
214,Mali,Polyclinic,PCLINIC,11.0
215,Mali,Referral Health Centre,CSREF,61.0
216,Mali,Regional Hospital,RH,1.0
217,Mali,University Hospital,UH,3.0


In [144]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
51189,MALI,Cabinet Médical Nani,Cabinet Medical Nani,PRIVATE_FACILITY
51529,MALI,Cabinet de Soins la lumière,Cabinet Soins La Lumiere,PRIVATE_FACILITY
53953,MALI,Cabinet de consultation et de soin pour sage f...,Cabinet Consultation Et Soin Pour Sage Femme,PRIVATE_FACILITY
54468,MALI,Cabinet médical DOUCOURE,Cabinet Medical Doucoure,PRIVATE_FACILITY
54681,MALI,Cabinet de soins Fanto,Cabinet Soins Fanto,PRIVATE_FACILITY
53916,MALI,Cabinet Médical Dr Diabaté,Cabinet Medical Dr Diabate,PRIVATE_FACILITY
53597,MALI,Cabinet Keneya Angolême,Cabinet Keneya Angoleme,TEACHING_HOSP
54362,MALI,Cabinet de consultation et de soins pour sage ...,Cabinet Consultation Et Soins Pour Sage Femme,PRIVATE_FACILITY
55215,MALI,Cabinet Sage Femme Mande,Cabinet Sage Femme Mande,PRIVATE_FACILITY
55058,MALI,Cabinet de traditerapeute Keneyabloulon,Cabinet Traditerapeute Keneyabloulon,TRAD_HEALER


In [145]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
51377,MALI,Tradithérapeute Nama Sory,Traditherapeute Nama Sory,TRAD_HEALER
51349,MALI,Traditherapeute Jean,Traditherapeute Jean,TRAD_HEALER
51093,MALI,Cabinet du tradithérapeute Sado Traoré,Cabinet Traditherapeute Sado Traore,TRAD_HEALER
53087,MALI,Traditherapeute Salifou Doumbia N'Golodougou,Traditherapeute Salifou Doumbia N Golodougou,TRAD_HEALER
51493,MALI,Traditherapeute Abdoulaye,Traditherapeute Abdoulaye,TRAD_HEALER
51209,MALI,Tradithérapeute SadioTraoré,Traditherapeute Sadiotraore,TRAD_HEALER
51403,MALI,Traditherapeute Cissé,Traditherapeute Cisse,TRAD_HEALER
53374,MALI,Traditherapeute Cissé,Traditherapeute Cisse,TRAD_HEALER
50900,MALI,Traditherapeute niena centrale,Traditherapeute Niena Centrale,TRAD_HEALER
51001,MALI,Cabinet tradithérapeute le miracle,Cabinet Traditherapeute Le Miracle,TRAD_HEALER


# 10k and above

Top 20, frequency > 100

In [147]:
len(above_10k)

1

In [149]:
country = above_10k[0]
print("Country name:", country)
print("Top 20 words:")
for k,v in islice(freq_dict[country].items(), 20):
    if v>100:
        print(k,v)

Country name: NIGERIA
Top 20 words:
maternity 1203
chemist 703
informant 610
community 565
camp 388
idp 351
home 338
mallam 310
mai 295
cki 291
town 278
setter 265
bone 251
nursing 250
key 246
wcwc 240
tba 239
ibrahim 231
mal 220
adamu 217


In [150]:
words = ['maternity', 'chemist', 'informant', 'community', 'camp', 'idp',
        'home', 'mai', 'cki', 'town', 'setter', 'bone', 'nursing',
        'key', 'wcwc', 'tba', 'mal']

In [151]:
len(words)

17

In [214]:
type_dict[type_dict['Country']=='Nigeria']

Unnamed: 0,Country,Type,Abbreviation,count
254,Nigeria,Basic Health Centre,BHC,568.0
255,Nigeria,Clinic,CLINIC,4354.0
256,Nigeria,Comprehensive Health Centre,CHC,434.0
257,Nigeria,Comprehensive Primary Health Care,CPHC,
258,Nigeria,Cottage Hospital,CH,149.0
259,Nigeria,Dispensary,DISP,3239.0
260,Nigeria,District Hospital,DH,16.0
261,Nigeria,Family Support Program,FSP,
262,Nigeria,Federal Medical Centre,FMC,19.0
263,Nigeria,General Hospital,GH,529.0


In [152]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
109264,NIGERIA,Ngbo Maternity,Ngbo Maternity,PHC_CENTER
261782,NIGERIA,Dango maternity,Dango Maternity,PHC_CENTER
208249,NIGERIA,Tafawabalewa Town Maternity,Tafawabalewa Town Maternity,PHC_CENTER
66993,NIGERIA,Nafisatu mahmud maternity,Nafisatu Mahmud Maternity,PHC_CENTER
164828,NIGERIA,kujuru maternity,Kujuru Maternity,PHC_CENTER
188978,NIGERIA,Alagarno maternity,Alagarno Maternity,PHC_CENTER
191122,NIGERIA,Kafin Iya maternity,Kafin Iya Maternity,PHC_CENTER
226380,NIGERIA,Ngbo Maternity,Ngbo Maternity,PHC_CENTER
79121,NIGERIA,Urban maternity Azare,Urban Maternity Azare,PHC_CENTER
185981,NIGERIA,BALM OF GRACE MATERNITY,Balm Of Grace Maternity,PRIVATE_FACILITY


In [154]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[1], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
193539,NIGERIA,Habi Chemist,Habi Chemist,PATENT_MED_VENDORS
112495,NIGERIA,Alheri Chemist Ingawa,Alheri Chemist Ingawa,PATENT_MED_VENDORS
186269,NIGERIA,Gabriel Chemist and mobile TBA Kajola,Gabriel Chemist And Mobile Tba Kajola,PATENT_MED_VENDORS
262072,NIGERIA,LADAN CHEMIST,Ladan Chemist,PATENT_MED_VENDORS
196977,NIGERIA,SULEIMAN CHEMIST,Suleiman Chemist,PATENT_MED_VENDORS
194768,NIGERIA,Babankano Chemist,Babankano Chemist,PATENT_MED_VENDORS
195539,NIGERIA,Danladi salihu chemist,Danladi Salihu Chemist,PATENT_MED_VENDORS
178553,NIGERIA,Hudu chemist,Hudu Chemist,PATENT_MED_VENDORS
179150,NIGERIA,"Rahamaniya Chemist, Shira",Rahamaniya Chemist Shira,PATENT_MED_VENDORS
194925,NIGERIA,Dangaske Chemist,Dangaske Chemist,PATENT_MED_VENDORS


In [156]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[2], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
190144,NIGERIA,Community informant Mubarak,Community Informant Mubarak,PATENT_MED_VENDORS
178284,NIGERIA,Key informant,Key Informant,OTHER_NON_ORTHORDOX_HC
196936,NIGERIA,Community informant,Community Informant,PATENT_MED_VENDORS
199122,NIGERIA,Jimma Arzika(community Informant),Jimma Arzika Community Informant,TRAD_SPIRIT_HEALER
240232,NIGERIA,INFORMANT TBA,Informant Tba,TRAD_SPIRIT_HEALER
202862,NIGERIA,Community Informant (Baban Yara),Community Informant Baban Yara,TRAD_SPIRIT_HEALER
181671,NIGERIA,Community key informant,Community Key Informant,TRAD_SPIRIT_HEALER
197404,NIGERIA,Community Informant- Gidan Bature Yashim,Community Informant Gidan Bature Yashim,TRAD_SPIRIT_HEALER
162890,NIGERIA,ChiefJohn Ameh community Informant,Chiefjohn Ameh Community Informant,OTHER_NON_ORTHORDOX_HC
197501,NIGERIA,COMMUNITY KEY INFORMANT (BABANDI DODO),Community Key Informant Babandi Dodo,TRAD_SPIRIT_HEALER


In [157]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[3], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
199746,NIGERIA,Community key informant,Community Key Informant,TRAD_SPIRIT_HEALER
199289,NIGERIA,Community informant,Community Informant,SPIRIT_HEALER
193336,NIGERIA,Kolo aji (Avadar community informant),Kolo Aji Avadar Community Informant,TRAD_SPIRIT_HEALER
188206,NIGERIA,Key community informant,Key Community Informant,TRAD_SPIRIT_HEALER
198292,NIGERIA,Usman Garba(community informants ),Usman Garba Community Informants,TRAD_SPIRIT_HEALER
188382,NIGERIA,Malam mai jega Community informant,Malam Mai Jega Community Informant,OTHER_NON_ORTHORDOX_HC
184274,NIGERIA,Bone Setter (Community Informant),Bone Setter Community Informant,TRAD_SPIRIT_HEALER
197571,NIGERIA,COMMUNITY KEY INFORMANT.(SULEIMAN HASHIM),Community Key Informant Suleiman Hashim,OTHER_NON_ORTHORDOX_HC
244500,NIGERIA,Community informant,Community Informant,TRAD_SPIRIT_HEALER
246327,NIGERIA,Community key informant,Community Key Informant,PATENT_MED_VENDORS


In [159]:
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[4], case=False))][cols].sample(10)

Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
191057,NIGERIA,Kofa IDP Camp,Kofa Idp Camp,PHC_CENTER
229340,NIGERIA,NRC IDP Camp,Nrc Idp Camp,PHC_CENTER
139155,NIGERIA,Teachers village IDP Camp,Teachers Village Idp Camp,PHC_CENTER
166226,NIGERIA,IDP TRANSIT CAMP,Idp Transit Camp,PHC_CENTER
211173,NIGERIA,IDP CAMP KWALE,Idp Camp Kwale,PHC_CENTER
155695,NIGERIA,IDP camp Malkohi,Idp Camp Malkohi,PHC_CENTER
144381,NIGERIA,Ngala ISS IDPS camp,Ngala Iss Idps Camp,PHC_CENTER
208777,NIGERIA,TSANGAYA IDP CAMP CLINI,Tsangaya Idp Camp Clini,PHC_CENTER
155834,NIGERIA,20 HOUSES IDP CAMP CLINC,20 Houses Idp Camp Clinc,PHC_CENTER
270145,NIGERIA,IDP Camp,Idp Camp,PHC_CENTER


In [164]:
print("word:", words[6])
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[6], case=False))][cols].sample(10)

word: home


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
195247,NIGERIA,Dadin Kowa Nursing and Meternity Home,Dadin Kowa Nursing And Meternity Home,PRIVATE_FACILITY
203333,NIGERIA,OPEOLU MATERNITY HOME,Opeolu Maternity Home,PRIVATE_FACILITY
267403,NIGERIA,ALHERI NURSING HOME BOKKOS,Alheri Nursing Home Bokkos,PRIVATE_FACILITY
202314,NIGERIA,TBA HOME,Tba Home,TRAD_SPIRIT_HEALER
158580,NIGERIA,MWAM Nursing and Maternity Home Mirke,Mwam Nursing And Maternity Home Mirke,PRIVATE_FACILITY
258873,NIGERIA,HAMDALA MATERNITY HOME,Hamdala Maternity Home,PRIVATE_FACILITY
108071,NIGERIA,SHOLLYLAD MATERNITY HOME,Shollylad Maternity Home,PHC_CENTER
116871,NIGERIA,Al Hilal Nursing home and maternity,Al Hilal Nursing Home And Maternity,OTHER_NON_ORTHORDOX_HC
196888,NIGERIA,Kainuwa Nursing Home,Kainuwa Nursing Home,PRIVATE_FACILITY
90754,NIGERIA,ALHERI NURSING HOME BOKKOS,Alheri Nursing Home Bokkos,PRIVATE_FACILITY


In [168]:
word = words[7]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: mai


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
196209,NIGERIA,Chindo Maimaganin Gargajiya Gori,Chindo Maimaganin Gargajiya Gori,TRAD_SPIRIT_HEALER
187348,NIGERIA,Mal. Baba Ibrahim mai maganin gargajiya Bambam.,Mal Baba Ibrahim Mai Maganin Gargajiya Bambam,PHC_CENTER
189635,NIGERIA,Kafin maigari,Kafin Maigari,PHC_CENTER
237785,NIGERIA,Mal Abdu Mai Yaji,Mal Abdu Mai Yaji,TRAD_SPIRIT_HEALER
235650,NIGERIA,London Mai Dorowa,London Mai Dorowa,PHC_CENTER
202949,NIGERIA,Abbas Mai Magani,Abbas Mai Magani,TRAD_SPIRIT_HEALER
178923,NIGERIA,Adamuje Mai Magani Yeskawal,Adamuje Mai Magani Yeskawal,TRAD_SPIRIT_HEALER
239976,NIGERIA,Kachalla Ali Mai,Kachalla Ali Mai,PATENT_MED_VENDORS
195581,NIGERIA,Abdullahi mai magani,Abdullahi Mai Magani,TRAD_SPIRIT_HEALER
197794,NIGERIA,Abdullahi Garba Mai Magani,Abdullahi Garba Mai Magani,TRAD_SPIRIT_HEALER


In [170]:
word = words[8]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: cki


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
201600,NIGERIA,CKI Doko Bayan Tasha,Cki Doko Bayan Tasha,PATENT_MED_VENDORS
201894,NIGERIA,CKI Gagarawa Tasha,Cki Gagarawa Tasha,SPIRIT_HEALER
193136,NIGERIA,CKI Mal. Ali Sarkin Aska,Cki Mal Ali Sarkin Aska,TRAD_SPIRIT_HEALER
182126,NIGERIA,Galadiman wanzamai CKI,Galadiman Wanzamai Cki,OTHER_NON_ORTHORDOX_HC
203854,NIGERIA,CKI (TBA) Rabi Musa,Cki Tba Rabi Musa,OTHER_NON_ORTHORDOX_HC
198282,NIGERIA,CKI Galadi Yamma,Cki Galadi Yamma,OTHER_NON_ORTHORDOX_HC
200739,NIGERIA,CKI(Sarkin Magani),Cki Sarkin Magani,OTHER_NON_ORTHORDOX_HC
197639,NIGERIA,SULEIMAN HABU (CKI),Suleiman Habu Cki,TRAD_SPIRIT_HEALER
202156,NIGERIA,CKI Amadu Dan Burin,Cki Amadu Dan Burin,TRAD_SPIRIT_HEALER
201295,NIGERIA,Muku CKI,Muku Cki,TRAD_SPIRIT_HEALER


In [173]:
word = words[9]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: town


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
204099,NIGERIA,Town Maternity,Town Maternity,PHC_CENTER
181861,NIGERIA,"Town Maternity, Alkakeri",Town Maternity Alkakeri,PHC_CENTER
209346,NIGERIA,Dambam Town Maternity,Dambam Town Maternity,PHC_CENTER
213727,NIGERIA,Town Maternity,Town Maternity,PHC_CENTER
142154,NIGERIA,Town Maternity Alkaleri,Town Maternity Alkaleri,PHC_CENTER
234409,NIGERIA,Ningi town maternity,Ningi Town Maternity,PHC_CENTER
211639,NIGERIA,Jama'are Town maternity,Jama Are Town Maternity,PHC_CENTER
147086,NIGERIA,Gamawa Town Maternity,Gamawa Town Maternity,PHC_CENTER
234556,NIGERIA,"Town Maternity, Bununu",Town Maternity Bununu,PHC_CENTER
124853,NIGERIA,Itas Town Maternity,Itas Town Maternity,PHC_CENTER


In [177]:
word = words[10]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: setter


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
241122,NIGERIA,Walawala bone setter,Walawala Bone Setter,TRAD_SPIRIT_HEALER
202398,NIGERIA,MUHAMMADU DANLAMSO Bone Setter,Muhammadu Danlamso Bone Setter,OTHER_NON_ORTHORDOX_HC
194004,NIGERIA,ALH. Isah Bone setter,Alh Isah Bone Setter,OTHER_NON_ORTHORDOX_HC
197430,NIGERIA,Jibrin Adamu (Bone Setter),Jibrin Adamu Bone Setter,TRAD_SPIRIT_HEALER
203559,NIGERIA,Mohammed Ibrahim Bone Setter,Mohammed Ibrahim Bone Setter,TRAD_SPIRIT_HEALER
201764,NIGERIA,Bone Setter,Bone Setter,OTHER_NON_ORTHORDOX_HC
187225,NIGERIA,Alh Ebantu Tradition bone setter,Alh Ebantu Tradition Bone Setter,TRAD_SPIRIT_HEALER
185113,NIGERIA,KPOYE INKIA BONE SETTER,Kpoye Inkia Bone Setter,TRAD_SPIRIT_HEALER
195248,NIGERIA,ABU NEPA BONE SETTER,Abu Nepa Bone Setter,TRAD_SPIRIT_HEALER
197483,NIGERIA,Na'abu Adamu (Bone setter),Na Abu Adamu Bone Setter,TRAD_SPIRIT_HEALER


In [180]:
word = words[12]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: nursing


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
195700,NIGERIA,Kainuwa Nursing Home,Kainuwa Nursing Home,PRIVATE_FACILITY
164451,NIGERIA,Nakowa Nursing Home,Nakowa Nursing Home,PRIVATE_FACILITY
108581,NIGERIA,Dadin Kowa Nursing and Maternity Home,Dadin Kowa Nursing And Maternity Home,PRIVATE_FACILITY
165249,NIGERIA,Haske Weta Nursing Home Song,Haske Weta Nursing Home Song,PRIVATE_FACILITY
157267,NIGERIA,St Francis Nursing Home Bazza.,St Francis Nursing Home Bazza,PRIVATE_FACILITY
244159,NIGERIA,SHALOM NURSING HOME ASHAKA,Shalom Nursing Home Ashaka,PRIVATE_FACILITY
199359,NIGERIA,Ayeboi Nursing and Maternity Home,Ayeboi Nursing And Maternity Home,PRIVATE_FACILITY
94083,NIGERIA,Nasara Nursing and Maternity Home,Nasara Nursing And Maternity Home,PRIVATE_FACILITY
257109,NIGERIA,Dange Nursing home,Dange Nursing Home,PRIVATE_FACILITY
193323,NIGERIA,Victoria Memorial Nursing Home,Victoria Memorial Nursing Home,PHC_CENTER


In [182]:
word = words[13]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: key


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
110841,NIGERIA,Christiana Ogaba Key Informant,Christiana Ogaba Key Informant,OTHER_NON_ORTHORDOX_HC
245433,NIGERIA,Community key informant,Community Key Informant,OTHER_NON_ORTHORDOX_HC
188496,NIGERIA,KASUWAR KOFA KEY INFORMANT,Kasuwar Kofa Key Informant,PATENT_MED_VENDORS
185327,NIGERIA,Community key informant,Community Key Informant,OTHER_NON_ORTHORDOX_HC
194546,NIGERIA,Key Informants Site (Shabbo),Key Informants Site Shabbo,TRAD_SPIRIT_HEALER
165887,NIGERIA,Fatima Hassan IDP Key Informant,Fatima Hassan Idp Key Informant,TRAD_SPIRIT_HEALER
202092,NIGERIA,Bakangizo Pharmacy (Pharm. Fortune Onwumelu) -...,Bakangizo Pharmacy Pharm Fortune Onwumelu Key ...,PATENT_MED_VENDORS
200166,NIGERIA,Key Informant - Mallam Umar Sule,Key Informant Mallam Umar Sule,TRAD_SPIRIT_HEALER
203280,NIGERIA,Karen Pharmacy (Key informant),Karen Pharmacy Key Informant,PATENT_MED_VENDORS
245209,NIGERIA,Community key informant,Community Key Informant,OTHER_NON_ORTHORDOX_HC


In [184]:
word = words[14]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: wcwc


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
169272,NIGERIA,WCWC ANK,Wcwc Ank,PHC_CENTER
104500,NIGERIA,WCWC Birnin magaji,Wcwc Birnin Magaji,PHC_CENTER
162841,NIGERIA,WCWC Anka,Wcwc Anka,PHC_CENTER
209309,NIGERIA,WCWC BUNGUDU,Wcwc Bungudu,PHC_CENTER
174336,NIGERIA,WCWC Sabin Birni,Wcwc Sabin Birni,PHC_CENTER
223883,NIGERIA,WCWC Maradun,Wcwc Maradun,PHC_CENTER
175057,NIGERIA,WCWC BUNGUDU,Wcwc Bungudu,PHC_CENTER
84803,NIGERIA,WCWC BUKKUYUM,Wcwc Bukkuyum,PHC_CENTER
146987,NIGERIA,Dr karima wcwc,Dr Karima Wcwc,PHC_CENTER
173325,NIGERIA,WCWC Anka,Wcwc Anka,PHC_CENTER


In [186]:
word = words[15]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: tba


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
191623,NIGERIA,Aishetu TBA,Aishetu Tba,TRAD_SPIRIT_HEALER
197965,NIGERIA,Saude TBA,Saude Tba,OTHER_NON_ORTHORDOX_HC
242869,NIGERIA,Koji Zhira Tukuli TBA,Koji Zhira Tukuli Tba,OTHER_NON_ORTHORDOX_HC
244789,NIGERIA,EDWIN TBA,Edwin Tba,OTHER_NON_ORTHORDOX_HC
192444,NIGERIA,TBA Ekeremor,Tba Ekeremor,TRAD_SPIRIT_HEALER
201579,NIGERIA,Paulina Obed TBA,Paulina Obed Tba,OTHER_NON_ORTHORDOX_HC
192099,NIGERIA,Ede TBA Odi,Ede Tba Odi,TRAD_SPIRIT_HEALER
192813,NIGERIA,TBA Aleibiri,Tba Aleibiri,OTHER_NON_ORTHORDOX_HC
247351,NIGERIA,HAJARA TBA,Hajara Tba,OTHER_NON_ORTHORDOX_HC
202559,NIGERIA,Halima TBA,Halima Tba,TRAD_SPIRIT_HEALER


In [187]:
word = words[16]
print("word:", word)
sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(word+' | '+word, case=False))][cols].sample(10)

word: mal


Unnamed: 0,Country,name_of_facility_visited,clean_name_final,type_of_facility_visited
243470,NIGERIA,FANNA MALLUM,Fanna Mallum,OTHER_NON_ORTHORDOX_HC
244784,NIGERIA,NPHC malisa,Nphc Malisa,PHC_CENTER
202105,NIGERIA,Mal.jamilu musa,Mal Jamilu Musa,OTHER_NON_ORTHORDOX_HC
155519,NIGERIA,PHONE Malaete,Phone Malaete,PHC_CENTER
198503,NIGERIA,Albert B Mali Chimbi,Albert B Mali Chimbi,TRAD_SPIRIT_HEALER
196459,NIGERIA,Mal. Inusa madugu(CKI),Mal Inusa Madugu Cki,TRAD_SPIRIT_HEALER
215988,NIGERIA,Mono Malam isa,Mono Malam Isa,PHC_CENTER
175861,NIGERIA,Mal Muazu,Mal Muazu,OTHER_NON_ORTHORDOX_HC
190683,NIGERIA,SUNNA ANIMAL DRUG,Sunna Animal Drug,PATENT_MED_VENDORS
197435,NIGERIA,Furera mal. Muktari,Furera Mal Muktari,SPIRIT_HEALER


In [None]:
country = above_10k[0]
print("Country name:", country)
print("Top 20 words:")
for k,v in islice(freq_dict[country].items(), 20):
    if v>50:
        print(k,v)
        
type_dict[type_dict['Country'].str.upper()==country]

sample[(sample['Country']==country) & 
       (sample['clean_name_final'].str.contains(words[0], case=False))][cols].sample(10)

# New Additions

In [8]:
# New rows to be added
additions = [["South Sudan", "Clinic", "CLINIC", np.nan],
             ["Cameroon", "Cabinet de Soins", "CABINET SOINS", np.nan],
             ["Gabon", "Infirmerie", "INFIRMERIE", np.nan],
             ["Gabon", "Cabinet de Soins", "CABINET SOINS", np.nan],
             ["Niger", "Case de Sante", "CASE SANTE", np.nan],
             ["Congo", "Cabinet de Soins", "CABINET SOINS", np.nan],
             ["Congo", "Clinic", "CLINIC", np.nan],
             ["Benin", "Cabinet de Soins", "CABINET SOINS", np.nan],
             ["Mali", "Cabinet Medical", "CABINET MEDICAL", np.nan],
             ["Mali", "Traditherapeute", 'TRADITHERAPEUTE', np.nan],
             ['Nigeria', 'Maternity', 'MATERNITY', np.nan],
             ['Nigeria', 'Town Maternity', 'TOWN MATERNITY', np.nan],
             ['Nigeria', 'Nursing and Maternity Home', 'Nursing and Maternity Home', np.nan],
             ['Nigeria', 'Nursing Home', 'Nursing Home', np.nan],
             ['Nigeria', 'Chemist', 'Chemist', np.nan]]

In [9]:
type_dict_additions = pd.DataFrame(additions, columns = ['Country', 'Type', 'Abbreviation', 'count'])
type_dict_additions.head()

Unnamed: 0,Country,Type,Abbreviation,count
0,South Sudan,Clinic,CLINIC,
1,Cameroon,Cabinet de Soins,CABINET SOINS,
2,Gabon,Infirmerie,INFIRMERIE,
3,Gabon,Cabinet de Soins,CABINET SOINS,
4,Niger,Case de Sante,CASE SANTE,


In [10]:
type_dict.shape

(401, 4)

In [12]:
# Add new rows to type dictionary
type_dict = pd.concat([type_dict, type_dict_additions])

# Spelling Check

In [13]:
# hospital
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\spelling check"
filename = "hospital.txt"
file1 = open(os.path.join(saveDir, filename),"w")
# obtain a list of words that appear in clean_name_final
clean_names = ' '.join(list(sample['clean_name_final'].str.lower())).split()
# keep just words that start with h and have length greater than 4
names_h = [name for name in clean_names if name.startswith('h') and len(name)>4]
file1.write(' '.join(names_h))
file1.close() 

# generate word frequency dictionary
sym_spell = SymSpell()
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\spelling check"
filename = "hospital.txt"
corpus_path = os.path.join(saveDir, filename)
sym_spell.create_dictionary(corpus_path)
d = sym_spell.words

# compute similarity score with respect to the original word
hospital_spellings = []
for word in d.keys():
    if fuzz.ratio(word, 'hospital')>80:
        hospital_spellings.append(word)
        print(word)

hospial
hosipital
haspital
hpspital
hosital
hodpital
hopistal
hospita
hispital
hosfital
hosputal
hospitl
hoslital
hospitak
hospitsl
hospitaal
hospital


In [14]:
# clinic
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\spelling check"
filename = "clinic.txt"
file1 = open(os.path.join(saveDir, filename),"w")
clean_names = ' '.join(list(sample['clean_name_final'].str.lower())).split()
names_c = [name for name in clean_names if name.startswith('c') and len(name)>=4]
file1.write(' '.join(names_c))
file1.close() 

sym_spell = SymSpell()
saveDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\Africa\spelling check"
filename = "clinic.txt"
corpus_path = os.path.join(saveDir, filename)
sym_spell.create_dictionary(corpus_path)
d = sym_spell.words

clinic_spellings = []
for word in d.keys():
    if fuzz.ratio(word, 'clinic')>80:
        clinic_spellings.append(word)
        print(word)

clinica
clinical
clinic
cilinic
clini
clinc
clnic
chinic
clunic
clanic
cinic
clinit
clinlc
clinin
chlinic
clinix


In [16]:
# correct misspellings 
sample['clean_name_final'] = sample['clean_name_final'].str.replace('|'.join(hospital_spellings),'hospital',
                                                                            case=False)\
.str.replace('|'.join(clinic_spellings),'clinic',case=False)

In [19]:
def remove_type_info(df, type_dict, clean_name, clean_name_final, country):
    # remove whitespace between abbreviations of length 2 or 3
    # e.g. change C S to CS
    
    # obtain abbreviations of length 2 or 3
    tmp = type_dict[type_dict['Abbreviation'].str.len()<=3]['Abbreviation'].unique()
    # sort by decreasing length
    tmp = sorted(tmp, key=len, reverse=True)
    # change it to the pattern '^c s ' or ' c s$'
    tmp_dict = {}
    for t in tmp:
        tmp_dict[t] = ['^'+' '.join(list(t))+' ', ' '+' '.join(list(t))+'$']
    # replace the pattern with 'cs'
    for t in tmp:
        pats = tmp_dict[t]
        df[clean_name] = df[clean_name].str.replace(pats[0], t+' ',case=False)\
        .str.replace(pats[1], ' '+t, case=False)
        
    # remove type information
    df_grouped = df.groupby(country)
    res = pd.DataFrame()

    for group_name, df_group in df_grouped:
        # obtain the type dictionary for that country
        tmp = type_dict[type_dict['Country'].str.upper()==group_name.upper()]

        # facility types for that country
        types = list(tmp['Type'])
        type_keywords = set()
        for t in types:
            # add the full facility type 
            t = t.title()
            type_keywords.add(t)                 

            # add individual words as well
            t = t.replace('/', ' ')
            words = t.split(' ')
            # skip words that have punctuation / numbers and have length <= 3 (e.g. de, (major))
            words = [w for w in words if w.isalpha() and len(w)>3]
            for w in words:
                type_keywords.add(w)

        # obtain the list of type keywords and sort in descending length
        type_keywords = list(type_keywords)
        type_keywords = sorted(type_keywords, key=lambda s: -len(s))

        # abbreviations for that country
        abbrevs = set(tmp['Abbreviation'])

        abb_keywords = []
        for abbrev in abbrevs:
            # e.g. for CS, 4 patterns are considered: '^CS ', ' CS ', ' CS$', '^CS$'
            abbrev = abbrev.title()
            abb_keywords.extend(['^'+abbrev+'\s', '\s'+abbrev+'\s', '\s'+abbrev+'$',
                                '^'+abbrev+'$'])

        # obtain the list of abbreviation keywords and sort in descending length
        abb_keywords = sorted(abb_keywords, key=lambda s: -len(s))  

        # some country-specific adjustments
        if group_name == 'UGANDA':
            df_group['clean_name'] = df_group['clean_name'].str.replace("HC II$", "HCII", case=False)\
            .str.replace("HC III$", "HCIII", case=False)\
            .str.replace("HC IV$", "HCIV", case=False)

        if group_name == 'MALAWI':
            df_group['clean_name'] = df_group['clean_name'].str.replace(" DHO$", " DH", case=False)

        if group_name == "ERITREA":
            df_group['clean_name'] = df_group['clean_name'].str.replace(" HO$", " HOSP", case=False)

        if group_name == 'MADAGASCAR':
            df_group['clean_name'] = df_group['clean_name'].str.replace("csb 1", " csb1", case=False)
            df_group['clean_name'] = df_group['clean_name'].str.replace("csb 2", " csb2", case=False)

        # handle situations when type is 'Hospital District' in the type dictionary 
        # but name column has 'District Hospital' in ISS data
        type_len_2 = [t for t in type_keywords if len(t.split())==2]
        for t in type_len_2:
            df_group[clean_name] = df_group[clean_name].str.title()\
            .str.replace(' '.join(t.split()[::-1]), t, case=False)

        # remove type information using keywords generated above
        # remove meaningless connecting words like de, do, da, du
        df_group[clean_name_final] = df_group[clean_name].str.title()\
            .str.replace('|'.join(type_keywords), '')\
            .str.replace('|'.join(abb_keywords), ' ')\
            .str.strip()\
            .str.replace('^de | de | de$|^de$|^do | do | do$|^do$|^da | da | da$|^da$|^du | du | du$|^du$', 
                         ' ', case=False)\
            .str.replace("  ", " ")\
            .str.strip()\
            .str.title()
        res = pd.concat([res, df_group])
    return res

In [20]:
def extract_type(df, clean_name, clean_name_final, extract_type):
    extract_types = []

    for idx, row in df.iterrows():
        name = row[clean_name].upper()
        name_final = row[clean_name_final].upper()

        # if clean_name_final is exactly the same as clean_name,
        # this indicates no type information can be extracted, thus append NA
        if name.upper() == name_final.upper():
            extract_types.append(np.nan)

        else:
            name = OrderedSet(name.split())
            name_final = OrderedSet(name_final.split())
            # find the difference between two names
            diff = ' '.join(list(name.difference(name_final)))
            extract_types.append(diff.strip())

    # remove de, do, da, du at start or end of extract_type
    # replace empty string with NA
    df[extract_type] = extract_types
    df[extract_type] = df[extract_type].str.strip()\
        .str.replace("  ", " ")\
        .str.replace('^de |^do |^da |^du | du$| de$| do$| da$|^de$|^do$|^da$|^du$', '', case=False)\
        .str.replace('^de |^do |^da |^du | du$| de$| do$| da$|^de$|^do$|^da$|^du$', '', case=False)\
        .str.strip()\
        .str.title()\
        .replace('',np.nan)
    # replace empty string with NA
    df['clean_name_final'].replace('', np.nan, inplace=True)

In [21]:
def map_type(df, country, extract_type, sub_type, score, type_dict):
    df_grouped = df.groupby(country)
    res = pd.DataFrame()
    for country_name in df[country].unique():
        df_group = df[df[country]==country_name]
        # obtain facility types and abbreviations for that country
        tmp = type_dict[type_dict['Country'].str.upper()==country_name.upper()]
        types, abbrevs = tmp['Type'], tmp['Abbreviation']
        sub_types = []
        scores = []

        for idx, row in df_group.iterrows():
            # if extract_type is NA, just append NA
            if not isinstance(row[extract_type],str):
                sub_types.append(np.nan)
                scores.append(np.nan)

            # find best match
            else:
                match, match_score = process.extractOne(row[extract_type], list(types)+list(abbrevs), 
                                               scorer = fuzz.ratio)
                scores.append(match_score)
                # if best match is abbreviation, map it to the corresponding type
                if match in list(abbrevs):
                    match_type = tmp[tmp['Abbreviation']==match]['Type'].iloc[0]
                    sub_types.append(match_type)
                else:
                    sub_types.append(match) 
        df_group[sub_type] = sub_types
        df_group[score] = scores
        res = pd.concat([res, df_group])
    return res

In [22]:
# remove type information
res = remove_type_info(sample, type_dict=type_dict, clean_name='clean_name_final', 
                       clean_name_final='clean_name_final', country='Country')

In [26]:
# obtain facility type extracted
extract_type(df=res, clean_name='clean_name', 
             clean_name_final='clean_name_final', extract_type='extract_type')

In [27]:
print("Percentage of NA in extract type column:",
     round(res['extract_type'].isna().sum()/res.shape[0]*100,1))
print("Number of NA values in extract type column:", res[pd.isna(res['extract_type'])].shape[0])

Percentage of NA in extract type column: 85.1
Number of NA values in extract type column: 21036


In [30]:
# map facility type extracted to type in type dictionary
res = map_type(df=res, country = 'Country', extract_type='extract_type', 
               sub_type='sub_type', score='score', type_dict=type_dict)

In [31]:
print("Summary statistics of match score:")
res['score'].describe()

Summary statistics of match score:


count    3672.000000
mean       94.676198
std        13.305599
min        40.000000
25%       100.000000
50%       100.000000
75%       100.000000
max       100.000000
Name: score, dtype: float64

In [33]:
# randomly sample rows to examine results
cols = ['Country', 'name_of_facility_visited', 'clean_name', 'clean_name_final', 'extract_type',
       'sub_type', 'score']
res[~pd.isna(res['extract_type'])][cols].sample(5)

Unnamed: 0,Country,name_of_facility_visited,clean_name,clean_name_final,extract_type,sub_type,score
54351,MALI,Cabinet médical fraternité,Cabinet Medical Fraternite,Fraternite,Cabinet Medical,Cabinet Medical,100.0
180954,NIGERIA,Shakatafi chemist,Shakatafi Chemist,Shakatafi,Chemist,Chemist,100.0
32982,GABON,Infirmerie de la CEB,Infirmerie De La Ceb,La Ceb,Infirmerie,Infirmerie,100.0
194308,NIGERIA,Alheri Chemist Ingawa,Alheri Chemist Ingawa,Alheri Ingawa,Chemist,Chemist,100.0
201542,NIGERIA,TBA HOME,Tba Home,Tba,Home,Nursing Home,50.0


In [39]:
# Add new results back to the whole dataset
res = pd.concat([res, df_iss[~((pd.isna(df_iss['extract_type']))&(df_iss['clean_name_final'].str.count(' ')>0))]])

In [40]:
res.shape

(305182, 51)

In [41]:
# Export results
cols= ['index', 'Country', 'clean_name', 'clean_name_final',
       'extract_type', 'sub_type', 'score']
saveDir = r'C:\\Users\\DUANYUEYUN\\Documents\\GRID3\\Health facilities\\Data\\Africa'
res[cols].to_csv(saveDir+"\\clean_names_types_1130.csv", index=False)

In [42]:
# Export results
type_dict.to_csv(saveDir+'\\type_dict_augmented_1130.csv', index=False)