In [1]:
!pip install -U pip setuptools wheel



In [2]:
!pip install -U spacy[cuda113]
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_trf

Collecting spacy[cuda113]
  Downloading spacy-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
     |████████████████████████████████| 6.0 MB 8.1 MB/s            
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     |████████████████████████████████| 181 kB 68.7 MB/s            
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
     |████████████████████████████████| 451 kB 59.0 MB/s            
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
     |████████████████████████████████| 628 kB 65.7 MB/s            
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.

In [3]:
import numpy as np
np.random.seed(42)

In [4]:
import spacy
nlp_small = spacy.load('en_core_web_sm')
nlp_med = spacy.load('en_core_web_md')
nlp_large = spacy.load('en_core_web_lg')
nlp_trf = spacy.load('en_core_web_trf')

In [5]:
# Download nationality - country mapping
!wget -L https://raw.githubusercontent.com/knowitall/chunkedextractor/master/src/main/resources/edu/knowitall/chunkedextractor/demonyms.csv

--2021-12-05 23:46:37--  https://raw.githubusercontent.com/knowitall/chunkedextractor/master/src/main/resources/edu/knowitall/chunkedextractor/demonyms.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 43308 (42K) [text/plain]
Saving to: ‘demonyms.csv’


2021-12-05 23:46:38 (8.26 MB/s) - ‘demonyms.csv’ saved [43308/43308]



In [6]:
!pip install countryinfo

Collecting countryinfo
  Downloading countryinfo-0.1.2-py3-none-any.whl (602 kB)
[?25l     |▌                               | 10 kB 26.8 MB/s eta 0:00:01     |█                               | 20 kB 26.7 MB/s eta 0:00:01     |█▋                              | 30 kB 16.1 MB/s eta 0:00:01     |██▏                             | 40 kB 17.6 MB/s eta 0:00:01     |██▊                             | 51 kB 7.8 MB/s eta 0:00:01      |███▎                            | 61 kB 8.5 MB/s eta 0:00:01      |███▉                            | 71 kB 7.2 MB/s eta 0:00:01      |████▍                           | 81 kB 8.0 MB/s eta 0:00:01      |█████                           | 92 kB 8.0 MB/s eta 0:00:01      |█████▍                          | 102 kB 7.3 MB/s eta 0:00:01     |██████                          | 112 kB 7.3 MB/s eta 0:00:01     |██████▌                         | 122 kB 7.3 MB/s eta 0:00:01     |███████                         | 133 kB 7.3 MB/s eta 0:00:01     |███████▋             

In [7]:
import pandas as pd
from countryinfo import CountryInfo
from collections import Counter

In [8]:
nationality_df = pd.read_csv('demonyms.csv', header=None)
demonyms = {nationality.lower(): country.lower()  for nationality, country in zip(nationality_df[0], nationality_df[1])}

In [9]:
country = CountryInfo()
all_countries = country.all()
country_lat_longs = {}
country_alt_names = {}
lower_to_original_country_map = {}
for country in all_countries:
  country_dict = all_countries[country]
  country_name = country_dict['name'].lower()
  lower_to_original_country_map[country_name] = country_dict['name']
  if 'demonym' in country_dict:
    demonym = country_dict['demonym'].lower()
    if demonym not in demonyms: 
      demonyms[demonym] = country_name
  ISO_dict = country_dict['ISO']
  alpha2, alpha3 = ISO_dict['alpha2'].lower(), ISO_dict['alpha3'].lower()
  alt_names = set()
  alt_names.update([alpha2, alpha3])
  if 'altSpellings' in country_dict:
    alt_names.update(list(map(lambda x: x.lower(), country_dict['altSpellings'])))
  country_alt_names[country_name] = alt_names
  if 'latlng' in country_dict:
    country_lat_longs[country_name] = country_dict['latlng']
  elif 'capital_latlng' in country_dict:
    country_lat_longs[country_name] = country_dict['capital_latlng']

In [10]:
demonyms_list = list(demonyms.keys())
demonyms_list.sort()

In [11]:
from functools import partial
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ner")
geocode = partial(geolocator.geocode, language="en")

In [12]:
def get_location_from_place(place):
  return geocode(place)

In [13]:
def get_country_from_location(location):
  if location:
    return location.address.split(', ')[-1].lower()
  return None

In [14]:
def get_lat_long_from_location(location):
  if location:
    return [location.latitude, location.longitude]
  return None

In [15]:
def get_lat_long_from_country(country):
  if country:
    if country in country_lat_longs:
      return country_lat_longs[country]
    else:
      geocode_obj = geocode(country)
      return get_lat_long_from_location(geocode_obj)
  return None

In [16]:
import random
random.seed(42)

In [17]:
# We add random jitter so points don't stack on top of each other
def add_jitter_to_lat_long(lat_long, jitter_range=0.5):
  if lat_long:
    jitters = (random.uniform(-jitter_range, jitter_range),  random.uniform(-jitter_range, jitter_range))
    lat_with_jitter, long_with_jitter = lat_long[0] + jitters[0], lat_long[1] + jitters[1]
    return [lat_with_jitter, long_with_jitter]
  return None

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import os
os_path = os.getcwd()

In [19]:
EXCEL_PATH = "FSN EGM IE data for ML group.xlsx"

In [21]:
#@title Select variables
sort_variable= 'intervention' #@param ['intervention', 'Intervention_subdomain']
file_path = join(os_path, EXCEL_PATH)
data = pd.read_excel(file_path, sheet_name= 'ie_dta',  engine= 'openpyxl')

In [22]:
TITLE_NAME = 'title_name'
ABSTRACT = 'abstract'
INTERVENTION_DOMAIN = 'Intervention_domain'
INTERVENTION_SUBDOMAIN = 'Intervention_subdomain'
INTERVENTION_DESC = 'Intervention_description'
COUNTRY_NAME = 'country_name'
INTERVENTION = 'intervention'
OUTCOME_TYPE = 'outcome_type'
OUTCOME_DESC = 'Outcome_description'
OUTCOME_GROUP_X = 'outcome_group X'
OUTCOME_SUBGROUP = 'outcome__subgroup'
TITLE_ABSTRACT = 'title_abstract'
TITLE_ABSTRACT_INTERVENTION_DESC = 'title_abstract_intervention_desc'
TITLE_ABSTRACT_OUTCOME_DESC = 'title_abstract_outcome_desc'
TITLE_ABSTRACT_INTERVENTION_DESC_OUTCOME_DESC = 'title_abstract_intervention_desc_outcome_desc'

In [23]:
# Keep only relevant columns
keep_col=[TITLE_NAME, 
          ABSTRACT, 
          COUNTRY_NAME, 
          INTERVENTION_DOMAIN, 
          INTERVENTION_SUBDOMAIN, 
          INTERVENTION_DESC,
          INTERVENTION, 
          OUTCOME_TYPE,
          OUTCOME_DESC,
          OUTCOME_GROUP_X,
          OUTCOME_SUBGROUP]
data=data[keep_col]

# Remove rows that don't contain an abstract, Intervention_description, or Outcome_description
data = data.dropna(subset=[ABSTRACT, INTERVENTION_DESC, OUTCOME_DESC])

# add combined title and abstract column
title_abstract = data.loc[:,(TITLE_NAME)]+ '. ' + data.loc[:,(ABSTRACT)]
data[TITLE_ABSTRACT] = title_abstract

# add combined title abstract, and intervention desc
title_abstract_intervention_desc = data.loc[:,(TITLE_NAME)]+ '. ' + data.loc[:,(ABSTRACT)] + ' ' + data.loc[:,(INTERVENTION_DESC)]
data[TITLE_ABSTRACT_INTERVENTION_DESC] = title_abstract_intervention_desc

# add combined title abstract, and outcome desc
title_abstract_outcome_desc = data.loc[:,(TITLE_NAME)]+ '. ' + data.loc[:,(ABSTRACT)] + ' ' + data.loc[:,(OUTCOME_DESC)]
data[TITLE_ABSTRACT_OUTCOME_DESC] = title_abstract_outcome_desc

# add combined title abstract, and column
title_abstract_intervention_desc_outcome_desc = data.loc[:,(TITLE_NAME)]+ '. ' + data.loc[:,(ABSTRACT)] + ' ' + data.loc[:,(INTERVENTION_DESC)] + ' ' + data.loc[:,(OUTCOME_DESC)]
data[TITLE_ABSTRACT_INTERVENTION_DESC_OUTCOME_DESC] = title_abstract_intervention_desc_outcome_desc

# Remove accent marks
cols = data.select_dtypes(include=[np.object]).columns
data[cols] = data[cols].apply(lambda x: x.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'))

#remove nans
data = data.fillna('')

# Sort data by intervention_subdomain

# actually sort by intervention
sorted_data= data.sort_values(by=[sort_variable])

interventions=[]
inter_options= sorted_data[sort_variable]


for index, val in inter_options.iteritems() :
    if val not in interventions:
        interventions.append(val)

In [24]:
data[COUNTRY_NAME] = data[COUNTRY_NAME].str.lower()

In [25]:
country_country_map = {
    "ca te d'ivoire": 'ivory coast',
    'congo, dem. rep.': 'democratic republic of the congo',
    'gambia, the': 'the gambia',
    'iran, islamic rep.': 'iran',
    'yemen, rep.': 'yemen',
    'lao pdr': 'laos',
}
for key, value in country_country_map.items():
  lower_to_original_country_map[value] = value.title()

In [26]:
def process_country(country):
  if country in country_country_map:
    return country_country_map[country]
  return country

In [27]:
data[COUNTRY_NAME] = data[COUNTRY_NAME].apply(process_country)

In [28]:
corpus_countries_set = set(data[COUNTRY_NAME].unique())
all_countries_set = set(list(country_alt_names.keys()))
nonoverlapping_countries = corpus_countries_set - all_countries_set

In [29]:
nonoverlapping_countries

{'kosovo', 'myanmar'}

In [30]:
nonoverlapping_countries_alt_names = {
    'kosovo': {'republic of kosovo'},
    'myanmar': {'burma', 'republic of the union of myanmar'}
}
country_alt_names.update(nonoverlapping_countries_alt_names)
lower_to_original_country_map['kosovo'], lower_to_original_country_map['myanmar'] = 'Kosovo', 'Myanmar (Burma)'

In [31]:
all_countries_list_final = list(country_alt_names.keys())
all_countries_list_final.sort()

In [32]:
for country in nonoverlapping_countries:
  location = get_location_from_place(country)
  country_location = get_country_from_location(location)
  lat_long = get_lat_long_from_location(location)
  country_lat_longs[country] = lat_long

In [33]:
spacy_ner_types = [
                   "PERSON", # People including fictional
                   "NORP", # Nationalities, or religious or political groups
                   "FAC", # Buildings, airports, highways, bridges, etc.
                   "ORG", # Companies, agencies, institutions, etc.
                   "GPE", # Countries, cities, states
                   "LOC", # Non-GPE locations, mountain ranges, bodies of water
                   "PRODUCT", # Objects, vehicles, foods, etc. 
                   "EVENT", # Named Hurricanes, battles, wars, sport events
                   "WORK_OF_ART", # Title of books, songs, etc.
                   "LAW", # Named documents made into laws
                   "LANGUAGE", # Any named language
                   "DATE", # Absolute or relative dates or periods
                   "TIME", # Times smaller than a day
                   "PERCENT", # Percentage, including '%'
                   "MONEY", # Monetary values, including unit.
                   "QUANTITY", # Measurements, as of weight or distance,
                   "ORDINAL", # "first", "second", etc.
                   "CARDINAL", # Numerals that do not fall under another type
                   ]

In [34]:
def get_ner_dicts(data_df, ner_model, col_name=ABSTRACT,):
  ner_dicts = []
  for desc in data_df[col_name].unique():
    desc_ner_dict = {ner_type: [] for ner_type in spacy_ner_types}
    doc = ner_model(desc)
    named_entities = [(entity.text, entity.label_) for entity in doc.ents]
    desc_ner_dict['ner_type_dict'] = {text: label for text, label in named_entities}
    for text, label in desc_ner_dict['ner_type_dict'].items():
      desc_ner_dict[label].append(text)
    desc_ner_dict['counter'] = Counter([named_entity[0] for named_entity in named_entities])
    ner_dicts.append(desc_ner_dict)
  return ner_dicts

In [35]:
def get_country_from_substring_checking(input_feature):
  input_feature = input_feature.lower()
  for country in all_countries_list_final:
    if country in input_feature:
      return country
  for nationality in demonyms_list:
    if nationality in input_feature:
      return demonyms[nationality]
  return None

In [36]:
def get_country_from_ner_dict(ner_dict):
  norps, gpes = {norp.lower() for norp in ner_dict['NORP']}, {gpe.lower() for gpe in ner_dict['GPE']}
  alt_words_norps = {demonyms[norp.lower()] if norp.lower() in demonyms else norp.lower() for norp in norps}
  norps_and_gpes = list(alt_words_norps | gpes)
  norps_and_gpes.sort()
  for norp_or_gpe in norps_and_gpes:
    if norp_or_gpe in all_countries_list_final:
      return norp_or_gpe
  for norp_or_gpe in norps_and_gpes:
    try:
      location = get_location_from_place(norp_or_gpe)
      country = get_country_from_location(location)
      if country:
        return country
    except:
      continue
  return None

In [37]:
def print_ner_dicts(ner_dict_list):
  for ner_dict in ner_dict_list:
    print(ner_dict)

# Pretrained SpaCy NER Models

In [38]:
NER_SMALL, NER_MEDIUM, NER_LARGE, NER_TRANSFORMER = 'english_small', 'english_medium', 'english_large', 'english_transformer'
NER_MODEL_NAMES = [NER_SMALL, NER_MEDIUM, NER_LARGE, NER_TRANSFORMER]

In [39]:
SUBSTRING_CHECKER = 'substring_checker'
NER_MODELS_DICT = {
    SUBSTRING_CHECKER: None,
    NER_SMALL: nlp_small,
    NER_MEDIUM: nlp_med,
    NER_LARGE: nlp_large,
    NER_TRANSFORMER: nlp_trf,
}

NER_PRED_RESULTS = {model_name: {} for model_name in NER_MODEL_NAMES}
NER_PRED_RESULTS[SUBSTRING_CHECKER] = {}

In [40]:
input_features = [
                  TITLE_NAME, 
                  ABSTRACT, 
                  INTERVENTION_DESC, 
                  OUTCOME_DESC, 
                  TITLE_ABSTRACT,
                  TITLE_ABSTRACT_INTERVENTION_DESC, 
                  TITLE_ABSTRACT_OUTCOME_DESC,
                  TITLE_ABSTRACT_INTERVENTION_DESC_OUTCOME_DESC
                  ]

In [41]:
for input_feature in input_features:
  for ner_model_name in NER_MODEL_NAMES:
    print(input_feature, ner_model_name)
    model = NER_MODELS_DICT[ner_model_name]
    res_dict = NER_PRED_RESULTS[ner_model_name]
    res_dict[input_feature] = get_ner_dicts(data, model, col_name=input_feature)

title_name english_small
title_name english_medium
title_name english_large
title_name english_transformer
abstract english_small
abstract english_medium
abstract english_large
abstract english_transformer
Intervention_description english_small
Intervention_description english_medium
Intervention_description english_large
Intervention_description english_transformer
Outcome_description english_small
Outcome_description english_medium
Outcome_description english_large
Outcome_description english_transformer
title_abstract english_small
title_abstract english_medium
title_abstract english_large
title_abstract english_transformer
title_abstract_intervention_desc english_small
title_abstract_intervention_desc english_medium
title_abstract_intervention_desc english_large
title_abstract_intervention_desc english_transformer
title_abstract_outcome_desc english_small
title_abstract_outcome_desc english_medium
title_abstract_outcome_desc english_large
title_abstract_outcome_desc english_transfo

In [42]:
for input_feature in input_features:
  input_feature_data = data[input_feature]
  NER_PRED_RESULTS[SUBSTRING_CHECKER][input_feature] = list(map(get_country_from_substring_checking, input_feature_data))

In [43]:
def compute_accuracy(pred_array, true_array):
  matches = (pred_array == true_array)
  if type(matches) == bool:
    return 0.0
  return matches.sum()/len(pred_array)

In [44]:
def prettify_underscore_string(string):
  split_by_underscore = string.split('_')
  upper_cased = list(map(lambda x: x.upper(), split_by_underscore))
  prettified_string = " ".join(upper_cased)
  return prettified_string

In [45]:
def classify_countries(ner_dicts):
  return np.array(list(map(get_country_from_ner_dict, ner_dicts)))

# **Accuracy of a pretrained NER model to classify data by country**

Note that we do not do any additionally training for the NER model, we use it out-of-the-box and use the entire dataset as a test set to classify each paper by country.

We compare models which have been trained on varying sized English corpi 

In [46]:
true_countries = data[COUNTRY_NAME].values
for input_feature in input_features:
  print(f"Results for {prettify_underscore_string(input_feature)}: ")
  for ner_model_name in list(NER_MODELS_DICT.keys()):
    if ner_model_name == SUBSTRING_CHECKER:
      pred_countries = NER_PRED_RESULTS[ner_model_name][input_feature]
    else:
      ner_dicts = NER_PRED_RESULTS[ner_model_name][input_feature]
      pred_countries = classify_countries(ner_dicts)
    accuracy = compute_accuracy(pred_countries, true_countries)
    print(f"  Accuracy using {prettify_underscore_string(ner_model_name)}: ", accuracy)
  print('-'*20)

Results for TITLE NAME: 
  Accuracy using SUBSTRING CHECKER:  0.6762295081967213
  Accuracy using ENGLISH SMALL:  0.4344262295081967
  Accuracy using ENGLISH MEDIUM:  0.5204918032786885
  Accuracy using ENGLISH LARGE:  0.48770491803278687
  Accuracy using ENGLISH TRANSFORMER:  0.7254098360655737
--------------------
Results for ABSTRACT: 
  Accuracy using SUBSTRING CHECKER:  0.7622950819672131
  Accuracy using ENGLISH SMALL:  0.7868852459016393
  Accuracy using ENGLISH MEDIUM:  0.8073770491803278
  Accuracy using ENGLISH LARGE:  0.819672131147541
  Accuracy using ENGLISH TRANSFORMER:  0.8360655737704918
--------------------
Results for INTERVENTION DESCRIPTION: 
  Accuracy using SUBSTRING CHECKER:  0.13934426229508196


  


  Accuracy using ENGLISH SMALL:  0.0
  Accuracy using ENGLISH MEDIUM:  0.0
  Accuracy using ENGLISH LARGE:  0.0
  Accuracy using ENGLISH TRANSFORMER:  0.0
--------------------
Results for OUTCOME DESCRIPTION: 
  Accuracy using SUBSTRING CHECKER:  0.028688524590163935
  Accuracy using ENGLISH SMALL:  0.028688524590163935
  Accuracy using ENGLISH MEDIUM:  0.028688524590163935
  Accuracy using ENGLISH LARGE:  0.028688524590163935
  Accuracy using ENGLISH TRANSFORMER:  0.028688524590163935
--------------------
Results for TITLE ABSTRACT: 
  Accuracy using SUBSTRING CHECKER:  0.8401639344262295
  Accuracy using ENGLISH SMALL:  0.8442622950819673
  Accuracy using ENGLISH MEDIUM:  0.8688524590163934
  Accuracy using ENGLISH LARGE:  0.8852459016393442
  Accuracy using ENGLISH TRANSFORMER:  0.9098360655737705
--------------------
Results for TITLE ABSTRACT INTERVENTION DESC: 
  Accuracy using SUBSTRING CHECKER:  0.8319672131147541
  Accuracy using ENGLISH SMALL:  0.8278688524590164
  Accuracy u

In [47]:
transformer_preds = NER_PRED_RESULTS[NER_TRANSFORMER][TITLE_ABSTRACT]

In [48]:
transformer_country_preds = classify_countries(transformer_preds)

In [49]:
COUNTRY_PREDS = 'transformer_country_preds'

In [50]:
data[COUNTRY_PREDS] = transformer_country_preds

In [51]:
transformer_country_preds_lat_longs = list(map(get_lat_long_from_country, transformer_country_preds))
transformer_country_preds_lat_longs = list(map(add_jitter_to_lat_long, transformer_country_preds_lat_longs))

In [52]:
data['pred_lat'] = np.array(list(map(lambda x: x[0] if x else None, transformer_country_preds_lat_longs)))
data['pred_long'] = np.array(list(map(lambda x: x[1] if x else None, transformer_country_preds_lat_longs)))

In [53]:
non_null_preds = data[~(data['transformer_country_preds'].isna())]
null_preds = data[data['transformer_country_preds'].isna()]

In [54]:
non_null_preds

Unnamed: 0,title_name,abstract,country_name,Intervention_domain,Intervention_subdomain,Intervention_description,intervention,outcome_type,Outcome_description,outcome_group X,outcome__subgroup,title_abstract,title_abstract_intervention_desc,title_abstract_outcome_desc,title_abstract_intervention_desc_outcome_desc,transformer_country_preds,pred_lat,pred_long
0,Impact of Farmer Field Schools on Agricultural...,Farmer field schools (FFSs) are a popular educ...,kenya,A._Food_Supply_Chain,AA._Production_system_,Farmer field schools are traditionally an adul...,AADa. Education / information - Farmer field s...,H._Intermediate,We used the value of crop production and the v...,HA._Economic,HAC._Output_value,Impact of Farmer Field Schools on Agricultural...,Impact of Farmer Field Schools on Agricultural...,Impact of Farmer Field Schools on Agricultural...,Impact of Farmer Field Schools on Agricultural...,kenya,0.875646,38.1827
9,Empowering Women: How Mexico's Conditional Cas...,Data from a controlled randomised trial are us...,mexico,B._Food_Enviroment,BA._Availability_and_affordability,Oportunidades provides cash trans- fers condit...,BAD. Cash-for-food programs,I._Final,Evaluate the impact of oportunidades (CCT) on ...,IA._Anthropometric,IAE._Birth_outcomes,Empowering Women: How Mexico's Conditional Cas...,Empowering Women: How Mexico's Conditional Cas...,Empowering Women: How Mexico's Conditional Cas...,Empowering Women: How Mexico's Conditional Cas...,mexico,23.1618,-101.961
13,Long-Term Evaluation of a Micronutrient-Fortif...,Objective: To evaluate the long-term effect on...,south africa,A._Food_Supply_Chain,AC. Processing and packaging,The aim of the present study therefore was to ...,ACA. Fortification,I._Final,The study evaluated the effect on micronutrien...,IC._Micronutrient_status_,ICA._Iron,Long-Term Evaluation of a Micronutrient-Fortif...,Long-Term Evaluation of a Micronutrient-Fortif...,Long-Term Evaluation of a Micronutrient-Fortif...,Long-Term Evaluation of a Micronutrient-Fortif...,south africa,-28.8465,23.8478
20,"Growth, Bone Mass, and Vitamin D Status of Chi...",Background: A 2-y school milk intervention tri...,china,B._Food_Enviroment,BA._Availability_and_affordability,In a school milk intervention study conducted ...,BAC. Direct provision of foods,I._Final,Height and sitting height were measured by the...,IA._Anthropometric,IAA._Linear_growth,"Growth, Bone Mass, and Vitamin D Status of Chi...","Growth, Bone Mass, and Vitamin D Status of Chi...","Growth, Bone Mass, and Vitamin D Status of Chi...","Growth, Bone Mass, and Vitamin D Status of Chi...",china,34.6785,105.037
32,Supply-Side CrowdingaOut and CrowdingaIn Effec...,The present article estimates the extent to wh...,malawi,A._Food_Supply_Chain,AA._Production_system_,The objective of the present study is to provi...,AABe. Provision of free or reduced-cost access...,H._Intermediate,volume of commercial fertilizer sales for dist...,HA._Economic,HAA._Income,Supply-Side CrowdingaOut and CrowdingaIn Effec...,Supply-Side CrowdingaOut and CrowdingaIn Effec...,Supply-Side CrowdingaOut and CrowdingaIn Effec...,Supply-Side CrowdingaOut and CrowdingaIn Effec...,malawi,-13.4712,34.2279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284,Effect of Iron-Fortified Candies on the Iron S...,Background: Iron deficiency anemia is the most...,indonesia,A._Food_Supply_Chain,AC._Processing_and_packaging_,"For 12 wk, the placebo group received nonforti...",ACA. Fortification,I._Final,Both at baseline and after 12 wk of interventi...,IC._Micronutrient_status_,ICA._Iron,Effect of Iron-Fortified Candies on the Iron S...,Effect of Iron-Fortified Candies on the Iron S...,Effect of Iron-Fortified Candies on the Iron S...,Effect of Iron-Fortified Candies on the Iron S...,indonesia,-5.17402,119.958
2290,Equity in Adherence to And Effect of Prenatal ...,Background Evidence is often missing on social...,bangladesh,B._Food_Enviroment,BA._Availability_and_affordability,Women were also individually randomized to one...,BAE. Provision or use of supplements,H._Intermediate,(a) the interviewers asked a series of questio...,HM._Behaviour_change,HMA._Behaviour_change,Equity in Adherence to And Effect of Prenatal ...,Equity in Adherence to And Effect of Prenatal ...,Equity in Adherence to And Effect of Prenatal ...,Equity in Adherence to And Effect of Prenatal ...,bangladesh,23.7624,90.3626
2300,Application of the Health Belief Model to Teac...,In Ethiopia many women do not practice appropr...,ethiopia,C._Consumer_Behaviour,CB._Information_behaviour_change_communication,Intervention households were visited fortnight...,CBB. Professional services (dieticians / nurses),I._Final,Complementary feeding practices were measured ...,ID._Diet_quality_and_adequacy,IDB._Dietary_diversity,Application of the Health Belief Model to Teac...,Application of the Health Belief Model to Teac...,Application of the Health Belief Model to Teac...,Application of the Health Belief Model to Teac...,ethiopia,8.02772,38.1391
2306,"Effectiveness of Nutrition Education, Iron Sup...","Objectives: A community-based, randomized tria...",india,C._Consumer_Behaviour,CB._Information_behaviour_change_communication,The 545 children along with their mothers were...,CBC. Community meetings,H._Intermediate,The maternal nutrition knowledge score was ass...,HH._Intrinsic_motivators,HHC._Knowledge,"Effectiveness of Nutrition Education, Iron Sup...","Effectiveness of Nutrition Education, Iron Sup...","Effectiveness of Nutrition Education, Iron Sup...","Effectiveness of Nutrition Education, Iron Sup...",india,20.097,77.1113


In [55]:
import folium
#Creating a base map
m = folium.Map(location=[20,0], tiles="OpenStreetMap", zoom_start=2.3)

In [56]:
for i in range(0, len(non_null_preds)):
   folium.Marker(
      location=[non_null_preds.iloc[i]['pred_lat'], non_null_preds.iloc[i]['pred_long']],
      popup=non_null_preds.iloc[i]['title_name'],
      # icon=folium.DivIcon(html=f"""<div style="font-family: courier new; color: blue">{non_null_preds.iloc[i]['title_name']}</div>""")
   ).add_to(m)

In [57]:
m

In [58]:
m.save('./int_dev_lit_map.html')

In [59]:
#@title
data.to_csv('./preds.csv')

In [60]:
#@title
original_to_lower = {value:key for key, value in lower_to_original_country_map.items()}
original_country_strings = list(lower_to_original_country_map.values())
original_country_strings.sort()
CoS_list = original_country_strings + [None]

## Country of Study (CoS) Predictions Filter Function

In [61]:
#@title
import ipywidgets as widgets
selected_CoS_dropdown = widgets.Dropdown(options=CoS_list,
                                         description='CoS: ',
                                         value=None,
                                         disabled=False)
selected_CoS_dropdown

Dropdown(description='CoS: ', options=('Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Angola', 'Angui…

In [66]:
#@title
selected_CoS = original_to_lower[selected_CoS_dropdown.value] if selected_CoS_dropdown.value else None
filtered_preds = data[data[COUNTRY_PREDS] == selected_CoS]
if selected_CoS is None:
  filtered_preds = data[data[COUNTRY_PREDS].isnull()]
if (len(filtered_preds) == 0): print(f'No papers found in corpus for {selected_CoS_dropdown.value}')
else:
  for i in range(len(filtered_preds)):
    print(f"{i+1}. Title:", filtered_preds.iloc[i][TITLE_NAME])
    print(f"   Abstract:", filtered_preds.iloc[i][ABSTRACT])

1. Title: A multicountry randomized controlled trial of comprehensive maternal nutrition supplementation initiated before conception: the Women First trial
   Abstract: Background: Reported benefits of maternal nutrition supplements commenced during pregnancy in low-resource populations have typically been quite limited. Objectives: This study tested the effects on newborn size, especially length, of commencing nutrition supplements for women in lowresource populations a3 mo before conception (Arm 1), compared with the same supplement commenced late in the first trimester of pregnancy (Arm 2) or not at all (control Arm 3). Methods: Women First was a 3-arm individualized randomized controlled trial (RCT). The intervention was a lipid-based micronutrient supplement; a protein-energy supplement was also provided if maternal body mass index (kg/m2) was <20 or gestational weight gain was less than recommendations. Study sites were in rural locations of the Democratic Republic of the Congo (

## Understanding Mispredictions

In [63]:
mispreds = data[data[COUNTRY_NAME] != data[COUNTRY_PREDS]]

In [64]:
# Titles of samples for which the transformer misclassified the CoS or simply predicted None
for i in range(len(mispreds)):
  print(f"{i+1}. Title: ", mispreds.iloc[i][TITLE_NAME])
  print(f'True CoS: {mispreds.iloc[i][COUNTRY_NAME]}')
  print(f'Pred CoS: {mispreds.iloc[i][COUNTRY_PREDS]}')
  print()

1. Title:  Impact Evaluation of the Agricultural Insurance Program of the Philippine Crop Insurance Corporation on Agricultural Producers in Central Visayas
True CoS: philippines
Pred CoS: None

2. Title:  I2-Carotene-Rich Orange-Fleshed Sweet Potato Improves the Vitamin A Status of Primary School Children Assessed with the Modified-Relative-Dose-Response Test
True CoS: south africa
Pred CoS: None

3. Title:  Randomized Efficacy Trial of a Micronutrient-Fortified Beverage in Primary School Children in Tanzania
True CoS: tanzania
Pred CoS: None

4. Title:  The Effect of Educational Intervention Based on The Theory of Planned Behavior on Nutritional Behavior with Regard to Cardiovascular Diseases Among Health Volunteers
True CoS: iran
Pred CoS: None

5. Title:  Effect of Multimedia Education on Nutritional Behaviour for Colorectal Cancer Prevention: An Application of Health Belief Model
True CoS: iran
Pred CoS: None

6. Title:  Nutrition Education Intervention Improves Nutrition Knowledg

In [65]:
# Title + Abstract concatenation of samples for which the transformer misclassified the CoS or simply predicted None
for i in range(len(mispreds)):
  print(f"{i+1}. Title + Abstract: ", mispreds.iloc[i][TITLE_ABSTRACT])
  print(f'True CoS: {mispreds.iloc[i][COUNTRY_NAME]}')
  print(f'Pred CoS: {mispreds.iloc[i][COUNTRY_PREDS]}')
  print()

1. Title + Abstract:  Impact Evaluation of the Agricultural Insurance Program of the Philippine Crop Insurance Corporation on Agricultural Producers in Central Visayas. This study evaluated the impact of the agricultural insurance program on agricultural producers in Central Visayas (Region VII) on the premise that agricultural crop insurance is a potential risk mitigating tool. Agricultural insurance, through the Philippine Crop Insurance Corporation, is seen as a mechanism which can be used by farmers to manage risks and improve their well-being. In support, the Cebu Provincial government through its Special Comprehensive Insurance to Agri-Fishery Stakeholders program has fully subsidized insurance premium to include accident insurance since 2009. The Cebu Provincial government allocated 8M in 2015 and 10 million in 2016 for agricultural insurance. The corn farmers need only to comply with the administrative requirements to enjoy the benefits of agricultural insurance. Data were gath