### Specify a text string to examine with NEMO

In [72]:
# Candidate query strings collected during experimentation.
# Only one of them is sent to NEMO; see the selection below.
sample_payloads = (
    'The World Health Organization on Sunday reported the largest single-day increase in coronavirus cases by its count, at more than 183,000 new cases in the latest 24 hours. The UN health agency said Brazil led the way with 54,771 cases tallied and the U.S. next at 36,617. Over 15,400 came in in India.',
    'is strongly affected by large ground-water withdrawals at or near Tupelo, Aberdeen, and West Point.',
    # 'Overall design: Teliospores of pathogenic races T-1, T-5 and T-16 of T. caries provided by a collection in Aberdeen, ID, USA',
    'The results provide evidence of substantial population structure in C. posadasii and demonstrate presence of distinct geographic clades in Central and Southern Arizona as well as dispersed populations in Texas, Mexico and South and Central America',
    'Most frequent numerical abnormalities in B-NHL were gains of chromosomes 3 and 18, although gains of chromosome 3 were less prominent in FL.',
)

# The last sample is the active query string.
payload = sample_payloads[-1]

### Load functions

In [73]:
# Read the credentials file and parse it into `cfg`
import yaml

with open("config.yml", 'r') as config_file:
    config_text = config_file.read()

# Parse the YAML text; later cells look up the API key in the resulting mapping
cfg = yaml.safe_load(config_text)

In [74]:
# general way to extract values for a given key. Returns a list. Used by get_WPID to parse the Wikipedia API response and extract the Wikidata ID
# from https://hackersandslackers.com/extract-data-from-complex-json-python/

def extract_values(obj, key):
    """Collect every scalar value stored under `key` anywhere in nested JSON.

    The structure is walked depth-first in document order. A value that is
    itself a dict or list is descended into rather than collected, even when
    its own key matches. Anything that is neither a dict nor a list
    contributes nothing.

    Returns a list of the matching values (empty if there are none).
    """

    def walk(node):
        """Yield matching values found beneath `node`."""
        if isinstance(node, dict):
            for name, value in node.items():
                if isinstance(value, (dict, list)):
                    # Containers are searched, never collected themselves.
                    yield from walk(value)
                elif name == key:
                    yield value
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(obj))

In [75]:
# getting the Wikidata ID for a Wikipedia page title
# see the API at https://www.mediawiki.org/wiki/API:Query#Example_5:_Batchcomplete
# also, https://stackoverflow.com/questions/37024807/how-to-get-wikidata-id-for-an-wikipedia-article-by-api

def get_WPID(name):
    """Look up the Wikidata item ID(s) for an English Wikipedia page title.

    Parameters
    ----------
    name : str
        Wikipedia page title to resolve. Redirects are followed (redirects=1).

    Returns
    -------
    list
        Every 'wikibase_item' value found in the API response (e.g. ['Q812']);
        empty when the response contains no such key.
    """
    # Pass the query as `params` so requests percent-encodes the title.
    # Concatenating it into the URL breaks on titles containing '&', '#', '+'
    # and similar characters.
    query = {
        'action': 'query',
        'prop': 'pageprops',
        'ppprop': 'wikibase_item',
        'redirects': 1,
        'format': 'json',
        'titles': name,
    }
    # requests has no default timeout; without one a stalled connection would
    # hang the notebook indefinitely.
    r = requests.get('https://en.wikipedia.org/w/api.php', params=query, timeout=30)
    return extract_values(r.json(), 'wikibase_item')
    

### Send a request to NEMO, and get a response

In [76]:
# make a service request
import json

import requests

url = "https://nemoservice.azurewebsites.net/nemo?appid=" + cfg['api_creds']['nmo1']
newHeaders = {'Content-type': 'application/json', 'Accept': 'text/plain'}

# The request body is a JSON string literal of the form "{<text>}".
# json.dumps produces that literal with quotes, backslashes and control
# characters escaped; hand-concatenating the quotes yields invalid JSON as soon
# as the text itself contains a double quote. By default json.dumps also emits
# ASCII-only output, so non-ASCII text no longer needs manual byte encoding.
request_body = json.dumps('{' + payload + '}')

response = requests.post(url,
                         data=request_body,
                         headers=newHeaders)

In [77]:
# display the results as string (remove json braces)
response_text = response.content.decode()

# Keep what lies between the FIRST '{' and the LAST '}'. Searching for the
# first '}' instead would cut the result short whenever the annotated text
# itself contains a closing brace.
resp_full = response_text[response_text.find('{') + 1 : response_text.rfind('}')]
resp_full

'Most frequent numerical abnormalities in <e ref="B-NHL" type="U" name="B-NHL" form="B-NHL" wp="n">B-NHL</e> were gains of chromosomes 3 and 18, although gains of <c ref="chromosome" type="U" name="chromosome" form="chromosome" wp="n">chromosome</c> 3 were less prominent in <e ref="Florida" type="G" name="Florida" form="FL." wp="y">FL.</e>'

### Parse the response and load all found elements into a dataframe

In [78]:
# Set up the (initially empty) table of entities found by Nemo; duplicates are
# removed and wikipedia/wikidata concept IDs are added in later cells
import pandas as pd
import re
import xml.etree.ElementTree as ET

entity_columns = [
    "Type",
    "Ref",
    "EntityType",
    "Name",
    "Form",
    "WP",
    "Value",
    "Alt",
    "WP_ID",
]
df = pd.DataFrame(columns=entity_columns)

# note that the last column is to be populated later, via Wikipedia API
# all previous columns are from Nemo: based on "e" (entity) and "d" (data) elements. "c" (concept) to be explored

In [79]:
# get starting and ending positions of xml fragments in the Nemo output
# Raw string literals keep `\s` a regex escape; in a normal string literal it
# is an invalid Python escape sequence and triggers a warning.
pattern_start = r"<(e|d|c)\s"
pattern_end = r"</(e|d|c)>"

# indices1: offset of each opening "<e ", "<d " or "<c " tag
# indices2: offset of each closing "</e>", "</d>" or "</c>" tag
# (comprehensions are used directly so the builtin `iter` is not shadowed)
indices1 = [m.start(0) for m in re.finditer(pattern_start, resp_full)]
indices2 = [m.start(0) for m in re.finditer(pattern_end, resp_full)]


In [80]:
# iterate over xml fragments returned by Nemo, extracting attributes from each and adding to dataframe
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame row by row copies it on every iteration.
records = []

# The i-th opening tag is paired with the i-th closing tag, which assumes the
# fragments are not nested.
for start, end in zip(indices1, indices2):
    # Every closing tag ("</e>", "</d>", "</c>") is 4 characters long, so
    # end + 4 is one past the end of the fragment.
    fragment = resp_full[start : end + 4]

    root = ET.fromstring(fragment)
    attributes = root.attrib

    # Attributes missing from a fragment come back as None.
    records.append({"Type": root.tag,
                    "Ref": attributes.get('ref'),
                    "EntityType": attributes.get('type'),
                    "Name": attributes.get('name'),
                    "Form": attributes.get('form'),
                    "WP": attributes.get('wp'),
                    "Value": attributes.get('value'),
                    "Alt": attributes.get('alt')})

# Reusing the existing column list keeps the column order, including the
# still-empty WP_ID column. Rebuilding the frame, rather than extending it,
# makes this cell safe to re-run without duplicating rows. WP_ID is kept as
# object dtype so string IDs can be stored in it later.
df = pd.DataFrame(records, columns=df.columns).astype({"WP_ID": object})
    


`e` stands for entity.

The attribute `ref` gives the title of the corresponding Wikipedia page when the attribute `wp` has the value "y".

The attribute `type` gives the type of entity for known entities; the types of interest here are G (geo-political entity), L (geographic form/location, such as a mountain), and F (facility, such as an airport).

`d` stands for datafield, which comprises dates, NUMEX, email addresses and URLs, tracking numbers, and so on.

`c` stands for concept; these appear in Wikipedia and are deemed relevant to the input text, but they are not disambiguated.


In [81]:
# keep only the first occurrence of each distinct row
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

In [82]:
# for each found entity, add wikidata unique identifiers to the dataframe

# Only rows flagged wp == 'y' have a Wikipedia page to resolve.
# NOTE(review): the Nemo notes in this notebook say `ref` holds the Wikipedia
# page title when wp == 'y'; `Name` is kept here as before. Confirm which
# attribute should be sent.
wp_names = df.loc[df['WP'] == 'y', 'Name']

for index, name in wp_names.items():
    wikidata_ids = get_WPID(name)
    # A page without a Wikidata item yields an empty list; leave WP_ID unset
    # in that case instead of failing on [0].
    if wikidata_ids:
        # Write through df.at: rows yielded by iterrows() may be copies, so
        # assigning to the yielded row is not guaranteed to reach the frame.
        df.at[index, 'WP_ID'] = wikidata_ids[0]


In [83]:
df

Unnamed: 0,Type,Ref,EntityType,Name,Form,WP,Value,Alt,WP_ID
0,e,B-NHL,U,B-NHL,B-NHL,n,,,
1,c,chromosome,U,chromosome,chromosome,n,,,
2,e,Florida,G,Florida,FL.,y,,,Q812
