# LAB3.4 Linking entities detected by Spacy

Copyright, Vrije Universiteit Amsterdam, Faculty of Humanities, CLTL

In [99]:
from rdflib import Graph, URIRef
from tqdm import tqdm
import sys
import requests
import urllib
import urllib.parse
from urllib.request import urlopen, Request
from urllib.parse import urlencode
import xml.etree.cElementTree as ET
from lxml import etree
import time
import json
# import our own utility functions and classes
import lab3_utils as utils

In [46]:
import spacy
from spacy import displacy
nlp = spacy.load('en') # other languages: de, es, pt, fr, it, nl
#nlp = spacy.load('en_core_web_sm')

In [2]:
text = """But Google is starting from behind Europe in January. The company made a late push into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption."""
doc = nlp(text)

In [3]:
## Get all the text
print(doc.doc)

But Google is starting from behind Europe in January. The company made a late push into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.


In [49]:
# Get all the entities with the basic information
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Google 1 2 4 10 ORG
Europe 6 7 35 41 LOC
January 8 9 45 52 DATE
Apple’s Siri 20 23 102 114 ORG
iPhones 26 27 129 136 ORG
Amazon 29 30 142 148 ORG
Echo and Dot 38 41 185 197 ORG


Given this basic information from spaCy on entities, we can now adapt our functions for AIDA and Spotlight to find the entity links.

## 1. Linking spaCy entities with AIDA

In [87]:
##### aida function to handle the entity mentions in a spaCy Doc object
from spacy.tokens import Span
def aida_disambiguation(doc, aida_url):
    """
    Perform disambiguation with AIDA.
    """
    entities=[]
    # AIDA expects entity mentions that are pre-marked inside text. 
    # For example, the sentence "Obama visited Paris today", 
    # should be transformed to "[[Obama]] visited [[Paris]] today."
    # We do this in the next 5 lines of code.
    original_content = doc.text
    #print(original_content)
    new_content=""
    current_pos=0
    for entity in doc.ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)
        entity_span=original_content[entity.start_char: entity.end_char]
        #print(entity_span)
        new_content+=original_content[current_pos:entity.start_char] + '[[' + entity_span + ']]'
        current_pos=entity.end_char

    # Now, we can run the AIDA library with this string.
    params={"text": new_content, "tag_mode": 'manual'}
    request = Request(aida_url, urlencode(params).encode())
    this_json = urlopen(request).read().decode('unicode-escape')
    try:
        results=json.loads(this_json)
    except:
        return doc  ### if we fail, we return the doc as is
    
    # Let's normalize the disambiguated entities.
    # This means mostly removing the first part of the URI which is always the same (YAGO:)
    # and leaving only the entity identification part (e.g., Barack_Obama).
    
    dis_entities={}
    for dis_entity in results['mentions']:
        if 'bestEntity' in dis_entity.keys():
            best_entity=dis_entity['bestEntity']['kbIdentifier']
            clean_url=best_entity[5:] #SKIP YAGO:
        else:
            clean_url='NIL'
        dis_entities[str(dis_entity['offset'])] = clean_url # BECOMES THE VALUE IN THE DICTIONARY FOR THE OFFSET(REPRESENTING THE START OF THE MENTION) IN THE TEXT

    # We can now store the entity to our class instance for later processing.
    for entity in doc.ents:
        start = entity.start_char
        try:
            dis_url = str(dis_entities[str(start)])  # WE GET THE DISAMBIGUATED URL
        except:
            dis_url='NIL'
        #print(dis_url)
        # Create a spaCy Span object for each entity and add it to our entity list 
        linked_entity = Span(doc, start=entity.start, end=entity.end, label=entity.label_, kb_id=dis_url)
        entities.append(linked_entity)
          
    return entities

In [86]:
# AIDA is running in an external location - for this reason, we need to send an HTTP request. This should take a few minutes.
aida_disambiguation_url = "https://gate.d5.mpi-inf.mpg.de/aida/service/disambiguate"

processed_aida=aida_disambiguation(doc, aida_disambiguation_url)


Google 4 10 ORG
Europe 35 41 LOC
January 45 52 DATE
Apple’s Siri 102 114 ORG
iPhones 129 136 ORG
Amazon 142 148 ORG
Echo and Dot 185 197 ORG




In [72]:
## We iterate over our entities from spaCy and print out the data with the URL

for entity in processed_aida:
    print('mention: %s; offset:%s-%s; type:%s; url:%s'% (entity.text, entity.start_char, entity.end_char, entity.label_, entity.kb_id_))

mention: Google; offset:4-10; type:ORG; url:Google
mention: Europe; offset:35-41; type:LOC; url:Europe
mention: January; offset:45-52; type:DATE; url:Deaths_in_January_2013
mention: Apple’s Siri; offset:102-114; type:ORG; url:NIL
mention: iPhones; offset:129-136; type:ORG; url:NIL
mention: Amazon; offset:142-148; type:ORG; url:Amazon.com
mention: Echo and Dot; offset:185-197; type:ORG; url:NIL


## 2. Linking spaCy entities with DBPedia Spotlight

In [93]:
## Get all the text
print(doc.text)

But Google is starting from behind Europe in January. The company made a late push into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.


In [96]:
##### spotlight function to handle the entity mentions in a spaCy Doc object

def spotlight_disambiguate(doc, spotlight_url):
    """
    Perform disambiguation with DBpedia Spotlight.
    """

    entities=[]
   
    # Similar as with AIDA, we first prepare the document text and the mentions
    # in order to provide these to Spotlight as input.

    # We build up the XML structure that Spotligh wants as input
    # The next function Element creates the XML element with the text attribute
    annotation = etree.Element("annotation", text=doc.text)

    # We iterate over the eneity mentions from our Reuters data to create the surface form elements
    for entity in doc.ents:
        sf = etree.SubElement(annotation, "surfaceForm")
        sf.set("name", entity.text)
        sf.set("offset", str(entity.start_char))
    my_xml=etree.tostring(annotation, xml_declaration=True, encoding='UTF-8')
    # Send a disambiguation request to spotlight
    results=requests.post(spotlight_url, urllib.parse.urlencode({'text':my_xml, 'confidence': 0.5}), 
                          headers={'Accept': 'application/json'})
    # Note that you can adjust the confidence value. Check the online demo to see the effect. 
    # What will happen with the recall and precision if you increase the confidence?

    # Process the results and normalize the entity URIs
    j=results.json()
    dis_entities={}
    if 'Resources' in j: 
        resources=j['Resources']
    else: 
        resources=[]
    for dis_entity in resources:
        dis_entities[str(dis_entity['@offset'])] = utils.normalizeURL(dis_entity['@URI'])

    # Let's now store the URLs by Spotlight to our class for later analysis.
    for entity in doc.ents:
        start = entity.start_char
        if str(start) in dis_entities:
            dis_url = dis_entities[str(start)]
        else:
            dis_url = 'NIL'
        print(dis_url)
        linked_entity = Span(doc, start=entity.start, end=entity.end, label=entity.label_, kb_id=dis_url)
        entities.append(linked_entity)

    return entities

In [100]:
spotlight_disambiguation_url="http://model.dbpedia-spotlight.org/en/disambiguate"

processed_both=spotlight_disambiguate(doc, spotlight_disambiguation_url)

Google
Europe
January
Siri
IPhone
Amazon_Video
NIL


In [101]:
for entity in processed_aida:
    print('mention: %s; offset:%s-%s; type:%s; url:%s'% (entity.text, entity.start_char, entity.end_char, entity.label_, entity.kb_id_))

mention: Google; offset:4-10; type:ORG; url:Google
mention: Europe; offset:35-41; type:LOC; url:Europe
mention: January; offset:45-52; type:DATE; url:Deaths_in_January_2013
mention: Apple’s Siri; offset:102-114; type:ORG; url:NIL
mention: iPhones; offset:129-136; type:ORG; url:NIL
mention: Amazon; offset:142-148; type:ORG; url:Amazon.com
mention: Echo and Dot; offset:185-197; type:ORG; url:NIL


## End of this notebook