In [1]:
import dataparser
import time
import pickle
import load_utils

## 1) Load dataset(s) and show some basic stats

In [2]:
# Collection titles and source files for the two N3 corpora.
rss_title, rss_file = 'rss500', 'data/N3/RSS-500.ttl'
reuters_title, reuters_file = 'reuters128', 'data/N3/Reuters-128.ttl'

# Title under which the combined RSS-500 + Reuters-128 data is stored.
n3_title = 'n3'

# Passed as `limit` to the NIF loaders; lower it for quick test runs.
limit = 10 ** 8

In [3]:
# Load both N3 collections from their NIF files, tagging each with its title.
rss_articles = dataparser.load_article_from_nif_file(
    rss_file,
    limit=limit,
    collection=rss_title,
)
reuters_articles = dataparser.load_article_from_nif_file(
    reuters_file,
    limit=limit,
    collection=reuters_title,
)

# N3 is the combination (`|`) of the Reuters-128 and RSS-500 article collections.
n3_articles = reuters_articles | rss_articles

In [4]:
# Report how many articles the combined N3 collection contains.
print('N3: %d articles loaded!' % (len(n3_articles),))

N3: 628 articles loaded!


In [5]:

# Titles and source files of the two remaining datasets.
wes_title = 'wes2015'
wes_file = "data/wes2015-dataset-nif-1.2.rdf"
aida_title = 'aida'
aida_file = 'data/AIDA-YAGO2-dataset_topicsLowlevel.tsv'

# WES2015 ships as NIF, so it goes through the same loader as the N3 files.
wes_articles = dataparser.load_article_from_nif_file(
    wes_file,
    collection=wes_title,
    limit=limit,
)
print('WES2015: %d articles loaded!' % (len(wes_articles),))

# AIDA is read from its TSV file through the CoNLL loader.
aida_articles = dataparser.load_article_from_conll_file(aida_file)
print('AIDA: %d articles loaded!' % (len(aida_articles),))

WES2015: 366 articles loaded!
AIDA: 1393 articles loaded!


In [6]:
def _count_mentions(articles):
    """Return the total number of entity mentions over all given articles."""
    total = 0
    for article in articles:
        total += len(article.entity_mentions)
    return total


num_mentions_n3 = _count_mentions(n3_articles)
print("N3: %d mentions in total!" % num_mentions_n3)

num_mentions_wes = _count_mentions(wes_articles)
print("WES2015: %d mentions in total!" % num_mentions_wes)

num_mentions_aida = _count_mentions(aida_articles)
print("AIDA: %d mentions in total!" % num_mentions_aida)



N3: 1880 mentions in total!
WES2015: 28587 mentions in total!
AIDA: 34929 mentions in total!


In [7]:
# Persist the loaded datasets. RSS-500 and Reuters-128 are not stored
# separately; they are stored together as the combined N3 dataset.
for dataset_title, dataset_articles in (
    (aida_title, aida_articles),
    (wes_title, wes_articles),
    (n3_title, n3_articles),
):
    load_utils.store_dataset(dataset_title, dataset_articles)

**1.1. Inspect encoding**

In [None]:
# Print the gold link of every RSS-500 mention, one per line, to eyeball the encoding.
for rss_article in rss_articles:
    for mention in rss_article.entity_mentions:
        print(mention.gold_link)

Miami_International_Airport
--NME--
Cleveland_Indians
Eric_Wedge
Marshall_University
Doc_Holliday_(American_football)
Boston_Red_Sox
Johnny_Pesky
--NME--
Cincinnati_Children's_Hospital_Medical_Center
Central_Intelligence_Agency
George_Tenet
Federal_Highway_Administration
Victor_Mendez
Larry_Fitzgerald
Uganda
United_States_Department_of_Homeland_Security
Daryl_Johnson
LSU_Tigers_and_Lady_Tigers
Les_Miles
Darrell_Brock_Jr.
Republican_Party_of_Kentucky
China
Senkaku_Islands
--NME--
--NME--
LSU_Tigers_and_Lady_Tigers
Les_Miles
Department_for_Education
--NME--
Patient_Protection_and_Affordable_Care_Act
Patient_Protection_and_Affordable_Care_Act
--NME--
Paul_Ryan
CBS_Sports
Boomer_Esiason
--NME--
Michigan_Supreme_Court
Conway,_Arkansas
--NME--
--NME--
Henry_H._Kennedy,_Jr.
Lewis_County,_West_Virginia
Ireland,_West_Virginia
--NME--
--NME--
--NME--
Boomer_Esiason
--NME--
--NME--
Federal_Highway_Administration
Victor_Mendez
--NME--
--NME--
Daniel_Yergin
--NME--
Baltimore_Orioles
Josh_Beckett
Du

### 2) AGDISTIS annotation

In [None]:
# Annotate every AIDA and N3 article with AGDISTIS and store the result on
# each mention as `sys_link`.
for articles in [aida_articles, n3_articles]:
    c = 0
    for article in articles:
        # Wrap every gold mention in <entity>...</entity> tags. Walking the
        # mentions back to front keeps the offsets of the not-yet-tagged
        # mentions valid while text is inserted.
        # NOTE(review): this relies on entity_mentions being ordered by
        # begin_index and non-overlapping -- confirm against dataparser.
        new_content = article.content
        for entity in reversed(article.entity_mentions):
            entity_span = new_content[entity.begin_index: entity.end_index]
            new_content = (new_content[:entity.begin_index]
                           + '<entity>' + entity_span + '</entity>'
                           + new_content[entity.end_index:])

        results = load_utils.disambiguate_agdistis(new_content)

        # Index the returned links by the (stringified) start offset reported by AGDISTIS.
        dis_entities = {}
        for dis_entity in results:
            dis_entities[str(dis_entity['start'])] = load_utils.getLinkRedirect(
                load_utils.normalizeURL(dis_entity['disambiguatedURL']))

        # A mention without a result at its start offset is marked '--NME--',
        # as in the Spotlight and WAT cells, instead of raising KeyError and
        # aborting the whole run.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        c += 1
        print("Article %d out of %d" % (c, len(articles)))
        # Safety cap on the number of articles processed per collection.
        if c == 10000:
            break

Article 1 out of 1393
Article 2 out of 1393
Article 3 out of 1393
Article 4 out of 1393
Article 5 out of 1393
Article 6 out of 1393
Article 7 out of 1393
Article 8 out of 1393
Article 9 out of 1393
Article 10 out of 1393
Article 11 out of 1393
Article 12 out of 1393
Article 13 out of 1393
Article 14 out of 1393
Article 15 out of 1393
Article 16 out of 1393
Article 17 out of 1393
Article 18 out of 1393
Article 19 out of 1393
Article 20 out of 1393
Article 21 out of 1393
Article 22 out of 1393
Article 23 out of 1393
Article 24 out of 1393
Article 25 out of 1393
Article 26 out of 1393
Article 27 out of 1393
Article 28 out of 1393
Article 29 out of 1393
Article 30 out of 1393
Article 31 out of 1393
Article 32 out of 1393
Article 33 out of 1393
Article 34 out of 1393
Article 35 out of 1393
Article 36 out of 1393
Article 37 out of 1393
Article 38 out of 1393
Article 39 out of 1393
Article 40 out of 1393
Article 41 out of 1393
Article 42 out of 1393
Article 43 out of 1393
Article 44 out of 13

### Debugging: Check if offsets between SYS and GOLD match for all entities

In [None]:
# Print every N3 mention that has no system link (falsy `sys_link`), together
# with its begin offset and gold link.
# The loop variable must be `article`: the previous spelling (`articles`) made
# the inner loop read whatever `article` an earlier cell had left behind, so
# the same single article was inspected over and over.
for article in n3_articles:
    for entity in article.entity_mentions:
        if not entity.sys_link:
            print(entity.begin_index, entity.sys_link, entity.gold_link)

### Store data on disk

In [None]:
# Persist the AGDISTIS annotations for AIDA and N3 (WES2015 is not stored here).
for dataset_title, annotated_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_system_data(dataset_title, 'agdistis', annotated_articles)

### 3) Run DBpedia Spotlight

In [None]:
import requests
import urllib.parse
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and is bound to the same alias so the name ET stays available.
import xml.etree.ElementTree as ET
from lxml import etree

spotlight_url="http://model.dbpedia-spotlight.org/en/disambiguate"
headers = {'Accept': 'application/json'}

# Disambiguate the gold mentions of every AIDA and N3 article with DBpedia
# Spotlight and store the result on each mention as `sys_link`.
for articles in [aida_articles,n3_articles]:

    c=0
    for article in articles:
        # Build the request document: an <annotation text="..."> element with
        # one <surfaceForm name="..." offset="..."/> child per gold mention.
        annotation = etree.Element("annotation", text=article.content)
        for mention in article.entity_mentions:
            sf = etree.SubElement(annotation, "surfaceForm")
            sf.set("name", mention.mention)
            sf.set("offset", str(mention.begin_index))
        my_xml=etree.tostring(annotation, xml_declaration=True, encoding='UTF-8')

        # The serialized XML is sent url-encoded as the `text` field.
        results=requests.post(spotlight_url, urllib.parse.urlencode({'text':my_xml, 'confidence': 0.5}), headers=headers)
        j=results.json()

        # A response without a 'Resources' key is treated as "nothing linked".
        resources = j['Resources'] if 'Resources' in j else []

        # Index the returned links by the (stringified) offset reported by Spotlight.
        dis_entities={}
        for dis_entity in resources:
            dis_entities[str(dis_entity['@offset'])] = load_utils.getLinkRedirect(load_utils.normalizeURL(dis_entity['@URI']))

        # Mentions whose offset is missing from the response are marked '--NME--'.
        for entity in article.entity_mentions:
            start = entity.begin_index
            if str(start) in dis_entities:
                dis_url = dis_entities[str(start)]
            else:
                dis_url = '--NME--'
            entity.sys_link = dis_url
        c+=1
        print("Article %d out of %d" % (c, len(articles)))
        # Short pause between requests (presumably to go easy on the public endpoint).
        time.sleep(0.1)

In [None]:
# Persist the Spotlight annotations for AIDA and N3 (WES2015 and the separate
# RSS-500 collection are not stored here).
for dataset_title, annotated_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_system_data(dataset_title, 'spotlight', annotated_articles)

### 4) Run WAT

In [None]:
import json

wat_url = 'https://wat.d4science.org/wat/tag/json'
# Alternative endpoint, currently unused:
#wat_url='http://wikisense.mkapp.it/tag/disambiguate'

# Disambiguate the gold mentions of every AIDA and N3 article with WAT and
# store the result on each mention as `sys_link`.
for articles in [aida_articles, n3_articles]:
    c = 0
    for c, article in enumerate(articles, 1):
        # The gold mention boundaries are sent along as suggested spans.
        spans = [
            {'start': mention.begin_index, 'end': mention.end_index}
            for mention in article.entity_mentions
        ]
        document_json = {
            "text": article.content,
            "suggested_spans": spans,
        }

        r = requests.get(wat_url, params={"document": json.dumps(document_json)})
        rjson = r.json()

        # Index the returned titles by the (stringified) start offset reported by WAT.
        dis_entities = {
            str(annotation['start']): load_utils.getLinkRedirect(
                load_utils.normalizeURL(annotation['title']))
            for annotation in rjson['annotations']
        }

        # Mentions whose start offset is missing from the response are marked '--NME--'.
        for entity in article.entity_mentions:
            offset_key = str(entity.begin_index)
            entity.sys_link = (
                dis_entities[offset_key] if offset_key in dis_entities else '--NME--'
            )

        print("Article %d out of %d" % (c, len(articles)))

In [None]:
# Persist the WAT annotations for N3 and AIDA (WES2015 is not stored here).
load_utils.store_system_data(n3_title, 'wat', n3_articles)
# Restored the missing dot: `load_utilsstore_system_data` raised NameError, so
# the AIDA results were never written.
load_utils.store_system_data(aida_title, 'wat', aida_articles)
#store_system_data(wes_title, 'wat', wes_articles)