In [1]:
import dataparser
import time
import pickle
import load_utils

## 1) Load dataset(s) and show some basic stats

In [2]:
# N3 is made of two corpora (RSS-500 and Reuters-128); each has its own
# title and file, and the merged collection gets a title of its own.
rss_title, rss_file = 'rss500', 'data/N3/RSS-500.ttl'
reuters_title, reuters_file = 'reuters128', 'data/N3/Reuters-128.ttl'
n3_title = 'n3'

# Passed as the `limit` argument to the NIF loaders below (for testing purposes).
limit = 10 ** 8

In [3]:
# Read the two N3 sub-corpora with the NIF loader, tagging each with its
# collection title.
rss_articles = dataparser.load_article_from_nif_file(
    rss_file,
    collection=rss_title,
    limit=limit,
)
reuters_articles = dataparser.load_article_from_nif_file(
    reuters_file,
    collection=reuters_title,
    limit=limit,
)

# N3 is the combination of both collections (Reuters on the left of `|`).
n3_articles = reuters_articles | rss_articles

In [4]:
# Report the size of the merged N3 collection.
n3_article_count = len(n3_articles)
print('N3: %d articles loaded!' % n3_article_count)

N3: 628 articles loaded!


In [5]:

# WES2015 goes through the same NIF loader as the N3 corpora.
wes_title = 'wes2015'
wes_file = "data/wes2015-dataset-nif-1.2.rdf"
wes_articles = dataparser.load_article_from_nif_file(
    wes_file,
    collection=wes_title,
    limit=limit,
)
wes_article_count = len(wes_articles)
print('WES2015: %d articles loaded!' % wes_article_count)

# AIDA is read from a TSV file with the CoNLL loader, which is called with
# the file path only (no collection title, no limit).
aida_title = 'aida'
aida_file = 'data/AIDA-YAGO2-dataset_topicsLowlevel.tsv'
aida_articles = dataparser.load_article_from_conll_file(aida_file)
aida_article_count = len(aida_articles)
print('AIDA: %d articles loaded!' % aida_article_count)

WES2015: 366 articles loaded!
AIDA: 1393 articles loaded!


In [6]:
def _count_mentions(articles):
    """Return the summed length of `entity_mentions` over all `articles`."""
    total = 0
    for article in articles:
        total += len(article.entity_mentions)
    return total


# Total number of entity mentions per dataset.
num_mentions_n3 = _count_mentions(n3_articles)
print("N3: %d mentions in total!" % num_mentions_n3)

num_mentions_wes = _count_mentions(wes_articles)
print("WES2015: %d mentions in total!" % num_mentions_wes)

num_mentions_aida = _count_mentions(aida_articles)
print("AIDA: %d mentions in total!" % num_mentions_aida)



N3: 1880 mentions in total!
WES2015: 28587 mentions in total!
AIDA: 34929 mentions in total!


In [8]:
# Hand each loaded dataset to load_utils.store_dataset under its title.
# RSS-500 and Reuters-128 are not stored individually (those calls were
# disabled in the original cell); only the merged N3 collection is.
for dataset_title, dataset_articles in (
    (aida_title, aida_articles),
    (wes_title, wes_articles),
    (n3_title, n3_articles),
):
    load_utils.store_dataset(dataset_title, dataset_articles)

**1.1. Inspect encoding**

In [9]:
# Print the gold link of every mention in the RSS-500 articles, one per
# line, so the link strings can be inspected by eye.
for rss_article in rss_articles:
    for mention in rss_article.entity_mentions:
        print(mention.gold_link)

Bay_Area_Laboratory_Co-operative
Victor_Conte
--NME--
--NME--
Football_Association_of_Ireland
Giovanni_Trapattoni
--NME--
--NME--
--NME--
Dish_Network
--NME--
Portland,_Maine
New_England_Cable_News
--NME--
--NME--
Arab_Community_Center_for_Economic_and_Social_Services
Arthur_Caplan
--NME--
Miami_International_Airport
--NME--
--NME--
--NME--
Eric_Kearney
Jon_A._Husted
Yayoi_Kusama
New_York
--NME--
--NME--
Major_League_Baseball
Rico_Brogna
--NME--
--NME--
Federal_Department_of_Economic_Affairs,_Education_and_Research
Federal_Department_of_Economic_Affairs,_Education_and_Research
--NME--
Richard_H._Bernstein
--NME--
--NME--
--NME--
Nebraska_Cornhuskers_football
Fox
--NME--
--NME--
Insurance_Institute_for_Highway_Safety
--NME--
Roman_Catholic_Archdiocese_of_Portland_in_Oregon
--NME--
Gatineau
San_Francisco–Oakland_Bay_Bridge
--NME--
--NME--
First_Student_(United_States)
Associated_Press
--NME--
Department_for_Education
--NME--
--NME--
Washington,_D.C.
Cherokee_Village,_Arkansas
--NME--
--N

### 2) AGDISTIS annotation

In [10]:
import systemparser

In [12]:

# Run AGDISTIS over AIDA first, then N3, and write each mention's
# disambiguated link to `sys_link`.
for articles in [aida_articles, n3_articles]:
    for position, article in enumerate(articles, start=1):
        # Wrap every gold mention in <entity>...</entity> tags. Mentions are
        # processed in reverse so that an inserted tag does not move the text
        # of mentions handled afterwards (assumes entity_mentions is ordered
        # by begin_index — TODO confirm).
        tagged_text = article.content
        for mention in reversed(article.entity_mentions):
            begin, end = mention.begin_index, mention.end_index
            tagged_text = (
                tagged_text[:begin]
                + '<entity>' + tagged_text[begin:end] + '</entity>'
                + tagged_text[end:]
            )

        results = systemparser.disambiguateAgdistis(tagged_text)

        # Map the reported start offset (as a string) to the normalized,
        # redirect-resolved link.
        link_by_start = {
            str(annotation['start']): load_utils.getLinkRedirect(
                load_utils.normalizeURL(annotation['disambiguatedURL'])
            )
            for annotation in results
        }

        for mention in article.entity_mentions:
            # NOTE(review): a mention whose begin_index is not among the
            # returned offsets raises KeyError here; the Spotlight and WAT
            # cells fall back to '--NME--' instead.
            mention.sys_link = link_by_start[str(mention.begin_index)]

        print("Article %d out of %d" % (position, len(articles)))
        # Stop after 10000 articles of a collection.
        if position == 10000:
            break

Article 1 out of 1393
Article 2 out of 1393
Article 3 out of 1393
Article 4 out of 1393
Article 5 out of 1393
Article 6 out of 1393
Article 7 out of 1393
Article 8 out of 1393
Article 9 out of 1393
Article 10 out of 1393
Article 11 out of 1393
Article 12 out of 1393
Article 13 out of 1393
Article 14 out of 1393
Article 15 out of 1393
Article 16 out of 1393
Article 17 out of 1393
Article 18 out of 1393
Article 19 out of 1393
Article 20 out of 1393
Article 21 out of 1393
Article 22 out of 1393
Article 23 out of 1393
Article 24 out of 1393
Article 25 out of 1393
Article 26 out of 1393
Article 27 out of 1393
Article 28 out of 1393
Article 29 out of 1393
Article 30 out of 1393
Article 31 out of 1393
Article 32 out of 1393
Article 33 out of 1393
Article 34 out of 1393
Article 35 out of 1393
Article 36 out of 1393
Article 37 out of 1393
Article 38 out of 1393
Article 39 out of 1393
Article 40 out of 1393
Article 41 out of 1393
Article 42 out of 1393
Article 43 out of 1393
Article 44 out of 13

Article 347 out of 1393
Article 348 out of 1393
Article 349 out of 1393
Article 350 out of 1393
Article 351 out of 1393
Article 352 out of 1393
Article 353 out of 1393
Article 354 out of 1393
Article 355 out of 1393
Article 356 out of 1393
Article 357 out of 1393
Article 358 out of 1393
Article 359 out of 1393
Article 360 out of 1393
Article 361 out of 1393
Article 362 out of 1393
Article 363 out of 1393
Article 364 out of 1393
Article 365 out of 1393
Article 366 out of 1393
Article 367 out of 1393
Article 368 out of 1393
Article 369 out of 1393
Article 370 out of 1393
Article 371 out of 1393
Article 372 out of 1393
Article 373 out of 1393
Article 374 out of 1393
Article 375 out of 1393
Article 376 out of 1393
Article 377 out of 1393
Article 378 out of 1393
Article 379 out of 1393
Article 380 out of 1393
Article 381 out of 1393
Article 382 out of 1393
Article 383 out of 1393
Article 384 out of 1393
Article 385 out of 1393
Article 386 out of 1393
Article 387 out of 1393
Article 388 out 

Article 689 out of 1393
Article 690 out of 1393
Article 691 out of 1393
Article 692 out of 1393
Article 693 out of 1393
Article 694 out of 1393
Article 695 out of 1393
Article 696 out of 1393
Article 697 out of 1393
Article 698 out of 1393
Article 699 out of 1393
Article 700 out of 1393
Article 701 out of 1393
Article 702 out of 1393
Article 703 out of 1393
Article 704 out of 1393
Article 705 out of 1393
Article 706 out of 1393
Article 707 out of 1393
Article 708 out of 1393
Article 709 out of 1393
Article 710 out of 1393
Article 711 out of 1393
Article 712 out of 1393
Article 713 out of 1393
Article 714 out of 1393
Article 715 out of 1393
Article 716 out of 1393
Article 717 out of 1393
Article 718 out of 1393
Article 719 out of 1393
Article 720 out of 1393
Article 721 out of 1393
Article 722 out of 1393
Article 723 out of 1393
Article 724 out of 1393
Article 725 out of 1393
Article 726 out of 1393
Article 727 out of 1393
Article 728 out of 1393
Article 729 out of 1393
Article 730 out 

Article 1030 out of 1393
Article 1031 out of 1393
Article 1032 out of 1393
Article 1033 out of 1393
Article 1034 out of 1393
Article 1035 out of 1393
Article 1036 out of 1393
Article 1037 out of 1393
Article 1038 out of 1393
Article 1039 out of 1393
Article 1040 out of 1393
Article 1041 out of 1393
Article 1042 out of 1393
Article 1043 out of 1393
Article 1044 out of 1393
Article 1045 out of 1393
Article 1046 out of 1393
Article 1047 out of 1393
Article 1048 out of 1393
Article 1049 out of 1393
Article 1050 out of 1393
Article 1051 out of 1393
Article 1052 out of 1393
Article 1053 out of 1393
Article 1054 out of 1393
Article 1055 out of 1393
Article 1056 out of 1393
Article 1057 out of 1393
Article 1058 out of 1393
Article 1059 out of 1393
Article 1060 out of 1393
Article 1061 out of 1393
Article 1062 out of 1393
Article 1063 out of 1393
Article 1064 out of 1393
Article 1065 out of 1393
Article 1066 out of 1393
Article 1067 out of 1393
Article 1068 out of 1393
Article 1069 out of 1393


KeyboardInterrupt: 

### Debugging: Check if offsets between SYS and GOLD match for all entities

In [None]:
# List every N3 mention whose `sys_link` is empty/falsy, printing its start
# offset, system link and gold link.
# The loop variable must be `article`: the original bound `articles` and then
# read a stale `article` left over from an earlier cell, so it never walked
# the N3 collection.
for article in n3_articles:
    for entity in article.entity_mentions:
        if not entity.sys_link:
            print(entity.begin_index, entity.sys_link, entity.gold_link)

### Store data on disk

In [None]:
# Hand the annotated AIDA and N3 articles to load_utils.store_system_data
# under the system name 'agdistis'. WES2015 is not stored here (its call was
# disabled in the original cell).
for dataset_title, dataset_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_system_data(dataset_title, 'agdistis', dataset_articles)

### 3) Run DBpedia Spotlight

In [13]:
import requests
import urllib.parse
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and keeps the `ET` alias importable.
import xml.etree.ElementTree as ET
from lxml import etree

spotlight_url="http://model.dbpedia-spotlight.org/en/disambiguate"
headers = {'Accept': 'application/json'}

# Run DBpedia Spotlight disambiguation over AIDA first, then N3, and write
# each mention's link to `sys_link`.
for articles in [aida_articles,n3_articles]:

    c=0
    for article in articles:
        # Build an <annotation text="..."> element with one <surfaceForm>
        # child (name + offset) per gold mention.
        annotation = etree.Element("annotation", text=article.content)
        for mention in article.entity_mentions:
            sf = etree.SubElement(annotation, "surfaceForm")
            sf.set("name", mention.mention)
            sf.set("offset", str(mention.begin_index))
        my_xml=etree.tostring(annotation, xml_declaration=True, encoding='UTF-8')

        # The serialized XML is sent as the url-encoded `text` field.
        results=requests.post(spotlight_url, urllib.parse.urlencode({'text':my_xml, 'confidence': 0.5}), headers=headers)
        j=results.json()

        # A response without a 'Resources' entry is treated as "nothing linked".
        resources = j['Resources'] if 'Resources' in j else []
        dis_entities={}
        for dis_entity in resources:
            dis_entities[str(dis_entity['@offset'])] = load_utils.getLinkRedirect(load_utils.normalizeURL(dis_entity['@URI']))

        # Mentions whose offset is absent from the response are marked '--NME--'.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')
        c+=1
        print("Article %d out of %d" % (c, len(articles)))
        # Short pause between consecutive requests.
        time.sleep(0.1)

Article 1 out of 1393
Article 2 out of 1393
Article 3 out of 1393
Article 4 out of 1393
Article 5 out of 1393
Article 6 out of 1393
Article 7 out of 1393
Article 8 out of 1393
Article 9 out of 1393


KeyboardInterrupt: 

In [None]:
# Hand the annotated AIDA and N3 articles to load_utils.store_system_data
# under the system name 'spotlight'. WES2015 and RSS-500 are not stored here
# (their calls were disabled in the original cell).
for dataset_title, dataset_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_system_data(dataset_title, 'spotlight', dataset_articles)

### 4) Run WAT

In [14]:
import json

wat_url = 'https://wat.d4science.org/wat/tag/json'
# Alternative endpoint, kept disabled as in the original cell:
#wat_url='http://wikisense.mkapp.it/tag/disambiguate'

# Run WAT over AIDA first, then N3, and write each mention's link to
# `sys_link`.
for articles in [aida_articles, n3_articles]:
    for position, article in enumerate(articles, start=1):
        # Offer the gold mention boundaries to WAT as suggested spans.
        suggested_spans = [
            {'start': mention.begin_index, 'end': mention.end_index}
            for mention in article.entity_mentions
        ]
        payload = json.dumps({
            "text": article.content,
            "suggested_spans": suggested_spans,
        })

        # The JSON document is passed in the `document` query parameter.
        response = requests.get(wat_url, params={"document": payload})
        response_json = response.json()

        # Map the reported start offset (as a string) to the normalized,
        # redirect-resolved title.
        link_by_start = {
            str(annotation['start']): load_utils.getLinkRedirect(
                load_utils.normalizeURL(annotation['title'])
            )
            for annotation in response_json['annotations']
        }

        # Mentions whose offset is absent from the response are marked '--NME--'.
        for mention in article.entity_mentions:
            mention.sys_link = link_by_start.get(str(mention.begin_index), '--NME--')

        print("Article %d out of %d" % (position, len(articles)))

Article 1 out of 1393
Article 2 out of 1393
Article 3 out of 1393
Article 4 out of 1393
Article 5 out of 1393
Article 6 out of 1393


KeyboardInterrupt: 

In [None]:
# Hand the annotated N3 and AIDA articles to load_utils.store_system_data
# under the system name 'wat'.
load_utils.store_system_data(n3_title, 'wat', n3_articles)
# Fixed: the original read `load_utilsstore_system_data` (missing dot), which
# raises NameError before the AIDA results are stored.
load_utils.store_system_data(aida_title, 'wat', aida_articles)
#store_system_data(wes_title, 'wat', wes_articles)