In [1]:
import dataparser
import time
import pickle
import load_utils

### 1) Load dataset(s) and show some basic stats

In [2]:
# Title and source file of each N3 sub-corpus.
rss_title, rss_file = 'rss500', 'data/N3/RSS-500.ttl'
reuters_title, reuters_file = 'reuters128', 'data/N3/Reuters-128.ttl'

# Title under which the merged N3 collection is stored.
n3_title = 'n3'

# Upper bound passed to the NIF loaders; lower it for quick test runs.
limit = 10 ** 8

In [3]:
# Parse the two N3 sub-corpora with the NIF loader.
rss_articles = dataparser.load_article_from_nif_file(
    rss_file,
    collection=rss_title,
    limit=limit,
)
reuters_articles = dataparser.load_article_from_nif_file(
    reuters_file,
    collection=reuters_title,
    limit=limit,
)

# Merge both collections with `|` (presumably a set union -- confirm against
# the return type of the loader).
n3_articles = reuters_articles | rss_articles

In [4]:
# Report how many articles the merged N3 collection holds.
n3_article_total = len(n3_articles)
print('N3: %d articles loaded!' % n3_article_total)

N3: 628 articles loaded!


In [None]:

# WES2015 is read with the NIF loader.
wes_title = 'wes2015'
wes_file = "data/wes2015-dataset-nif-1.2.rdf"
wes_articles = dataparser.load_article_from_nif_file(
    wes_file,
    collection=wes_title,
    limit=limit,
)
print('WES2015: %d articles loaded!' % len(wes_articles))

# AIDA is read with the CoNLL loader; no limit is passed here.
aida_title = 'aida'
aida_file = 'data/AIDA-YAGO2-dataset_topicsLowlevel.tsv'
aida_articles = dataparser.load_article_from_conll_file(aida_file)
print('AIDA: %d articles loaded!' % len(aida_articles))

In [None]:
def _count_mentions(articles):
    """Return the total number of entity mentions over all given articles."""
    total = 0
    for item in articles:
        total += len(item.entity_mentions)
    return total


num_mentions_n3 = _count_mentions(n3_articles)
print("N3: %d mentions in total!" % num_mentions_n3)

num_mentions_wes = _count_mentions(wes_articles)
print("WES2015: %d mentions in total!" % num_mentions_wes)

num_mentions_aida = _count_mentions(aida_articles)
print("AIDA: %d mentions in total!" % num_mentions_aida)



**1.1. Inspect encoding**

In [None]:
# Print the gold link of every RSS-500 mention, one per line, so the encoding
# of the links can be inspected by eye.
gold_links = (
    entity.gold_link
    for a in rss_articles
    for entity in a.entity_mentions
)
for gold in gold_links:
    print(gold)

In [None]:
def store_dataset(title, articles):
    """Pickle ``articles`` into the file ``<title>.bin``.

    :param title: dataset name; becomes the stem of the output file name
        (the path is relative to the current working directory).
    :param articles: the object to serialize with ``pickle``.
    """
    target_path = '%s.bin' % title
    with open(target_path, 'wb') as sink:
        pickle.dump(articles, sink)
        
def store_system_data(dataset, system, articles):
    """Pickle ``articles`` into the file ``<dataset>_<system>.bin``.

    :param dataset: dataset name; first part of the output file name.
    :param system: name of the annotating system; second part of the file name.
    :param articles: the object to serialize with ``pickle``.
    """
    target_path = '%s_%s.bin' % (dataset, system)
    with open(target_path, 'wb') as sink:
        pickle.dump(articles, sink)

In [None]:
# Persist the loaded AIDA and WES2015 collections as <title>.bin.
store_dataset(title=aida_title, articles=aida_articles)
store_dataset(title=wes_title, articles=wes_articles)

In [None]:
# Only the merged N3 collection is stored; the per-corpus dumps are disabled.
#store_dataset(rss_title, rss_articles)
#store_dataset(reuters_title, reuters_articles)
store_dataset(title=n3_title, articles=n3_articles)

### 2) AGDISTIS annotation

In [None]:
import systemparser

In [None]:
# Alternative client (disabled): the agdistispy package instead of systemparser.
#from agdistispy.agdistis import Agdistis
#ag = Agdistis()

# NOTE(review): `utils` is not imported in any cell shown above (only
# `load_utils` is) -- confirm which module provides getLinkRedirect and
# normalizeURL before running this cell on a fresh kernel.

# Stop after this many articles per collection.
max_articles = 10000

#for articles in [aida_articles]:
for articles in [n3_articles]:
    for c, article in enumerate(articles, start=1):
        # Wrap every gold mention in <entity>...</entity>, e.g.
        # '<entity>Barack Obama</entity> arrives in <entity>Washington, D.C.</entity>'.
        # Mentions are handled from the highest begin_index down so the
        # inserted tags never shift the offsets of mentions still to be wrapped;
        # sorting makes this independent of the order of entity_mentions.
        new_content = article.content
        mentions_back_to_front = sorted(
            article.entity_mentions,
            key=lambda mention: mention.begin_index,
            reverse=True,
        )
        for entity in mentions_back_to_front:
            entity_span = new_content[entity.begin_index:entity.end_index]
            new_content = (
                new_content[:entity.begin_index]
                + '<entity>' + entity_span + '</entity>'
                + new_content[entity.end_index:]
            )

        # results=ag.disambiguate(new_content)
        results = systemparser.disambiguateAgdistis(new_content)

        # Index the returned links by the start offset each result reports.
        dis_entities = {}
        for dis_entity in results:
            dis_entities[str(dis_entity['start'])] = utils.getLinkRedirect(utils.normalizeURL(dis_entity['disambiguatedURL']))

        # A mention the service returned nothing for gets the '--NME--' marker
        # (the same fallback the Spotlight and WAT cells use) instead of
        # aborting the whole run with a KeyError.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        print("Article %d out of %d" % (c, len(articles)))
        if c == max_articles:
            break

### Debugging: Check that offsets between SYS and GOLD match for all entities (prints every mention that has no system link)

In [None]:
# Print begin offset, system link and gold link of every N3 mention whose
# system link is empty/falsy.
# The loop variable must be `article`: binding it to another name would leave
# the inner loop reading whatever `article` an earlier cell left behind.
for article in n3_articles:
    for entity in article.entity_mentions:
        if not entity.sys_link:
            print(entity.begin_index, entity.sys_link, entity.gold_link)

### Store data on disk

In [None]:
# Persist the AGDISTIS-annotated N3 collection; the other datasets are disabled.
#store_system_data(aida_title, 'agdistis', aida_articles)
#store_system_data(wes_title, 'agdistis', wes_articles)
store_system_data(dataset=n3_title, system='agdistis', articles=n3_articles)

### 3) Run DBpedia Spotlight

In [None]:
import requests
import urllib.parse
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree is
# the replacement and keeps the ET alias available.
import xml.etree.ElementTree as ET
from lxml import etree
import time

spotlight_url="http://model.dbpedia-spotlight.org/en/disambiguate"
headers = {'Accept': 'application/json'}

# NOTE(review): `utils` is not imported in any cell shown above (only
# `load_utils` is) -- confirm which module provides getLinkRedirect and
# normalizeURL before running this cell on a fresh kernel.

for articles in [n3_articles]:
#for articles in [aida_articles,wes_articles]:
    for c, article in enumerate(articles, start=1):
        # Build an <annotation text="..."> document with one <surfaceForm>
        # child (name + offset) per gold mention.
        annotation = etree.Element("annotation", text=article.content)
        for mention in article.entity_mentions:
            sf = etree.SubElement(annotation, "surfaceForm")
            sf.set("name", mention.mention)
            sf.set("offset", str(mention.begin_index))
        my_xml = etree.tostring(annotation, xml_declaration=True, encoding='UTF-8')

        results = requests.post(spotlight_url, urllib.parse.urlencode({'text': my_xml, 'confidence': 0.5}), headers=headers)
        j = results.json()

        # The 'Resources' key may be absent from the response; treat that as
        # "no entity disambiguated".
        resources = j['Resources'] if 'Resources' in j else []

        # Index the returned URIs by the offset each resource reports.
        dis_entities = {}
        for dis_entity in resources:
            dis_entities[str(dis_entity['@offset'])] = utils.getLinkRedirect(utils.normalizeURL(dis_entity['@URI']))

        # Mentions without a returned resource are marked '--NME--'.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        print("Article %d out of %d" % (c, len(articles)))
        # Short pause between consecutive requests.
        time.sleep(0.1)

In [None]:
# Persist the Spotlight-annotated N3 collection; the other datasets are disabled.
#store_system_data(aida_title, 'spotlight', aida_articles)
#store_system_data(wes_title, 'spotlight', wes_articles)
#store_system_data(rss_title, 'spotlight', rss_articles)
store_system_data(dataset=n3_title, system='spotlight', articles=n3_articles)

### 4) Run WAT

In [None]:
import json

wat_url = 'https://wat.d4science.org/wat/tag/json'
#wat_url='http://wikisense.mkapp.it/tag/disambiguate'

# NOTE(review): `utils` is not imported in any cell shown above (only
# `load_utils` is) -- confirm which module provides getLinkRedirect and
# normalizeURL.

for articles in [n3_articles]:
    c = 0
    for article in articles:
        # One {'start', 'end'} span per gold mention, sent to WAT as
        # "suggested_spans" together with the article text.
        spans = [
            {'start': mention.begin_index, 'end': mention.end_index}
            for mention in article.entity_mentions
        ]
        document_json = {"text": article.content, "suggested_spans": spans}

        r = requests.get(wat_url, params={"document": json.dumps(document_json)})
        rjson = r.json()

        # Index the returned titles by the start offset of each annotation.
        dis_entities = {}
        for dis_entity in rjson['annotations']:
            resolved_link = utils.getLinkRedirect(utils.normalizeURL(dis_entity['title']))
            dis_entities[str(dis_entity['start'])] = resolved_link

        # Mentions without a returned annotation are marked '--NME--'.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        c += 1
        print("Article %d out of %d" % (c, len(articles)))

In [None]:
# Persist the WAT-annotated N3 collection; the other datasets are disabled.
store_system_data(dataset=n3_title, system='wat', articles=n3_articles)
#store_system_data(aida_title, 'wat', aida_articles)
#store_system_data(wes_title, 'wat', wes_articles)