In [None]:
import dataparser
import time
import pickle
import load_utils
from tqdm import tqdm

### 1) Load dataset(s) and show some basic stats

In [None]:
# Titles and source files of the two N3 sub-corpora (both NIF/Turtle files).
rss_title, rss_file = 'rss500', 'data/N3/RSS-500.ttl'
reuters_title, reuters_file = 'reuters128', 'data/N3/Reuters-128.ttl'

# Title used for the merged RSS-500 + Reuters-128 collection.
n3_title = 'n3'

# Limit handed to the NIF loader; lower it for quick test runs.
limit = 100000000

In [None]:
# Parse each NIF corpus, tagging its articles with the corresponding collection title.
rss_articles = dataparser.load_article_from_nif_file(
    rss_file,
    collection=rss_title,
    limit=limit,
)
reuters_articles = dataparser.load_article_from_nif_file(
    reuters_file,
    collection=reuters_title,
    limit=limit,
)

# N3 is the union of the two corpora.
n3_articles = reuters_articles | rss_articles

In [None]:
# Report the size of the merged N3 collection.
n3_article_count = len(n3_articles)
print('N3: %d articles loaded!' % n3_article_count)

In [None]:
# The AIDA corpus is read from a TSV file through the TSV loader.
aida_title, aida_file = 'aida', 'data/AIDA-YAGO2-dataset_topicsLowlevel.tsv'
aida_articles = dataparser.load_article_from_tsv_file(aida_file)

# Report the size of the AIDA collection.
aida_article_count = len(aida_articles)
print('AIDA: %d articles loaded!' % aida_article_count)

In [None]:
def _total_mentions(collection):
    """Return the number of entity mentions summed over every article in `collection`."""
    return sum(len(doc.entity_mentions) for doc in collection)


# Total number of gold entity mentions per dataset.
num_mentions_n3 = _total_mentions(n3_articles)
print("N3: %d mentions in total!" % num_mentions_n3)

num_mentions_aida = _total_mentions(aida_articles)
print("AIDA: %d mentions in total!" % num_mentions_aida)



In [None]:
# Persist the loaded datasets (content anonymized). The RSS-500 and Reuters-128
# corpora are not stored individually; only the merged N3 collection is.
for dataset_title, dataset_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_dataset(dataset_title, dataset_articles, anonymize_content=True)

**1.1. Inspect encoding**

In [None]:
# Print the gold link of every RSS-500 mention so its encoding can be checked by eye.
for rss_article in rss_articles:
    for entity in rss_article.entity_mentions:
        print(entity.gold_link)

### 2) AGDISTIS annotation

In [None]:
# Run AGDISTIS over every article of both datasets and store the returned link
# of each mention in `entity.sys_link`.
for articles in [aida_articles, n3_articles]:
    c=0
    for article in tqdm(articles):
        new_content = article.content
        # Wrap each mention in <entity>...</entity>. Tags are inserted from the last
        # mention to the first so the offsets of mentions still to be processed stay
        # valid; sorting explicitly avoids relying on the stored order of the mentions.
        mentions_last_to_first = sorted(
            article.entity_mentions,
            key=lambda mention: mention.begin_index,
            reverse=True,
        )
        for entity in mentions_last_to_first:
            entity_span=new_content[entity.begin_index: entity.end_index]
            new_content=new_content[:entity.begin_index] + '<entity>' + entity_span + '</entity>' + new_content[entity.end_index:]

        results = load_utils.disambiguate_agdistis(new_content)

        # Index the system answers by start offset (as a string).
        dis_entities={}
        for dis_entity in results:
            dis_entities[str(dis_entity['start'])] = load_utils.getLinkRedirect(load_utils.normalizeURL(dis_entity['disambiguatedURL']))

        # A mention without a system answer gets the same '--NME--' marker that the
        # Spotlight and WAT cells use, instead of aborting the whole run with a KeyError.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        c+=1
        # Hard cap: at most 10000 articles are annotated per dataset.
        if c==10000:
            break


### Debugging: Check if offsets between SYS and GOLD match for all entities

In [None]:
# Print every N3 mention that has no system link after annotation.
# The loop variable must be `article`: the inner loop reads `article.entity_mentions`,
# so iterating as `articles` would silently reuse a stale `article` from an earlier cell.
for article in n3_articles:
    for entity in article.entity_mentions:
        if not entity.sys_link:
            print(entity.begin_index, entity.sys_link, entity.gold_link)

### Store data on disk

In [None]:
# Persist the AGDISTIS annotations of both datasets (content anonymized).
for dataset_title, annotated_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_system_data(dataset_title, 'agdistis', annotated_articles, anonymize_content=True)

### 3) Run DBpedia Spotlight

In [None]:
import requests
import urllib.parse
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is its replacement
# and keeps the `ET` alias available.
import xml.etree.ElementTree as ET
from lxml import etree

#spotlight_url="http://model.dbpedia-spotlight.org/en/disambiguate" # February 2018
spotlight_url="http://spotlight.fii800.lod.labs.vu.nl/rest/disambiguate" # April 2016
headers = {'Accept': 'application/json'}

# Run DBpedia Spotlight in disambiguation mode over every article of both datasets
# and store the returned link of each mention in `entity.sys_link`.
for articles in [aida_articles,n3_articles]:

    c=0
    for article in tqdm(articles):
        # Build the <annotation text="..."> document with one <surfaceForm> child
        # per gold mention (surface string and start offset).
        annotation = etree.Element("annotation", text=article.content)
        for mention in article.entity_mentions:
            sf = etree.SubElement(annotation, "surfaceForm")
            sf.set("name", mention.mention)
            sf.set("offset", str(mention.begin_index))
        my_xml=etree.tostring(annotation, xml_declaration=True, encoding='UTF-8')

        # The XML is sent form-encoded in the `text` field, with confidence 0.5.
        results=requests.post(spotlight_url, urllib.parse.urlencode({'text':my_xml, 'confidence': 0.5}), headers=headers)
        j=results.json()

        # A response without a 'Resources' key is treated as "no mention disambiguated".
        resources = j.get('Resources', [])

        # Index the system answers by start offset (as a string).
        dis_entities={}
        for dis_entity in resources:
            dis_entities[str(dis_entity['@offset'])] = load_utils.getLinkRedirect(load_utils.normalizeURL(dis_entity['@URI']))

        # Mentions for which Spotlight returned nothing are marked '--NME--'.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        c+=1
        print("Article %d out of %d" % (c, len(articles)))
        # Short pause between consecutive requests to the service.
        time.sleep(0.1)

In [None]:
# Persist the Spotlight annotations of both datasets (content anonymized).
for dataset_title, annotated_articles in (
    (aida_title, aida_articles),
    (n3_title, n3_articles),
):
    load_utils.store_system_data(dataset_title, 'spotlight', annotated_articles, anonymize_content=True)

### 4) Run WAT

In [None]:
import json

wat_url = 'https://wat.d4science.org/wat/tag/json'
# Alternative endpoint: http://wikisense.mkapp.it/tag/disambiguate

# Run WAT over every article of both datasets, passing the gold mention spans as
# suggested spans, and store the returned link of each mention in `entity.sys_link`.
for articles in [aida_articles, n3_articles]:
    c = 0
    for article in tqdm(articles):
        # Request document: the article text plus one start/end span per gold mention.
        document_json = {
            "text": article.content,
            "suggested_spans": [
                {'start': mention.begin_index, 'end': mention.end_index}
                for mention in article.entity_mentions
            ],
        }

        r = requests.get(wat_url, params={"document": json.dumps(document_json)})
        rjson = r.json()

        # Index the system answers by start offset (as a string).
        dis_entities = {
            str(annotation['start']): load_utils.getLinkRedirect(load_utils.normalizeURL(annotation['title']))
            for annotation in rjson['annotations']
        }

        # Mentions for which WAT returned nothing are marked '--NME--'.
        for entity in article.entity_mentions:
            entity.sys_link = dis_entities.get(str(entity.begin_index), '--NME--')

        c += 1
        print("Article %d out of %d" % (c, len(articles)))

In [None]:
# Persist the WAT annotations (content anonymized): N3 first, then AIDA.
for dataset_title, annotated_articles in (
    (n3_title, n3_articles),
    (aida_title, aida_articles),
):
    load_utils.store_system_data(dataset_title, 'wat', annotated_articles, anonymize_content=True)