### Spacy NER + Wikidata entity finder API

Adapted from a notebook on https://github.com/gossebouma/spacy-wikidata-ner

__TODO__: filter results, i.e. if something is a LOC or GPE according to spaCy, do not return the link for the family name (as it does now for Engeland, Brantes, etc.). On the other hand, the NEC is also wrong in many cases, so rigorous filtering might not work either.

see https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities
see https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities to get info on a specific QID, like instance or description

For instance, with wbgetentities we could require PERSON QIDs to be human, i.e. P31=Q5 (this works for Ruud Lubbers, Harry Mulisch, Frits Zernike, Sigrid Kaag). But Microsoft is a software company, business, technology company and public company; Amnesty International is a nongovernmental organisation, nonprofit organisation and nonprofit company; Volkskrant is a daily newspaper; Winsum is a populated place; Gemeente Winsum is a municipality in the Netherlands; etc. On the other hand, there are also attributes like inception (for ORG) or located in (for LOC) that might help.

This article has a pretty detailed example: https://medium.com/@dreamai/linking-extracted-entities-to-wikidata-why-and-how-168eacb4fb87
However, it uses cosine similarity and some other tricks that seem less useful.


In [None]:
import spacy

# Load the spaCy pipeline once; the cells below all reuse this `nlp` object.
nlp = spacy.load('nl_core_news_lg')

from spacy.tokens import Span

import requests

# Lookup table from span text to Wikidata QID, consulted by wikidata_entity_link.
# Is it OK to share this globally for a whole session?
#  - yes, because disambiguation does not take context or history into account;
#  - no, if you fiddle with disambiguation strategies (stale answers would be reused).
cached_spans = {}

def wikidata_entity_link(span) :
    """Return the Wikidata QID for the text of a span, using the session cache.

    The result of each search is stored in ``cached_spans`` under the span
    text, so repeated mentions of the same text (and repeated reads of the
    ``wikidata_id`` extension) trigger only one Wikidata search.

    Args:
        span: object exposing a ``text`` attribute (a spaCy ``Span``).

    Returns:
        Whatever ``wikidata_entity_search`` returns for ``span.text``; this
        includes its not-found sentinel, which is cached as well.
    """
    text = span.text
    try :
        return cached_spans[text]
    except KeyError :
        # Cache miss: search once and remember the answer, otherwise the
        # cache would stay empty and every access would hit the network.
        link = wikidata_entity_search(text)
        cached_spans[text] = link
        return link

# Expose the lookup as `span._.wikidata_id`; it is registered as a getter, so
# wikidata_entity_link is called each time the attribute is read.
Span.set_extension('wikidata_id',getter=wikidata_entity_link)

In [None]:
def wikidata_entity_search(text, language='nl', timeout=10) :
    """Search Wikidata for ``text`` and return the QID chosen by ``select_entity``.

    Args:
        text: the string to search for (sent as the ``search`` parameter).
        language: language code sent as the ``language`` parameter.
        timeout: seconds to wait for the HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The value returned by ``select_entity`` for the list of search hits.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = 'https://www.wikidata.org/w/api.php'
    params = {'action':'wbsearchentities',
              'language':language,
              'format':'json',
              'search': text}
    response = requests.get(url,params=params,timeout=timeout).json()
    # this part can be replaced by fancier disambiguation methods, or returning a list of ids from all search results
    # A payload without a 'search' key (e.g. an API error) is treated as "no hits"
    # instead of raising KeyError.
    return select_entity(response.get('search', []))

def select_entity(results) :
    """Pick one QID from a list of Wikidata search hits.

    The first hit is used unless ``familyname`` reports it as a family name,
    in which case the second hit is taken instead (without checking it).

    Args:
        results: sequence of hit dicts, each carrying an ``'id'`` key.

    Returns:
        The selected QID, or ``'no_qid_found'`` when there is no usable hit
        (empty list, missing second hit, missing ``'id'``) or when the
        family-name lookup fails.
    """
    try :
        qid = results[0]['id']
        if familyname(qid) :
            qid = results[1]['id']
    except Exception :
        # Best effort: any lookup problem counts as "not found". Catching
        # Exception rather than everything keeps Ctrl-C working during long runs.
        qid = 'no_qid_found'
    return qid

def wikidata_entity_info(qid, timeout=10) :
    """Fetch the claims of a Wikidata entity through ``wbgetentities``.

    Args:
        qid: entity id sent as the ``ids`` parameter.
        timeout: seconds to wait for the HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The decoded JSON response.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = 'https://www.wikidata.org/w/api.php'
    params = {'action':'wbgetentities',
              'sites': 'wikidatawiki',
              'format':'json',
              'ids': qid,
              'props':'claims'}
    return requests.get(url,params=params,timeout=timeout).json()
            
def familyname(qid) :
    """Report whether a Wikidata entity is an instance of family name.

    Looks at the P31 claims returned by ``wikidata_entity_info`` and checks
    whether any of them points at Q101352.

    Args:
        qid: entity id to inspect.

    Returns:
        1 if a P31 claim has value Q101352, otherwise 0. When the response
        holds no reachable P31 claims, prints ``'no P31'`` and returns 0.
    """
    info = wikidata_entity_info(qid)
    try :
        instances = info['entities'][qid]['claims']['P31'] #instance
    except (KeyError, TypeError) :
        # TypeError covers a claims value that is not a mapping (e.g. a list).
        print('no P31')
        return 0
    for instance in instances :
        try :
            target = instance['mainsnak']['datavalue']['value']['id']
        except (KeyError, TypeError) :
            # A claim without a concrete value must not abort the scan of
            # the remaining claims.
            continue
        if target == 'Q101352' : #familyname
            return 1
    return 0


def extract_entities(parse) :
    """Snapshot the entities of a parsed document as plain tuples.

    Reading ``wikidata_id`` calls wikidata_entity_link on every access, so the
    ids are read once here; work with the returned copy instead of going back
    to the spans.

    Args:
        parse: object whose ``ents`` attribute yields entity spans.

    Returns:
        A list of ``(text, label_, wikidata_id)`` tuples in ``parse.ents`` order.
    """
    return [(span.text, span.label_, span._.wikidata_id) for span in parse.ents]

In [None]:
# Smoke test: parse one sentence and show the (text, label, wikidata_id)
# tuples produced by extract_entities.
doc = nlp('Satellieten zoals de Astra gebruiken elke frequentie twee keer.')

entities = extract_entities(doc)
for ent in entities:
    print(ent)


### Annotate and Score

General functions for annotation with spaCy and for reporting scores. The spaCy NE annotation can be filtered using a given list of NE classes (NECLASSES) as used by spaCy. `score` is a dict with `system`, `gold` and `overlap` as keys and numeric values.

In [None]:
def system_annotation(entities,NECLASSES) :
    """Collect the QIDs the system proposes for the requested entity classes.

    Entities whose id is ``'no_qid_found'`` are ignored completely: they are
    not links, so they never contribute to precision or recall. Because the
    result is a set, identical QIDs coming from several mentions collapse into
    one (open question: should repeated entities be counted separately?).

    Args:
        entities: iterable of ``(text, label, qid)`` tuples.
        NECLASSES: container of labels to keep.

    Returns:
        The set of QIDs of the kept entities.
    """
    print(entities)
    return {entity[2] for entity in entities
            if entity[1] in NECLASSES and entity[2] != 'no_qid_found'}

def update_score(system,gold,score) :
    """Add the counts for one system/gold pair to the running ``score`` dict.

    Args:
        system: set of QIDs proposed by the system.
        gold: set of QIDs in the reference annotation.
        score: dict with numeric ``'overlap'``, ``'system'`` and ``'gold'``
            entries; it is updated in place.
    """
    increments = (
        ('overlap', len(system & gold)),
        ('system', len(system)),
        ('gold', len(gold)),
    )
    for key, amount in increments :
        score[key] += amount

def print_score(score) :
    """Print precision, recall and F-score computed from a score dict.

    Args:
        score: dict with numeric ``'overlap'``, ``'system'`` and ``'gold'``
            entries.

    A measure whose denominator is zero is reported as 0. Only division by
    zero is handled; a missing key raises KeyError rather than being silently
    printed as a zero score.
    """
    try :
        precision = score['overlap']/score['system']
    except ZeroDivisionError :
        precision = 0
    try :
        recall = score['overlap']/score['gold']
    except ZeroDivisionError :
        recall = 0
    try :
        fscore = 2 * ((precision * recall) / (precision + recall))
    except ZeroDivisionError :
        fscore = 0
    print('precision:{:.3f}, recall:{:.3f}, fscore:{:.3f}'.format(precision,recall,fscore))

def print_spacy_qids(qids,ents) :
    """Print the first entity tuple found for each QID in ``qids``.

    Args:
        qids: set of QIDs to report; each reported QID is removed from this
            set, so the caller's set is modified.
        ents: iterable of ``(text, label, qid)`` tuples.

    An empty line is printed after the entities.
    """
    for entity in ents :
        qid = entity[2]
        if qid not in qids :
            continue
        print(entity)
        # Dropping the QID ensures later mentions of it are not printed again.
        qids.discard(qid)
    print("")

### Evaluation on multiNERD

In [None]:
import csv

def filterNERD(tag, excluded=('ANIM', 'FOOD', 'DIS', 'PLANT')) :
    """Tell whether a multiNERD tag should be kept for evaluation.

    The class is the part after the first ``'-'`` of the tag (``'B-PER'``
    gives ``'PER'``). A tag without a ``'-'`` gets the class ``'MISSING'``,
    which is kept by default.

    Args:
        tag: tag string.
        excluded: classes to drop; defaults to the classes this notebook
            leaves out of the evaluation.

    Returns:
        True if the tag's class is not in ``excluded``, False otherwise.
    """
    try :
        neclass = tag.split('-')[1]
    except IndexError :
        # No '-' in the tag, so there is no class part to read.
        neclass = 'MISSING'
    return neclass not in excluded

def filterSpacy(tags) :
    """Keep only the ``CLASS/QID`` strings whose class is evaluated.

    Args:
        tags: iterable of strings of the form ``'<label>/<qid>'``.

    Returns:
        A new set holding the strings whose part before the first ``'/'`` is
        one of the accepted labels. (To switch filtering off, return ``tags``
        unchanged instead.)
    """
    accepted = ['EVENT','GPE','LOC','ORG','PERSON','DATE','TIME', 'WORK_OF_ART', 'PRODUCT' ]
    return {tag for tag in tags if tag.split('/')[0] in accepted}
    
def _score_nerd_sentence(sentence, annotation, score) :
    """Annotate one collected sentence with spaCy and add its counts to ``score``.

    Args:
        sentence: list of token strings collected for the sentence.
        annotation: set of gold ``'<tag>/<qid>'`` strings for the sentence.
        score: running score dict, updated in place by ``update_score``.
    """
    if not sentence :
        # Nothing collected (e.g. consecutive separator rows): nothing to score.
        return
    string = ' '.join(sentence)
    string = string.replace(' - ','-') # Engels - Nederlandse
    string = string.replace(' e ','e ') # 19 e eeuw
    print(string)
    doc = nlp(string)
    systemNE = {ent.label_ + '/' + ent._.wikidata_id for ent in doc.ents}
    system = filterSpacy(systemNE)
    print(systemNE,system,annotation)
    # Only the part after the '/' (the QID) is compared between system and gold.
    update_score( {result.split('/')[1] for result in system}, {result.split('/')[1] for result in annotation}, score)

# Rows with at least three columns are token rows; any shorter row closes the
# current sentence. NOTE(review): presumably column 1 is the token, column 2 the
# NE tag and column 4 the Wikidata id -- confirm against the data file.
with open("multiNERD/dev2000.tsv", encoding="utf-8") as nd:
    nerd = csv.reader(nd, delimiter="\t", quoting=csv.QUOTE_NONE)
    sentence = []
    annotation = set()
    score = {'overlap':0, 'system' : 0, 'gold' : 0}
    for row in nerd:
        if len(row) >= 3 :
            sentence.append(row[1])
            if len(row) > 4 and row[4] and filterNERD(row[2]) :
                annotation.add(row[2]+'/'+row[4])
        else :
            _score_nerd_sentence(sentence, annotation, score)
            sentence = []
            annotation = set()
    # Score the final sentence too: it is still pending when the file does not
    # end with a separator row.
    _score_nerd_sentence(sentence, annotation, score)
    print_score(score)

Over the first 1000 lines, precision and recall are both 0.59 if we apply filtering.
Including date/time: R 0.61, P 0.52.

latest version (1000 lines): precision:0.514, recall:0.632
over 2000 lines:             precision:0.517, recall:0.602

### Evaluate on WiNNL

In [None]:
import pandas

# Load the WiNNL data; the cells below read the 'qid', 'original' and 'url'
# fields of its rows.
winnl = pandas.read_json('WiNNL/dutch_winnl_data.json')


In [None]:
def winnl_annotations(row) :
    """Return the gold QIDs of one WiNNL row.

    Args:
        row: mapping whose ``'qid'`` entry is a sequence of tag strings.

    Returns:
        The set of tags that start with ``'B-'``, with every ``'B-'``
        occurrence removed from them; all other tags are ignored.
    """
    return {tag.replace('B-','') for tag in row['qid'] if tag.startswith('B-')}

def evaluate_winnl_items(First,Last) :
    """Score the system on the WiNNL rows with index labels First .. Last-1.

    Rows without gold QIDs, or whose gold set contains ``'Q404'``, are skipped.
    For the last row that was actually scored, the overlap, the system-only
    QIDs and the gold-only QIDs are printed, followed by the overall score.

    Args:
        First: first index label to evaluate (inclusive).
        Last: index label at which to stop (exclusive).
    """
    score = {'overlap':0, 'system' : 0, 'gold' : 0}
    last_scored = None
    for Id in range(First,Last) :
        row = winnl.loc[Id]
        gold = winnl_annotations(row)
        if gold and 'Q404' not in gold :
            ## old: ['EVENT','GPE','LOC','ORG','PERSON','DATE','TIME']
            # NOTE(review): annotate_text is not defined in the visible part of
            # this file -- confirm it exists before running this cell.
            system = annotate_text(row['original'], ['LOC','GPE','ORG','PERSON','EVENT'] )
            print(system,gold)
            update_score(system,gold,score)
            last_scored = (system,gold)
    # Report details only for a pair that was really scored together; using the
    # loop variables directly fails when no row qualified and mixes rows when
    # the last row was skipped.
    if last_scored is not None :
        system, gold = last_scored
        print(system & gold)
        print(system.difference(system & gold))
        print(gold.difference(system & gold))
    print_score(score)

In [None]:
# Score a small slice of WiNNL: index labels 4225 up to, but not including, 4235.
evaluate_winnl_items(4225,4235)

recall:0.445, precision:0.430

after strict filtering (loc/gpe/per/org only, and ignore data with 404 or no NE): recall:0.495, precision:0.589


In [None]:
def evaluate_winnl_article(url) : # solve the Obama/Barack Obama issue, ie ensure they point to same QID
    """Score all WiNNL rows that belong to one article.

    Entities are first collected for every row of the article, so that
    ``resolve_entities`` can look at the whole article when it resolves the
    entities of a single row.

    Args:
        url: value of the ``'url'`` column identifying the article.

    Rows without gold QIDs, or whose gold set contains ``'Q404'``, are not
    scored. The resulting score is printed.
    """
    # NOTE(review): find_entities and resolve_entities are not defined in the
    # visible part of this file -- confirm they exist before running this.
    entities = {}
    gold = {}
    score = {'overlap':0, 'system' : 0, 'gold' : 0}
    for index, row in winnl.loc[winnl['url'] == url].iterrows() :
        entities[index] = find_entities(row['original'])
        gold[index] = winnl_annotations(row)
    for index in entities :
        if gold[index] and 'Q404' not in gold[index] :
            system = resolve_entities(index,entities)
            # Compare against this row's gold set, not the whole mapping.
            update_score(system,gold[index],score)
    print_score(score)
        

### Evaluate on damuel

In [None]:
import pandas

# Load one part of the damuel data; the file is read as JSON lines (one record
# per line). The cells below read the 'wiki' field of its rows.
damuel = pandas.read_json('damuel_1.0_nl/part-00000', lines=True)

import pickle

# Table looked up by QID in damuel_annotation. The file is opened in a `with`
# block so the handle is closed after loading.
# NOTE: pickle can execute arbitrary code while loading; only use this with a
# file from a trusted source.
with open('damuel_1.0_nl/damuel_1.0_wikidata/all_nec_dict.p', "rb") as nec_file :
    nec_types = pickle.load(nec_file)


In [None]:
def damuel_annotation(wiki) :
    """Return the gold QIDs of one damuel document.

    A QID is kept when it occurs as the ``'qid'`` of one of the document's
    links and is also a key of ``nec_types``. Each kept QID is printed
    together with its ``nec_types`` entry, in sorted QID order.

    Args:
        wiki: mapping with a ``'links'`` list of link dicts.

    Returns:
        The set of kept QIDs.
    """
    links = set()
    for link in wiki['links'] :
        qid = link.get('qid')
        if qid :
            # Links without a qid carry nothing to evaluate against.
            links.add(qid)
    annotation = set()
    for qid in sorted(links) :
        try :
            NEC = nec_types[qid]
        except KeyError :
            # QID not listed in nec_types: leave it out of the gold set.
            continue
        print(qid,NEC)
        annotation.add(qid)
    return annotation

def damuel_annotation_old(wiki) :
    """Earlier gold extraction: QIDs of links that cover a proper noun.

    A link is kept when at least one token in ``wiki['tokens'][start:end]``
    has ``upostag == 'PROPN'``. A kept link without a ``'qid'`` contributes
    the placeholder ``'missing'``.

    Args:
        wiki: mapping with ``'links'`` (dicts with ``'start'``, ``'end'`` and
            optionally ``'qid'``) and ``'tokens'`` (dicts with ``'upostag'``).

    Returns:
        The set of QIDs (or ``'missing'``) of the kept links.
    """
    annotation = set()
    for link in wiki['links'] :
        covered = wiki['tokens'][link['start']:link['end']]
        if any(token['upostag'] == 'PROPN' for token in covered) :
            annotation.add(link.get('qid', 'missing'))
    return annotation

def print_damuel_annotation(qids,wiki) :
    """Print the first link found in a damuel document for each QID in ``qids``.

    For every reported link the QID, the lemmas and the upostags of the
    covered tokens, and the link title are printed.

    Args:
        qids: set of QIDs to report; each reported QID is removed from this
            set, so the caller's set is modified.
        wiki: mapping with ``'links'`` (dicts with ``'start'``, ``'end'``,
            ``'title'`` and optionally ``'qid'``) and ``'tokens'`` (dicts with
            ``'lemma'`` and ``'upostag'``).
    """
    for link in wiki['links'] :
        # Links without a qid are matched under the placeholder 'missing'.
        link_id = link.get('qid', 'missing')
        if link_id not in qids :
            continue
        covered = wiki['tokens'][link['start']:link['end']]
        string = [token['lemma'] for token in covered]
        upostags = [token['upostag'] for token in covered]
        print(link_id, string, upostags, link['title'])
        # Report each QID once, even if several links point at it.
        qids.discard(link_id)
        

In [None]:
# Worked example on a single damuel document (index label 95).
wiki = damuel.loc[95]['wiki']

# Earlier quick look at the spaCy labels, kept for reference:
#text = nlp(wiki['text']) 
#for ent in text.ents :
#    print(ent.label_) 

# System side: entities found by spaCy, restricted to the listed classes.
parse = nlp(wiki['text'])
entities = extract_entities(parse)
system = system_annotation(entities,['LOC','GPE','ORG','PERSON','EVENT','WORK_OF_ART']) 

# Gold side: QIDs taken from the links of the document itself.
gold = damuel_annotation(wiki)

score = {'overlap':0, 'system' : 0, 'gold' : 0}
update_score(system,gold,score)

#print(system,damuel_annotation(wiki)) 
print_score(score)
# Both print helpers remove the QIDs they report from the set they receive;
# each call below is handed a freshly built set, so `system` and `gold` stay intact.
print("overlap:")
print_spacy_qids(system & gold,entities)
print("system only:")
print_spacy_qids(system.difference(system & gold),entities)
print("annotation only:")
print_damuel_annotation(gold.difference(system & gold),wiki)


In [None]:
def evaluate_damuel(First,Last) :
    """Score the system on the damuel documents with index labels First .. Last-1.

    A document whose annotation fails is skipped with a message naming it and
    the error, and the evaluation continues with the next document. The
    overall score is printed at the end.

    Args:
        First: first index label to evaluate (inclusive).
        Last: index label at which to stop (exclusive).
    """
    score = {'overlap':0, 'system' : 0, 'gold' : 0}
    for Id in range(First,Last) :
        wiki = damuel.loc[Id]['wiki']
        try :
            parse = nlp(wiki['text'])
            entities = extract_entities(parse)
            system = system_annotation(entities,['LOC','GPE','ORG','PERSON','EVENT','WORK_OF_ART','PRODUCT'])
            gold = damuel_annotation(wiki)
        except Exception as error :
            # Stay best-effort, but say which document was dropped and why;
            # catching Exception (not everything) keeps Ctrl-C working.
            print('skipping document', Id, repr(error))
            continue
        update_score(system,gold,score)
    print_score(score)

In [None]:
# Score the damuel documents with index labels 0 up to, but not including, 100.
evaluate_damuel(0,100)