In [1]:
from IPython.core.display import display, HTML
# Inject a CSS rule that sets the page's `.container` element to 95% width.
display(HTML("<style>.container {width:95% !important; }</style>"))

## TagMe: Entity Linking on the Fly 

## Entity Linking

The task of recognizing mentions of named entities in a text and disambiguating them by linking each one to its entry in a knowledge base.

<center><img src="Figures/iowa.png" height="50%" width="50%" align="center"></center>

## TagMe

TagMe (Ferragina and Scaiella, CIKM 2010): language-independent annotation of short texts.

A must-have end-to-end baseline in Entity Linking. 

## Why I like TagMe

- It answers a pressing need in DH and CSS.

- It adds semantics without dealing (directly) with it.

- It is language-independent.

- It is very intuitive.

## 1. Mention Identification via Link Probability

In [2]:
import nltk

# Example sentence that the following cells tokenize and annotate.
sentence = "Obama won the 2008 Iowa caucuses while Hillary was leading in the polls."
# Largest n-gram size to generate (passed to get_all_ngrams).
ngram_up_to = 3

def get_all_ngrams(text,ngram_up_to):
    """Collect every word n-gram of a text, from unigrams up to a maximum size.

    Args:
        text: a string; it is tokenized with nltk.word_tokenize.
        ngram_up_to: an integer, the largest n-gram size to generate.
    Returns:
        A list of n-grams as space-joined strings, grouped by size
        (all unigrams first, then all bigrams, and so on).
    """

    words = nltk.word_tokenize(text)

    collected = []
    for size in range(1, ngram_up_to + 1):
        for gram in nltk.ngrams(words, size):
            collected.append(" ".join(gram))
    return collected
 
# Build every n-gram (up to ngram_up_to words) of the example sentence and preview the first 20.
ngrams = get_all_ngrams(sentence,ngram_up_to)
print (ngrams[:20]) 

['Obama', 'won', 'the', '2008', 'Iowa', 'caucuses', 'while', 'Hillary', 'was', 'leading', 'in', 'the', 'polls', '.', 'Obama won', 'won the', 'the 2008', '2008 Iowa', 'Iowa caucuses', 'caucuses while']


## Resources

<center><img src="Figures/wiki-size.png" height="50%" width="50%" align="center"></center>
<center><img src="Figures/size-gb.png" height="30%" width="30%" align="center"></center>

## Resources

In [3]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these resource files if they come from a trusted source.

# n-gram -> count; used as the numerator of the link probability in get_link_proba.
with open("Resources/overall_mentions_freq.pickle", "rb") as f:
    mentions_freq = pickle.load(f)

# n-gram -> count; used as the denominator of the link probability in get_link_proba.
with open("Resources/overall_ngrams_freq_cleaned.pickle", "rb") as f:
    ngrams_freq = pickle.load(f)
        
# Not referenced by any other visible cell; presumably entity -> frequency (TODO confirm).
with open("Resources/overall_entity_freq.pickle", "rb") as f:
    entity_freq = pickle.load(f)
        
# mention -> {entity: count}; indexed as mention_to_entities[mention][entity] in get_commoness.
with open("Resources/mention_overall_dict.pickle", "rb") as f:
    mention_to_entities = pickle.load(f)

# entity -> list of entity names; presumably the pages that link to it (TODO confirm).
with open("Resources/entity_overall_dict.pickle", "rb") as f:
    entity_inlinks = pickle.load(f)
    
print ("all ready!")

all ready!


In [4]:
# Peek at the first four entries stored for one entity.
entity_inlinks["Barack%20Obama"][:4]

['Abraham%20Lincoln', 'Alaska', 'Apollo%2011', 'The%20Amazing%20Spider-Man']

In [5]:
def get_link_proba(ngram):
    """Estimate how likely a ngram is to be used as an entity mention.

    The score is the ngram's count in the global mentions_freq dictionary
    divided by its count in the global ngrams_freq dictionary.

    Args:
        ngram: a string.
    Returns:
        The ratio of the two counts, or None when the ngram is missing from
        either the mentions_freq or the ngrams_freq dictionary.
    """

    try:
        # Look up both counts first so that only missing keys are caught here.
        as_mention, overall = mentions_freq[ngram], ngrams_freq[ngram]
    except KeyError:
        return None
    return as_mention / overall

# Example: link probability of the unigram "Obama".
get_link_proba("Obama")

0.007931134539123707

In [6]:
# Score every n-gram that is a key of ngrams_freq. get_link_proba can still
# return None here when the n-gram is missing from mentions_freq.
candidate_mentions = [(ngram, get_link_proba(ngram)) for ngram in ngrams if ngram in ngrams_freq]

print (candidate_mentions)

[('Obama', 0.007931134539123707), ('won', 0.0005864751645721125), ('the', 3.886926584799571e-07), ('2008', 0.009146341463414634), ('Iowa', 0.16056687851559645), ('caucuses', 0.029350104821802937), ('while', 3.0125018828136766e-06), ('Hillary', 0.012371862849063274), ('was', 8.64148583781452e-07), ('leading', 0.0003902267091300462), ('in', 2.4781263311631732e-06), ('the', 3.886926584799571e-07), ('polls', 0.005082005082005082), ('.', 2.4849563271031407e-06), ('won the', 1.5227885303567893e-05), ('Iowa caucuses', 0.41025641025641024)]


In [7]:
def remove_substrings(candidate_mentions):
    """Removes very short candidate mentions (<2 chars), candidates that have
    no score, and ngrams that are substrings of a more popular one.

    Args:
        candidate_mentions: list of tuples (candidate_mention (str), probability score (float or None)).
            A None score (which get_link_proba returns for unknown ngrams) means no
            probability could be computed; such candidates are dropped.
    Returns:
        A final list of mentions as strings, ordered by decreasing score.
        The input list is not modified.
    """

    # Dropping None scores up front also keeps the sort below from raising
    # TypeError when a None would be compared with a float.
    scored = [(cand, score) for cand, score in candidate_mentions
              if len(cand) > 1 and score is not None]

    # sorted() is stable, so candidates with equal scores keep their input order.
    scored = sorted(scored, key=lambda pair: pair[1], reverse=True)

    mentions = []
    for cand, _score in scored:
        # Character-level containment against the higher-scored mentions kept so far
        # (e.g. "in" is dropped once "leading" has been kept).
        if not any(cand in kept for kept in mentions):
            mentions.append(cand)
    return mentions
 
# Reduce the scored candidates to the final list of mention strings.
mentions = remove_substrings(candidate_mentions)

print (mentions)

['Iowa caucuses', 'Hillary', '2008', 'Obama', 'polls', 'won', 'leading', 'won the', 'while', 'was']


In [8]:
from scripts import output_helpers

# Display the entity candidates recorded for the mention "Hillary"
# (project helper; presumably sorted by count — TODO confirm).
output_helpers.show_top_entity_candidates("Hillary",mention_to_entities)

[['/wiki/Hillary_Clinton', 29],
 ['/wiki/Edmund_Hillary', 5],
 ['/wiki/Hilary_(name)', 1]]

## 2. Disambiguation via Commonness and Relatedness

<img src="Figures/tagme-alg.png" height="100%" width="100%" align="center">

In [9]:
def get_commoness(mention,entity_candidate):
    """Compute the share of a mention's occurrences that point to a given entity.

    Args:
        mention: a string, key of the global dictionaries mention_to_entities and mentions_freq.
        entity_candidate: a string, key of mention_to_entities[mention].
    Returns:
        A commoness score: the count stored for (mention, entity_candidate)
        divided by the count stored for the mention.
    """

    times_linked_to_entity = mention_to_entities[mention][entity_candidate]
    return times_linked_to_entity / mentions_freq[mention]

# Print the commoness of every entity candidate recorded for the mention "Hillary".
mention = "Hillary"
candidates = mention_to_entities[mention]

for cand in candidates:
    commoness = get_commoness(mention,cand)
    print (cand, commoness)

Hillary%20Clinton 0.8285714285714286
Edmund%20Hillary 0.14285714285714285
Hilary%20%28name%29 0.02857142857142857


In [10]:
import math
# Number of keys in entity_inlinks; get_relatedness uses it as the collection size.
n_all_entities = len(entity_inlinks)

def get_relatedness(e1, e2):
    """Compute Witten & Milne (AAAI, 2008) relatedness between two entities.

    With A and B the in-link sets of e1 and e2 and N = n_all_entities:

        rel = 1 - (log(max(|A|, |B|)) - log(|A & B|)) / (log(N) - log(min(|A|, |B|)))

    The raw value is negative whenever max(|A|,|B|) * min(|A|,|B|) > N * |A & B|
    (two heavily linked entities with few shared in-links), so it is clamped
    to honour the documented range.

    Args:
        e1,e2: two entities (keys of the global dictionary entity_inlinks).
    Returns:
        A relatedness score (between 0.0 and 1.0); 0.0 when the entities
        share no in-link.
    Raises:
        KeyError: if e1 or e2 is not a key of entity_inlinks.
    """

    links_a = set(entity_inlinks[e1])
    links_b = set(entity_inlinks[e2])

    shared = len(links_a & links_b)
    if shared == 0:
        # log(0) is undefined; no shared in-links means no evidence of relatedness.
        return 0.0

    larger = max(len(links_a), len(links_b))
    smaller = min(len(links_a), len(links_b))

    num = math.log(larger) - math.log(shared)
    den = math.log(n_all_entities) - math.log(smaller)
    rel = 1 - (num / den)

    # Clamp: the unclamped formula is not bounded below by 0.
    return min(1.0, max(0.0, rel))
            
# Compare the relatedness of "Barack Obama" to each of the two main "Hillary" candidates.
print (get_relatedness("Barack%20Obama","Hillary%20Clinton"))
print (get_relatedness("Barack%20Obama","Edmund%20Hillary"))

0.788401339278572
0.3054468828291247


In [11]:
def get_relevance_score(entity,other_mentions):
    """Compute relevance score for an entity given surrounding mentions

    Every other mention casts a vote: the average, over its candidate entities,
    of commoness(other_mention, candidate) * relatedness(entity, candidate).
    The relevance score is the sum of these votes.

    Args:
        entity: the entity under study.
        other_mentions: list of tuples (mention,[candidate entities])
    Returns:
        A relevance score (float). It is a sum over the other mentions and is
        not normalised by their number, so it is not restricted to 0.0-1.0.
        Mentions without any candidate entity contribute nothing.
    """

    relevance_score = 0.0

    for other_mention, other_cands in other_mentions:

        # A mention with no candidates cannot vote; skipping it also avoids
        # a division by zero when averaging.
        if not other_cands:
            continue

        vote = 0.0

        for other_cand in other_cands:
            rel = get_relatedness(entity,other_cand)
            comm = get_commoness(other_mention,other_cand)
            vote += comm * rel

        relevance_score += vote / len(other_cands)
    return relevance_score

# Score one candidate entity for "Hillary" against every other mention found in the sentence.
mention = "Hillary"
                
candidates = mention_to_entities[mention]

# Pair each remaining mention with its own candidate entities.
other_mentions = [(other_m, mention_to_entities[other_m]) for other_m in mentions if other_m != mention]

get_relevance_score("Hillary%20Clinton",other_mentions)

0.22436376077998585

In [12]:
def get_best_match(mention,candidates,other_mentions,thr= 0.03):
    """Retrieve best candidate match, given a mention in context

    Args:
        mention: a mention of an entity.
        candidates: an iterable of entity candidates for the mention.
        other_mentions: list of tuples (mention,[list of candidate entities for that mention])
        thr: optional variable, pruning for very uncommon candidate-mention pairs:
            only candidates whose commoness is strictly greater than thr are kept
            (default is 0.03)

    Returns:
        A tuple (mention, best_entity), where best_entity is the surviving
        candidate with the highest relevance score (the first one on ties),
        or None when no candidate survives the pruning.
    """
    survivors = [cand for cand in candidates if get_commoness(mention,cand)>thr]

    if not survivors:
        # Nothing left to rank; report the absence instead of failing on an empty list.
        return (mention, None)

    cand_scores = [(cand, get_relevance_score(cand,other_mentions)) for cand in survivors]

    # max() returns the first maximal item, matching a stable descending sort.
    best_entity, _best_score = max(cand_scores, key=lambda pair: pair[1])
    return (mention, best_entity)

# Disambiguate "Hillary" in the context of the other mentions of the sentence.
get_best_match(mention,candidates,other_mentions)

('Hillary', 'Hillary%20Clinton')

<img src="Figures/hillary.png" height="30%" width="30%" align="center">

<img src="Figures/hillary_aspects.png" height="100%" width="100%" align="center">

F. Nanni, S.P. Ponzetto and L. Dietz, "Entity-Aspect Linking", JCDL 2018, "Vannevar Bush" Best Paper Award


In [13]:
import json
from scripts import EAL
from IPython.core.display import display, HTML

# Link the mention to its best entity, then render the sentence with that link.
mention , entity = get_best_match(mention,candidates,other_mentions)

# The aspects file is looked up by the entity identifier exactly as returned above.
with open("Aspects/"+entity+".json") as json_file:   
    aspects = json.load(json_file)
    # Takes element [0][0] of the ranking; presumably the top-ranked aspect — TODO confirm EAL.rank's return shape.
    aspect = EAL.rank(sentence,aspects)[0][0]
    # NOTE(review): generate_wikilink presumably returns an HTML string, since it is passed to HTML() — confirm.
    output = output_helpers.generate_wikilink(sentence,mention,entity,aspect)
    display(HTML(output))

