In [1]:
import sys
# Put ./src/ on the module search path so packages stored there can be imported
# (presumably the project-local `model` package used below -- confirm against
# the repository layout). The path is relative, so it depends on the directory
# the kernel was started from.
sys.path.append('./src/')

In [2]:
import sqlite3

In [3]:
import nltk

In [4]:
import random
import numpy as np

In [5]:
from collections import defaultdict

In [6]:
from torchlite.torch.train_callbacks import ModelSaverCallback

from model.embedding import ModelVectorizer, OnDiskVectorizer
from model.arc2 import ARC2, PreConv

In [7]:
def tokenizer(text, alpha_only=True):
    """Split ``text`` into word tokens with ``nltk.word_tokenize``.

    Args:
        text: The string to tokenise.
        alpha_only: When true (the default) keep only tokens for which
            ``str.isalpha()`` holds, which drops numbers and punctuation.
            When false every token is returned unfiltered.

    Returns:
        A new list of token strings in their original order.
    """
    tokens = nltk.word_tokenize(text)
    if not alpha_only:
        # No filtering requested: hand back a fresh list of all tokens.
        return list(tokens)
    return [token for token in tokens if token.isalpha()]

In [8]:
class NEL:
    """Rank candidate entities for a mention by comparing sentence contexts.

    The constructor builds an on-disk word vectorizer, an ARC2 network with a
    ``PreConv`` front end (weights restored from a checkpoint and switched to
    eval mode) and a SQLite connection to a ``mentions`` table.

    All methods use only the instance's own state (``self``); they do not rely
    on any notebook-level variable holding the instance.
    """

    # Blank tokens appended to each tokenised sentence so the input is longer
    # than the convolution kernel.
    PAD_TOKENS = 6
    # Text that replaces the mention between its left and right context.
    MENTION_PLACEHOLDER = " XXXXX "
    # An entity needs at least this many stored contexts to become a candidate.
    MIN_CONTEXTS = 3
    # At most this many contexts are sampled per candidate entity.
    MAX_SAMPLES = 10
    # Number of best sentence scores averaged into the entity score.
    TOP_K = 3

    def __init__(self,
                 vectors_path='./data/fastText.nmy',
                 vectors_meta_path='./data/fastText.json',
                 model_path='./data/models/ARC2_best.pth',
                 mentions_db_path='./data/mentions.sqlite3'):
        """Load the vectorizer, restore the model and open the mentions database.

        Args:
            vectors_path: Passed to ``OnDiskVectorizer`` as ``mtx_path``.
            vectors_meta_path: Passed to ``OnDiskVectorizer`` as ``meta_path``.
            model_path: Checkpoint file handed to
                ``ModelSaverCallback.restore_model_from_file``.
            mentions_db_path: SQLite database file that holds the ``mentions``
                table.

        The defaults reproduce the previously hard-coded locations, so
        ``NEL()`` behaves as before.
        """
        net_params = {
            'preconv': True,
            'word_emb_sizes': [300],
            'preconv_size': [300],
            'matrix_depth': [120],
            'conv_depth': [120, 60, 60],
            'out_size': [60]
        }
        self.vectorizer = OnDiskVectorizer(mtx_path=vectors_path, meta_path=vectors_meta_path)
        preconv = PreConv(
            word_emb_sizes=net_params['word_emb_sizes'],
            sent_conv_size=net_params['preconv_size'],
            dropout=0.0,
            window=2
        )

        # The network receives already-vectorised input (see match_sentences),
        # so it is built without a vectorizer of its own.
        self.model = ARC2(
            vectorizer=None,
            preconv=preconv,
            matrix_depth=net_params['matrix_depth'],
            conv_depth=net_params['conv_depth'],
            out_size=net_params['out_size'],
            window=2,
            dropout=0.0
        )

        ModelSaverCallback.restore_model_from_file(self.model, model_path, load_with_cpu=True)
        self.model = self.model.eval()

        self.connection = sqlite3.connect(mentions_db_path)
        self.cur = self.connection.cursor()

    def close(self):
        """Close the SQLite cursor and connection opened by the constructor."""
        self.cur.close()
        self.connection.close()

    def match_sentences(self, sent_a, sent_b):
        """Score how well two sentences match.

        Both sentences are tokenised with ``tokenizer`` (alphabetic tokens
        only), padded with ``PAD_TOKENS`` blank tokens, vectorised and passed
        to the model.

        Returns:
            Element ``[0][1]`` of the model output as a Python number
            (presumably the score of the "match" class -- confirm against the
            ARC2 training code).
        """
        padding = [' '] * self.PAD_TOKENS  # ensure data size > conv kernel size
        sent_a_token = tokenizer(sent_a) + padding
        sent_b_token = tokenizer(sent_b) + padding

        sent_a_vect = self.vectorizer.convert([sent_a_token])
        sent_b_vect = self.vectorizer.convert([sent_b_token])

        score = self.model.forward(sent_a_vect, sent_b_vect)

        return score[0][1].item()

    def create_reference_set(self, mention):
        """Collect example contexts for every entity stored under ``mention``.

        Queries up to 1000 rows of the ``mentions`` table matching ``mention``.
        Each row is unpacked as ``(entity, left_context, mention,
        right_context)``; this relies on the table's column order, which is not
        visible here.

        Returns:
            A dict mapping each entity that has at least ``MIN_CONTEXTS`` rows
            to a list of at most ``MAX_SAMPLES`` randomly sampled context
            strings, each built as left context + ``MENTION_PLACEHOLDER`` +
            right context. Sampling uses the global ``random`` state, so the
            selection varies between calls unless that state is seeded.
        """
        self.cur.execute("select * from mentions where mention match ? limit 1000", (mention, ))

        contexts_by_entity = defaultdict(list)
        # Read from this instance's cursor and keep the row's mention text in
        # its own name so the ``mention`` argument is not overwritten.
        for entity, left_context, _row_mention, right_context in self.cur.fetchall():
            contexts_by_entity[entity].append((left_context, right_context))

        result = {}
        for entity, contexts in contexts_by_entity.items():
            if len(contexts) >= self.MIN_CONTEXTS:
                sampled = random.sample(contexts, min(self.MAX_SAMPLES, len(contexts)))
                result[entity] = [
                    left + self.MENTION_PLACEHOLDER + right for left, right in sampled
                ]

        return result

    def match_all(self, left_context, mention, right_context, sentences):
        """Score a mention's context against a set of reference sentences.

        Args:
            left_context: Text before the mention.
            mention: The mention itself; accepted for symmetry but not used,
                because the mention is replaced by ``MENTION_PLACEHOLDER``.
            right_context: Text after the mention.
            sentences: Reference sentences to compare against.

        Returns:
            The mean of the ``TOP_K`` highest ``match_sentences`` scores (of
            all scores when there are fewer than ``TOP_K`` sentences).
        """
        ref_sent = left_context + self.MENTION_PLACEHOLDER + right_context
        all_scores = [self.match_sentences(ref_sent, sent) for sent in sentences]

        return np.mean(sorted(all_scores, reverse=True)[:self.TOP_K])

    def disabiguate(self, left_context, mention, right_context):
        """Print one ``entity score`` line per candidate entity for ``mention``.

        Candidates come from ``create_reference_set`` and each is scored with
        ``match_all``. Nothing is returned.
        """
        candidates = self.create_reference_set(mention)

        for entity, sents in candidates.items():
            score = self.match_all(left_context, mention, right_context, sents)
            print(entity, score)

    # Correctly spelled alias; the original name is kept for existing callers.
    disambiguate = disabiguate
        

In [9]:
%%time
# Build the linker once and reuse it in later cells: the constructor creates
# the on-disk vectorizer, restores the ARC2 checkpoint and connects to the
# mentions SQLite database.
nel = NEL()


--- Model restored ---

CPU times: user 2.91 s, sys: 432 ms, total: 3.34 s
Wall time: 3.34 s


In [12]:
# Example query: the mention plus the text on either side of it.
left_context = "American socialite"
mention = "Paris"
right_context = "arrived in Turkish Cyprus on Aug. 4 for an appearance to promote her new perfume."

In [13]:
# Score every candidate entity for the example mention; one "entity score"
# line is printed per candidate.
nel.disabiguate(
    left_context=left_context,
    mention=mention,
    right_context=right_context,
)

Paris_Peace_Treaties,_1947 0.229919329286
Paris 0.364550327261
Paris_Peace_Conference 0.922771235307
Paris–Nice 0.388005521148
fr:Paris 0.198767289519
Paris_Hilton 0.987708667914
Catacombs_of_Paris 0.0474469698966
Île-de-France 0.0764339289938
Paris,_Ontario 0.193740231295
Paris_syndrome 0.246216257413
Panth%C3%A9on,_Paris 0.369571497043
From_Paris_with_Love_(film) 0.606250623862
Exposition_Internationale_des_Arts_et_Techniques_dans_la_Vie_Moderne 0.151203212639
University_of_Paris 0.225954669217
Notre_Dame_de_Paris 0.551950911681
Paris_Peace_Accords 0.120936058462
International_Exposition_(1867) 0.017266121693
Fashion_week 0.597838769356
Paris_(Malcolm_McLaren_album) 0.101084103187
Paris_Saint-Germain_F.C. 0.542952239513
Paris_sewers 0.0240961884459
Paris_Combo 0.56066891551
Dakar_Rally 0.0605802647769
Biennale_de_Paris 0.0169372512028
Paris–Brest 0.175204294423
Paris_Sewer_Museum 0.1134365201
15th_arrondissement_of_Paris 0.169871528943
Fashion%20week 0.735459824403
American_Universit