## Create Training data for Mapping between Wikipedia2vec (wordpiece) and BERT embeddings

In [1]:
import json
import torch
import torch.nn as nn

import logging
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaForMaskedLM

In [2]:
embeddings_dict = {}
with open("/data_ssds/disk11/slinzbach/enwiki_20180420_500d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        token = values[0]
        vector = np.asarray(values[-500:], "float32")
        embeddings_dict[token] = vector

In [3]:
len(embeddings_dict)

4529833

In [4]:
entities = {}
wordpiece = {}

for key in embeddings_dict.keys():
    
    if "ENTITY/" in key:
        entities[key] = embeddings_dict[key]
        
    else:
        wordpiece[key] = embeddings_dict[key]

In [None]:
from datasets import Dataset

ds = Dataset.from_dict({"Entity": [1]*len(wordpiece), 
                        "label": list(wordpiece.keys()),
                        "embeddings": list(wordpiece.values()) })
ds[0]

In [None]:
ds.add_faiss_index(column='embeddings')

In [12]:
ds.save_to_disk('/data_ssds/disk11/slinzbach/words_faiss')

Saving the dataset (0/8 shards):   0%|          | 0/1937423 [00:00<?, ? examples/s]

In [55]:
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
entities_vecs = np.stack([v for v in entities.values()])

In [5]:
len(entities)

2592410

In [5]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [11]:
len(tokenizer.get_vocab().keys())

50265

In [13]:
letters = set.union(*[set(k) for k in tokenizer.get_vocab()])
letters

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '¡',
 '¢',
 '£',
 '¤',
 '¥',
 '¦',
 '§',
 '¨',
 '©',
 'ª',
 '«',
 '¬',
 '®',
 '¯',
 '°',
 '±',
 '²',
 '³',
 '´',
 'µ',
 '¶',
 '·',
 '¸',
 '¹',
 'º',
 '»',
 '¼',
 '½',
 '¾',
 '¿',
 'À',
 'Á',
 'Â',
 'Ã',
 'Ä',
 'Å',
 'Æ',
 'Ç',
 'È',
 'É',
 'Ê',
 'Ë',
 'Ì',
 'Í',
 'Î',
 'Ï',
 'Ð',
 'Ñ',
 'Ò',
 'Ó',
 'Ô',
 'Õ',
 'Ö',
 '×',
 'Ø',
 'Ù',
 'Ú',
 'Û',
 'Ü',
 'Ý',
 'Þ',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê

In [6]:
common_vocab = []
for tok in tokenizer.get_vocab().keys():
    tok = tok.replace('Ġ', '')
    if "ENTITY/"+tok in entities:
        common_vocab.append("ENTITY/"+tok)
    elif tok.lower() in wordpiece:
        common_vocab.append(tok)
    else:
        common_vocab.append(None)

In [7]:
bert = RobertaForMaskedLM.from_pretrained('roberta-base')

In [56]:
bert.roberta.embeddings

RobertaEmbeddings(
  (word_embeddings): Embedding(50265, 768, padding_idx=1)
  (position_embeddings): Embedding(514, 768, padding_idx=1)
  (token_type_embeddings): Embedding(1, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [36]:
decoder_lm2wiki = nn.Linear(500, 768, bias = False)

In [48]:
decoder_wiki2lm = nn.Linear(768, 500, bias = False)
decoder_wiki2lm.weight = torch.nn.Parameter(decoder_lm2wiki.weight.T)

In [49]:
decoder_wiki2lm.weight

Parameter containing:
tensor([[ 0.0154, -0.0111,  0.0193,  ..., -0.0035,  0.0346, -0.0404],
        [-0.0065,  0.0133,  0.0286,  ...,  0.0333, -0.0361, -0.0017],
        [ 0.0084,  0.0123,  0.0040,  ..., -0.0288,  0.0225,  0.0309],
        ...,
        [ 0.0194,  0.0085, -0.0365,  ...,  0.0398, -0.0375,  0.0289],
        [ 0.0301,  0.0250, -0.0440,  ..., -0.0376, -0.0370,  0.0403],
        [-0.0269,  0.0027, -0.0163,  ..., -0.0293, -0.0301,  0.0427]],
       requires_grad=True)

In [50]:

with torch.no_grad():
    decoder_lm2wiki.weight[:1,:1] = 0

In [52]:
decoder_lm2wiki.weight

Parameter containing:
tensor([[ 0.0000, -0.0065,  0.0084,  ...,  0.0194,  0.0301, -0.0269],
        [-0.0111,  0.0133,  0.0123,  ...,  0.0085,  0.0250,  0.0027],
        [ 0.0193,  0.0286,  0.0040,  ..., -0.0365, -0.0440, -0.0163],
        ...,
        [-0.0035,  0.0333, -0.0288,  ...,  0.0398, -0.0376, -0.0293],
        [ 0.0346, -0.0361,  0.0225,  ..., -0.0375, -0.0370, -0.0301],
        [-0.0404, -0.0017,  0.0309,  ...,  0.0289,  0.0403,  0.0427]],
       requires_grad=True)

In [53]:
decoder_wiki2lm.weight

Parameter containing:
tensor([[ 0.0000, -0.0111,  0.0193,  ..., -0.0035,  0.0346, -0.0404],
        [-0.0065,  0.0133,  0.0286,  ...,  0.0333, -0.0361, -0.0017],
        [ 0.0084,  0.0123,  0.0040,  ..., -0.0288,  0.0225,  0.0309],
        ...,
        [ 0.0194,  0.0085, -0.0365,  ...,  0.0398, -0.0375,  0.0289],
        [ 0.0301,  0.0250, -0.0440,  ..., -0.0376, -0.0370,  0.0403],
        [-0.0269,  0.0027, -0.0163,  ..., -0.0293, -0.0301,  0.0427]],
       requires_grad=True)

In [9]:
i_embeddings = None
for p in bert.roberta.embeddings.word_embeddings.parameters():
    i_embeddings = p
    break


In [10]:
i_embeddings.size()

torch.Size([50265, 768])

In [16]:
common_vocab[8909:]

['MD',
 'immigrant',
 'ENTITY/Construction',
 'Born',
 None,
 'Wade',
 'visa',
 'genuine',
 'electronics',
 'Sat',
 'sponsors',
 'ENTITY/Montana',
 'spell',
 'ENTITY/Sachs',
 'Et',
 'foster',
 'locker',
 'explaining',
 'Age',
 'gunman',
 'sauce',
 'cry',
 'stimulus',
 'array',
 'compare',
 'boats',
 'ext',
 None,
 'Ast',
 'Parks',
 'ester',
 '94',
 'relating',
 'vegetables',
 'accountable',
 'hyper',
 'Wim',
 'newest',
 'ENTITY/Rome',
 'ENTITY/Chancellor',
 'ENTITY/CBS',
 'businessman',
 'ENTITY/Delaware',
 'lands',
 'court',
 'aria',
 'approaching',
 'cker',
 'ENTITY/Salt',
 'Mak',
 'treating',
 'subsequently',
 'ENTITY/Ell',
 None,
 'ENTITY/180',
 'determination',
 'Salman',
 'Joel',
 'classified',
 'span',
 'earthquake',
 'ranked',
 '96',
 'ENTITY/Tiger',
 'advocacy',
 'mit',
 'colleges',
 'Yeah',
 'ENTITY/Captain',
 'orange',
 'projections',
 'electrical',
 'MA',
 'olog',
 'Newcastle',
 None,
 'representation',
 'lawsuits',
 'just',
 'aced',
 'Race',
 'Aqu',
 'Bills',
 'exclusively

In [14]:
mapping_train_data = []
for i, word in enumerate(common_vocab):
    
    if not word:
        continue
    
    if "ENTITY/" in word:
        w2v_emb = entities[word]
    else:
        w2v_emb = wordpiece[word.lower()]
        
    bert_emb = torch.squeeze(torch.squeeze(i_embeddings[i], dim = 0),dim = 0).detach().numpy()
    print("bert norm", np.linalg.norm(bert_emb))
    print("w2v_emb norm", np.linalg.norm(w2v_emb))
    #bert_emb = bert_emb/np.linalg.norm(bert_emb)
    #w2v_emb = w2v_emb/np.linalg.norm(w2v_emb)
    print(word)
    data_point={'word':word, 'wikipedia2vec':w2v_emb.tolist(), 'bert': bert_emb.tolist()}
    mapping_train_data.append(data_point)

len(mapping_train_data)

bert norm 2.295584
w2v_emb norm 2.3358235
the
bert norm 2.4234285
w2v_emb norm 2.8124087
to
bert norm 2.458268
w2v_emb norm 2.160507
and
bert norm 2.373503
w2v_emb norm 2.644827
of
bert norm 2.432008
w2v_emb norm 2.7094784
a
bert norm 2.3928869
w2v_emb norm 2.5782495
in
bert norm 2.44866
w2v_emb norm 2.9037478
for
bert norm 2.368081
w2v_emb norm 2.7709205
that
bert norm 2.4771185
w2v_emb norm 3.0100698
on
bert norm 2.3171222
w2v_emb norm 3.0193682
is
bert norm 2.438728
w2v_emb norm 2.7848876
with
bert norm 2.246134
w2v_emb norm 8.275168
ENTITY/The
bert norm 2.3470695
w2v_emb norm 2.935463
was
bert norm 2.529669
w2v_emb norm 3.308615
at
bert norm 2.4053588
w2v_emb norm 2.8312457
it
bert norm 2.5120802
w2v_emb norm 2.811539
as
bert norm 2.5908916
w2v_emb norm 3.42975
said
bert norm 2.7013094
w2v_emb norm 4.2591386
Ļ
bert norm 2.5852978
w2v_emb norm 3.3160017
be
bert norm 2.9197552
w2v_emb norm 3.9846184
s
bert norm 2.5999954
w2v_emb norm 3.0072646
by
bert norm 2.590858
w2v_emb norm 3.015

w2v_emb norm 6.0762596
ENTITY/25
bert norm 3.4294758
w2v_emb norm 4.5914326
land
bert norm 3.2297616
w2v_emb norm 3.290288
example
bert norm 2.9957275
w2v_emb norm 4.250212
authorities
bert norm 3.3550773
w2v_emb norm 4.1216173
date
bert norm 2.903838
w2v_emb norm 3.93676
ended
bert norm 3.4179976
w2v_emb norm 3.258799
all
bert norm 3.3974042
w2v_emb norm 8.680755
ENTITY/Reuters
bert norm 2.891892
w2v_emb norm 4.585013
businesses
bert norm 3.430973
w2v_emb norm 5.292891
ans
bert norm 3.1072018
w2v_emb norm 4.281964
details
bert norm 3.5090587
w2v_emb norm 4.5585527
ground
bert norm 2.9130442
w2v_emb norm 4.335264
pretty
bert norm 3.1980975
w2v_emb norm 3.3882682
ENTITY/Apple
bert norm 3.1604936
w2v_emb norm 5.4046965
ation
bert norm 2.9943008
w2v_emb norm 3.7822723
Smith
bert norm 3.2221107
w2v_emb norm 10.111975
ENTITY/Company
bert norm 2.9200857
w2v_emb norm 7.794873
ENTITY/Florida
bert norm 3.2180526
w2v_emb norm 5.1413093
drug
bert norm 3.1103508
w2v_emb norm 3.9193423
response
ber

bert norm 3.3143291
w2v_emb norm 4.8546414
Ms
bert norm 3.4692912
w2v_emb norm 4.353424
word
bert norm 3.5070336
w2v_emb norm 3.5300364
over
bert norm 2.9841309
w2v_emb norm 4.6817555
brother
bert norm 3.0067732
w2v_emb norm 4.0264635
necessary
bert norm 2.8216634
w2v_emb norm 3.3199003
eventually
bert norm 3.3115864
w2v_emb norm 10.171069
ENTITY/Star
bert norm 3.1449044
w2v_emb norm 4.5027895
send
bert norm 3.0361006
w2v_emb norm 4.668484
boy
bert norm 3.5714757
w2v_emb norm 5.3807216
Rs
bert norm 3.1788907
w2v_emb norm 4.1250362
remember
bert norm 2.785026
w2v_emb norm 3.4030986
21
bert norm 3.332037
w2v_emb norm 5.6939135
climate
bert norm 3.4435596
w2v_emb norm 4.8344
capacity
bert norm 3.4100208
w2v_emb norm 4.6096225
responsible
bert norm 3.1882706
w2v_emb norm 4.6089993
Matt
bert norm 3.1428287
w2v_emb norm 4.31078
month
bert norm 3.0436125
w2v_emb norm 4.8499246
suffered
bert norm 3.5821562
w2v_emb norm 6.1140237
og
bert norm 3.075008
w2v_emb norm 4.2434807
Peter
bert norm 3.20

bert norm 3.176454
w2v_emb norm 7.538668
ENTITY/Scotland
bert norm 3.7308326
w2v_emb norm 5.0603123
round
bert norm 3.8411267
w2v_emb norm 5.6594124
ith
bert norm 3.3166041
w2v_emb norm 4.4713545
breaking
bert norm 3.4143505
w2v_emb norm 5.8420396
voting
bert norm 3.1832745
w2v_emb norm 4.7515655
producer
bert norm 3.2126741
w2v_emb norm 11.052486
ENTITY/Love
bert norm 3.027877
w2v_emb norm 4.50513
remove
bert norm 3.6056376
w2v_emb norm 5.8436317
PA
bert norm 3.394309
w2v_emb norm 4.9609904
asset
bert norm 2.9534483
w2v_emb norm 4.437573
requires
bert norm 3.461178
w2v_emb norm 4.8853865
signing
bert norm 3.7976816
w2v_emb norm 5.1918936
ages
bert norm 2.7878768
w2v_emb norm 4.356736
impressive
bert norm 3.2835624
w2v_emb norm 4.896492
Irish
bert norm 3.3320696
w2v_emb norm 4.92522
authority
bert norm 3.3365135
w2v_emb norm 5.050126
ruled
bert norm 3.200198
w2v_emb norm 4.4624386
aimed
bert norm 3.1689875
w2v_emb norm 4.5389647
captain
bert norm 3.6633332
w2v_emb norm 5.521377
AG
bert

bert norm 3.4539208
w2v_emb norm 11.303939
ENTITY/Memorial
bert norm 3.5267358
w2v_emb norm 5.0934043
wave
bert norm 3.1765416
w2v_emb norm 4.4189963
fears
bert norm 3.0898993
w2v_emb norm 4.682375
kid
bert norm 3.1972117
w2v_emb norm 5.2691474
Giants
bert norm 3.2174737
w2v_emb norm 4.507072
recovered
bert norm 3.9486482
w2v_emb norm 5.230972
row
bert norm 3.495981
w2v_emb norm 10.169835
ENTITY/Radio
bert norm 3.1965997
w2v_emb norm 8.204132
ENTITY/Barcelona
bert norm 2.6822906
w2v_emb norm 4.3624296
wonderful
bert norm 3.71298
w2v_emb norm 5.2737474
Dow
bert norm 3.536685
w2v_emb norm 4.884449
stream
bert norm 3.328083
w2v_emb norm 4.454031
Simon
bert norm 3.4247267
w2v_emb norm 4.2413945
detail
bert norm 3.2741938
w2v_emb norm 4.4592385
volunteers
bert norm 3.5723298
w2v_emb norm 6.5623784
Ind
bert norm 3.3501508
w2v_emb norm 4.3774586
forms
bert norm 3.551913
w2v_emb norm 4.544881
mann
bert norm 3.3548763
w2v_emb norm 4.8659344
Ray
bert norm 3.8120244
w2v_emb norm 6.860204
oor
bert

bert norm 3.4075253
w2v_emb norm 4.5511303
locked
bert norm 3.598574
w2v_emb norm 5.8436317
PA
bert norm 3.260253
w2v_emb norm 4.879338
landed
bert norm 3.6259346
w2v_emb norm 5.0402117
length
bert norm 3.0517988
w2v_emb norm 4.362589
boosted
bert norm 3.162899
w2v_emb norm 4.44063
purchases
bert norm 3.6498609
w2v_emb norm 4.9395304
command
bert norm 3.7047021
w2v_emb norm 4.000013
Asked
bert norm 3.316967
w2v_emb norm 4.875257
spaces
bert norm 2.9168994
w2v_emb norm 4.222538
iconic
bert norm 3.4537761
w2v_emb norm 4.578525
recommend
bert norm 3.3590765
w2v_emb norm 5.013017
duties
bert norm 3.3206363
w2v_emb norm 4.627898
seized
bert norm 3.2175806
w2v_emb norm 4.5053754
delayed
bert norm 3.6995773
w2v_emb norm 6.284594
FA
bert norm 3.7215316
w2v_emb norm 2.160507
AND
bert norm 3.7167294
w2v_emb norm 5.6852193
daq
bert norm 3.4079938
w2v_emb norm 4.518485
hiring
bert norm 3.0588734
w2v_emb norm 4.6848907
occur
bert norm 3.58889
w2v_emb norm 5.3834662
DC
bert norm 3.5786493
w2v_emb no

bert norm 3.8063085
w2v_emb norm 4.4952292
core
bert norm 3.4831226
w2v_emb norm 5.0314703
gallery
bert norm 3.3085978
w2v_emb norm 4.7797856
founder
bert norm 3.7045224
w2v_emb norm 9.023434
ENTITY/Vill
bert norm 2.9663754
w2v_emb norm 4.604864
decent
bert norm 3.3665273
w2v_emb norm 9.008939
ENTITY/History
bert norm 3.4618528
w2v_emb norm 5.5831075
Int
bert norm 3.6804297
w2v_emb norm 5.89472
Na
bert norm 3.5518465
w2v_emb norm 3.2787092
Had
bert norm 3.2130198
w2v_emb norm 4.8242335
mainstream
bert norm 3.7896793
w2v_emb norm 5.3407383
Ts
bert norm 3.2363381
w2v_emb norm 4.664545
bottle
bert norm 3.9196327
w2v_emb norm 5.5481834
sen
bert norm 3.2971017
w2v_emb norm 5.39775
recession
bert norm 3.239166
w2v_emb norm 5.2656336
sophomore
bert norm 3.5880694
w2v_emb norm 4.508199
silence
bert norm 3.7434397
w2v_emb norm 5.2835503
cc
bert norm 3.496919
w2v_emb norm 5.8670745
qualifying
bert norm 3.1694195
w2v_emb norm 4.3145175
complained
bert norm 3.5032578
w2v_emb norm 5.2996635
Rad
ber

bert norm 3.932546
w2v_emb norm 5.94839
hing
bert norm 3.9220355
w2v_emb norm 4.654185
chen
bert norm 3.4651208
w2v_emb norm 4.4540563
differently
bert norm 3.1317725
w2v_emb norm 6.3198524
championships
bert norm 3.6974275
w2v_emb norm 5.0328617
Eng
bert norm 3.4519932
w2v_emb norm 4.0566745
NO
bert norm 3.5906875
w2v_emb norm 5.5245066
Auto
bert norm 3.3793292
w2v_emb norm 5.962156
Erdogan
bert norm 3.6754122
w2v_emb norm 3.1421769
iding
bert norm 3.645834
w2v_emb norm 5.320926
warming
bert norm 3.535865
w2v_emb norm 5.0951815
civilian
bert norm 3.6747572
w2v_emb norm 11.026324
ENTITY/Dam
bert norm 3.4177766
w2v_emb norm 5.281484
fantasy
bert norm 3.7279925
w2v_emb norm 6.0794535
Nav
bert norm 3.4164674
w2v_emb norm 4.122686
Drew
bert norm 3.22968
w2v_emb norm 4.4036403
Nancy
bert norm 3.1967943
w2v_emb norm 4.6311603
trapped
bert norm 3.360697
w2v_emb norm 10.438747
ENTITY/Russians
bert norm 3.6000588
w2v_emb norm 5.9190383
IC
bert norm 3.279228
w2v_emb norm 4.688023
flexibility
ber

bert norm 3.197355
w2v_emb norm 3.773922
initiated
bert norm 3.8352945
w2v_emb norm 5.9747424
Ku
bert norm 3.5624151
w2v_emb norm 8.4611635
ENTITY/Florence
bert norm 3.901886
w2v_emb norm 7.6614227
yd
bert norm 3.633981
w2v_emb norm 4.6460824
Fast
bert norm 3.0225317
w2v_emb norm 4.674326
musician
bert norm 3.4256885
w2v_emb norm 7.7527823
ENTITY/Chile
bert norm 3.811371
w2v_emb norm 6.63649
anga
bert norm 3.318079
w2v_emb norm 5.067562
dairy
bert norm 3.2822351
w2v_emb norm 4.929578
contractors
bert norm 4.0290756
w2v_emb norm 5.9335403
ador
bert norm 3.4494255
w2v_emb norm 10.692319
ENTITY/Planning
bert norm 3.3817945
w2v_emb norm 5.192912
ultra
bert norm 3.4789672
w2v_emb norm 5.0372896
prayer
bert norm 3.1946933
w2v_emb norm 4.0565825
suggestions
bert norm 3.7909794
w2v_emb norm 6.332353
Ek
bert norm 3.30642
w2v_emb norm 5.3612967
random
bert norm 3.4599102
w2v_emb norm 4.339184
Sullivan
bert norm 3.3335507
w2v_emb norm 5.3524156
sensor
bert norm 3.4555671
w2v_emb norm 5.5047398
ho

bert norm 3.8451111
w2v_emb norm 3.5610943
ENTITY/DR
bert norm 3.596533
w2v_emb norm 5.1656804
dip
bert norm 3.4541829
w2v_emb norm 5.4552355
RAM
bert norm 3.6223433
w2v_emb norm 4.8396416
Christie
bert norm 3.1025155
w2v_emb norm 4.260114
argues
bert norm 3.6966748
w2v_emb norm 4.574855
EX
bert norm 3.3591826
w2v_emb norm 3.689119
Nine
bert norm 3.9748077
w2v_emb norm 10.186512
ENTITY/Scroll
bert norm 3.2672665
w2v_emb norm 2.7156942
THIS
bert norm 3.551912
w2v_emb norm 5.502482
Pro
bert norm 3.3688629
w2v_emb norm 4.743278
keys
bert norm 3.4576018
w2v_emb norm 5.7516565
processor
bert norm 3.458502
w2v_emb norm 5.3452225
scam
bert norm 3.4788358
w2v_emb norm 10.329397
ENTITY/Training
bert norm 3.343461
w2v_emb norm 4.78218
honey
bert norm 3.6330738
w2v_emb norm 6.1895466
ENTITY/Ĵ
bert norm 3.517541
w2v_emb norm 5.392401
facebook
bert norm 3.6012616
w2v_emb norm 4.488465
Legal
bert norm 3.5312054
w2v_emb norm 4.6845374
aging
bert norm 3.213962
w2v_emb norm 4.868976
spiritual
bert norm

bert norm 3.5245116
w2v_emb norm 4.8700213
humor
bert norm 3.2641175
w2v_emb norm 4.9650507
needing
bert norm 3.3361146
w2v_emb norm 5.7095203
midterm
bert norm 3.836837
w2v_emb norm 11.245685
ENTITY/Oval
bert norm 3.6313636
w2v_emb norm 4.681668
corners
bert norm 3.495885
w2v_emb norm 5.8544927
tablets
bert norm 3.9357352
w2v_emb norm 5.737472
eds
bert norm 4.146097
w2v_emb norm 5.6989903
vere
bert norm 3.5753784
w2v_emb norm 4.8219757
attacker
bert norm 3.3418548
w2v_emb norm 4.285258
Paul
bert norm 4.020181
w2v_emb norm 5.6535435
pee
bert norm 3.2498403
w2v_emb norm 4.440408
Alice
bert norm 2.9326448
w2v_emb norm 4.151047
renowned
bert norm 3.388231
w2v_emb norm 4.953359
09
bert norm 3.6002774
w2v_emb norm 5.221168
creditors
bert norm 3.6666062
w2v_emb norm 10.392401
ENTITY/Pedro
bert norm 3.5144591
w2v_emb norm 4.961973
Phone
bert norm 3.4282763
w2v_emb norm 5.1979437
surveys
bert norm 3.4645565
w2v_emb norm 5.063035
Welsh
bert norm 3.4887667
w2v_emb norm 4.929612
cow
bert norm 3.3

bert norm 3.494216
w2v_emb norm 4.002091
ENTITY/HP
bert norm 3.724267
w2v_emb norm 5.0075293
Roll
bert norm 3.794628
w2v_emb norm 4.5535836
Fay
bert norm 3.647901
w2v_emb norm 5.071343
Clare
bert norm 3.572147
w2v_emb norm 5.6249514
haul
bert norm 3.7053885
w2v_emb norm 5.183664
riot
bert norm 3.5357184
w2v_emb norm 5.692186
settlements
bert norm 3.3846195
w2v_emb norm 4.928107
norm
bert norm 3.4380162
w2v_emb norm 4.6931
accelerated
bert norm 3.864195
w2v_emb norm 7.8677955
Lok
bert norm 2.9706893
w2v_emb norm 4.345789
clever
bert norm 3.6703684
w2v_emb norm 5.8617744
hyd
bert norm 3.2230382
w2v_emb norm 6.897449
stats
bert norm 3.5734177
w2v_emb norm 5.254172
Hull
bert norm 3.7482297
w2v_emb norm 7.017179
kers
bert norm 3.4874895
w2v_emb norm 4.666345
buys
bert norm 4.1229177
w2v_emb norm 4.2403297
uter
bert norm 3.549327
w2v_emb norm 6.865173
fue
bert norm 3.4010146
w2v_emb norm 5.4022703
https
bert norm 3.9224129
w2v_emb norm 6.72292
UD
bert norm 3.558939
w2v_emb norm 4.6221247
iso

w2v_emb norm 6.7719293
Gol
bert norm 3.6890738
w2v_emb norm 5.4777646
Techn
bert norm 4.2054834
w2v_emb norm 5.458852
lis
bert norm 3.5553062
w2v_emb norm 4.7143593
orientation
bert norm 4.1070814
w2v_emb norm 9.112274
ENTITY/Arri
bert norm 3.653641
w2v_emb norm 5.591292
PG
bert norm 4.17792
w2v_emb norm 4.286972
ross
bert norm 3.5226169
w2v_emb norm 5.5033197
sank
bert norm 4.180595
w2v_emb norm 4.935818
LOS
bert norm 3.4425776
w2v_emb norm 4.477438
Allison
bert norm 3.2628448
w2v_emb norm 4.883606
smiles
bert norm 3.9794273
w2v_emb norm 5.2593417
USD
bert norm 3.629891
w2v_emb norm 5.5271225
kits
bert norm 3.6759117
w2v_emb norm 10.649564
ENTITY/Bar
bert norm 3.7367811
w2v_emb norm 5.9515343
Bri
bert norm 3.6062543
w2v_emb norm 6.2499185
ounces
bert norm 3.7608755
w2v_emb norm 5.9968534
Nielsen
bert norm 4.1146865
w2v_emb norm 5.4320874
eno
bert norm 3.2072175
w2v_emb norm 4.3940935
109
bert norm 3.323852
w2v_emb norm 4.9068813
norms
bert norm 3.5904992
w2v_emb norm 4.6356487
skip
be

bert norm 3.4285426
w2v_emb norm 5.1420336
consortium
bert norm 4.0992336
w2v_emb norm 6.054444
obi
bert norm 3.5426805
w2v_emb norm 11.070008
ENTITY/Monster
bert norm 4.0073366
w2v_emb norm 5.2139325
arks
bert norm 3.879094
w2v_emb norm 3.7934892
turn
bert norm 3.5276406
w2v_emb norm 4.591985
sketch
bert norm 3.4072282
w2v_emb norm 4.4656806
predicting
bert norm 3.3179774
w2v_emb norm 5.0763674
minimize
bert norm 3.314865
w2v_emb norm 4.7681293
Ethan
bert norm 4.0697885
w2v_emb norm 5.1173425
anson
bert norm 3.5884898
w2v_emb norm 5.0246053
Adjusted
bert norm 3.566746
w2v_emb norm 5.923488
Hornets
bert norm 3.650975
w2v_emb norm 4.992916
NZ
bert norm 3.5980222
w2v_emb norm 4.536018
Kathleen
bert norm 3.7600927
w2v_emb norm 5.7111487
Kier
bert norm 3.7873306
w2v_emb norm 5.490627
Mercury
bert norm 3.4893515
w2v_emb norm 4.707493
ghost
bert norm 3.827203
w2v_emb norm 5.888342
haw
bert norm 3.7626297
w2v_emb norm 9.934489
ENTITY/Demand
bert norm 3.4198904
w2v_emb norm 4.507626
Collection

bert norm 4.029734
w2v_emb norm 7.4029202
ENTITY/Ans
bert norm 3.7848253
w2v_emb norm 5.437637
whistle
bert norm 3.4418714
w2v_emb norm 4.320875
symbolic
bert norm 3.634044
w2v_emb norm 4.8865275
possessions
bert norm 3.7562168
w2v_emb norm 4.8006883
Driver
bert norm 3.67424
w2v_emb norm 5.7837863
bracket
bert norm 3.9820704
w2v_emb norm 11.185498
ENTITY/Reign
bert norm 3.825624
w2v_emb norm 6.82102
oji
bert norm 3.699796
w2v_emb norm 4.657217
oct
bert norm 3.5052679
w2v_emb norm 5.474381
tube
bert norm 3.4822688
w2v_emb norm 4.7301593
Felix
bert norm 3.6377225
w2v_emb norm 4.9914994
translated
bert norm 3.2739294
w2v_emb norm 3.7181404
promptly
bert norm 3.595713
w2v_emb norm 9.500163
ENTITY/Ernest
bert norm 4.1277523
w2v_emb norm 6.8869214
arth
bert norm 3.3110504
w2v_emb norm 4.8496485
dumb
bert norm 3.3610718
w2v_emb norm 4.7279325
influences
bert norm 3.698474
w2v_emb norm 3.701701
taking
bert norm 3.5015173
w2v_emb norm 6.4961405
privat
bert norm 3.394549
w2v_emb norm 6.176595
ma

bert norm 3.4862924
w2v_emb norm 4.3042474
signaled
bert norm 3.9381533
w2v_emb norm 6.279396
Fors
bert norm 3.3367188
w2v_emb norm 4.9841805
speedy
bert norm 4.2539835
w2v_emb norm 5.4528265
rang
bert norm 3.8408904
w2v_emb norm 5.3259063
FT
bert norm 3.2443426
w2v_emb norm 4.5031967
selecting
bert norm 3.5778306
w2v_emb norm 5.806641
pale
bert norm 4.025792
w2v_emb norm 6.101575
WD
bert norm 3.5216317
w2v_emb norm 5.5884266
probability
bert norm 3.9928195
w2v_emb norm 2.7209902
OUND
bert norm 3.7207904
w2v_emb norm 4.5511966
istrate
bert norm 3.4644504
w2v_emb norm 5.6056986
sens
bert norm 3.6104448
w2v_emb norm 4.6122856
interpret
bert norm 3.4826896
w2v_emb norm 5.1100554
puzzle
bert norm 3.5098217
w2v_emb norm 4.9704633
inland
bert norm 3.342766
w2v_emb norm 4.6718664
manipulation
bert norm 3.805798
w2v_emb norm 5.290841
Sal
bert norm 3.4092517
w2v_emb norm 4.516552
fulfilling
bert norm 3.7584493
w2v_emb norm 5.1613107
McMaster
bert norm 3.3691216
w2v_emb norm 3.7424982
Make
bert 

bert norm 3.6975737
w2v_emb norm 5.3752155
002
bert norm 3.8086188
w2v_emb norm 4.8337865
disl
bert norm 3.7890358
w2v_emb norm 4.948234
Lowry
bert norm 3.7196574
w2v_emb norm 10.480582
ENTITY/Demon
bert norm 3.331276
w2v_emb norm 2.8306465
Nonetheless
bert norm 4.112747
w2v_emb norm 4.726744
arro
bert norm 3.9259064
w2v_emb norm 6.24877
CONT
bert norm 3.658768
w2v_emb norm 4.6050553
Freder
bert norm 4.188353
w2v_emb norm 3.9709585
isson
bert norm 3.6755836
w2v_emb norm 5.364124
rout
bert norm 4.0781837
w2v_emb norm 5.8425775
ARA
bert norm 3.5266802
w2v_emb norm 4.687772
swinging
bert norm 3.4786572
w2v_emb norm 4.657217
Oct
bert norm 3.710656
w2v_emb norm 5.746151
liable
bert norm 3.4049935
w2v_emb norm 4.83421
leaning
bert norm 3.6091983
w2v_emb norm 5.4572344
lungs
bert norm 3.4765673
w2v_emb norm 6.3765607
ENTITY/380
bert norm 3.680862
w2v_emb norm 7.5982246
ENTITY/Process
bert norm 3.90455
w2v_emb norm 6.7247076
Cov
bert norm 3.4787738
w2v_emb norm 5.2959633
terrorism
bert norm 3.

bert norm 3.9471793
w2v_emb norm 6.575626
ods
bert norm 3.876049
w2v_emb norm 4.533743
utsch
bert norm 3.7104201
w2v_emb norm 5.3951116
terminals
bert norm 3.7324603
w2v_emb norm 4.46322
Baird
bert norm 3.7899776
w2v_emb norm 6.541867
hast
bert norm 3.6664362
w2v_emb norm 4.897425
brass
bert norm 3.5640795
w2v_emb norm 5.559782
parental
bert norm 3.9490747
w2v_emb norm 4.5248213
Conduct
bert norm 3.3438857
w2v_emb norm 4.711786
expands
bert norm 4.0473895
w2v_emb norm 4.771419
luck
bert norm 4.178716
w2v_emb norm 6.350318
mur
bert norm 3.9335344
w2v_emb norm 5.117537
Bj
bert norm 3.5928597
w2v_emb norm 5.213794
administrations
bert norm 4.025626
w2v_emb norm 5.2474566
Olivier
bert norm 3.611523
w2v_emb norm 5.189185
narrowed
bert norm 3.950143
w2v_emb norm 4.9404564
winner
bert norm 3.3420668
w2v_emb norm 4.74638
makeshift
bert norm 3.9068522
w2v_emb norm 5.9034433
VAT
bert norm 3.726406
w2v_emb norm 4.9016986
Javier
bert norm 3.470083
w2v_emb norm 5.6457486
systematic
bert norm 3.5200

bert norm 3.9656196
w2v_emb norm 6.9487443
uchi
bert norm 3.480225
w2v_emb norm 4.6610165
transporting
bert norm 3.5299814
w2v_emb norm 5.1420746
speculative
bert norm 4.1346765
w2v_emb norm 5.9102664
Sek
bert norm 4.233459
w2v_emb norm 6.3538313
abal
bert norm 3.505653
w2v_emb norm 5.1273456
shipment
bert norm 4.001187
w2v_emb norm 6.252734
oker
bert norm 3.820872
w2v_emb norm 6.265654
warranty
bert norm 4.2230754
w2v_emb norm 6.5895443
atan
bert norm 3.7223089
w2v_emb norm 5.5665026
blister
bert norm 3.5403576
w2v_emb norm 4.195616
Celebration
bert norm 3.6942997
w2v_emb norm 5.570799
wal
bert norm 3.916913
w2v_emb norm 6.024508
lac
bert norm 3.521848
w2v_emb norm 5.453978
prioritize
bert norm 3.7707524
w2v_emb norm 8.728297
ENTITY/BP
bert norm 3.2255375
w2v_emb norm 4.675878
collaborated
bert norm 3.6772325
w2v_emb norm 10.351803
ENTITY/Newsletter
bert norm 3.8913896
w2v_emb norm 4.9434576
Damian
bert norm 3.7761364
w2v_emb norm 4.966724
Residential
bert norm 3.9776685
w2v_emb norm 

bert norm 3.991466
w2v_emb norm 6.609004
Sne
bert norm 3.4315987
w2v_emb norm 5.0980325
misinformation
bert norm 3.8660116
w2v_emb norm 5.921498
Sinai
bert norm 3.828428
w2v_emb norm 6.055824
nitrogen
bert norm 3.4040563
w2v_emb norm 4.5748944
203
bert norm 3.5711367
w2v_emb norm 4.340249
escaping
bert norm 3.5648708
w2v_emb norm 5.5984125
junction
bert norm 3.6926882
w2v_emb norm 5.192936
Santana
bert norm 3.8206568
w2v_emb norm 5.622892
Yemeni
bert norm 3.7025998
w2v_emb norm 5.3086185
whipped
bert norm 3.669417
w2v_emb norm 8.413751
ENTITY/Stephenson
bert norm 3.3083835
w2v_emb norm 4.9860444
attire
bert norm 3.8973656
w2v_emb norm 11.272134
ENTITY/Bard
bert norm 3.9389737
w2v_emb norm 6.0011873
Faul
bert norm 3.8052666
w2v_emb norm 6.371366
Sym
bert norm 4.119891
w2v_emb norm 6.39312
resh
bert norm 3.6905048
w2v_emb norm 6.0454736
MG
bert norm 3.6990006
w2v_emb norm 5.047913
Sub
bert norm 3.7338922
w2v_emb norm 10.685519
ENTITY/Carmen
bert norm 4.0852594
w2v_emb norm 5.670179
ig
be

bert norm 3.9623742
w2v_emb norm 6.733375
ohl
bert norm 3.5226376
w2v_emb norm 5.2495184
instincts
bert norm 3.7887075
w2v_emb norm 5.686697
Poo
bert norm 4.2281556
w2v_emb norm 6.198066
nih
bert norm 3.8748994
w2v_emb norm 3.289924
esting
bert norm 3.9426713
w2v_emb norm 5.7382445
asses
bert norm 3.716414
w2v_emb norm 4.1016946
Introduction
bert norm 3.8530054
w2v_emb norm 11.965565
ENTITY/Sirius
bert norm 3.4008276
w2v_emb norm 3.8242912
Local
bert norm 3.8493314
w2v_emb norm 4.731348
rehearsal
bert norm 3.6149027
w2v_emb norm 5.4215083
demol
bert norm 3.68627
w2v_emb norm 5.4980907
traffickers
bert norm 3.311618
w2v_emb norm 4.727827
upsetting
bert norm 3.7341862
w2v_emb norm 5.475884
heir
bert norm 3.8210735
w2v_emb norm 4.2477536
death
bert norm 3.6549442
w2v_emb norm 4.6231313
Moments
bert norm 3.5275128
w2v_emb norm 4.4514956
ENTITY/Los
bert norm 3.5822864
w2v_emb norm 5.6003823
atmospheric
bert norm 4.091201
w2v_emb norm 4.20703
aints
bert norm 3.646956
w2v_emb norm 4.985126
Di

bert norm 3.575889
w2v_emb norm 5.126199
spoon
bert norm 4.1336646
w2v_emb norm 6.02943
sha
bert norm 3.6908927
w2v_emb norm 5.038359
dismantle
bert norm 4.097113
w2v_emb norm 6.597078
elta
bert norm 3.5554628
w2v_emb norm 5.251175
jar
bert norm 3.986702
w2v_emb norm 4.81338
space
bert norm 3.6631944
w2v_emb norm 4.78889
Smart
bert norm 4.196503
w2v_emb norm 4.5760508
mere
bert norm 3.5402462
w2v_emb norm 6.728775
Ð
bert norm 3.7655268
w2v_emb norm 4.790197
Gillespie
bert norm 3.7766252
w2v_emb norm 5.2715197
Lo
bert norm 3.9682603
w2v_emb norm 10.9396305
ENTITY/Mead
bert norm 4.0116506
w2v_emb norm 4.8344
capacity
bert norm 3.7423375
w2v_emb norm 4.3372984
Issue
bert norm 3.8615239
w2v_emb norm 6.4498515
050
bert norm 3.9521255
w2v_emb norm 6.6336465
Vall
bert norm 3.4146504
w2v_emb norm 5.15764
meme
bert norm 3.7588258
w2v_emb norm 5.480621
pard
bert norm 3.7819746
w2v_emb norm 4.5572615
compensated
bert norm 3.8966765
w2v_emb norm 6.4443994
Ket
bert norm 3.976939
w2v_emb norm 4.2360

bert norm 4.1182556
w2v_emb norm 5.2600303
ENTITY/ITIES
bert norm 3.6342885
w2v_emb norm 3.7298906
Suddenly
bert norm 3.3365164
w2v_emb norm 5.2691317
foray
bert norm 4.304761
w2v_emb norm 5.401277
pell
bert norm 3.8305337
w2v_emb norm 5.601649
licensed
bert norm 3.9788356
w2v_emb norm 5.975683
fra
bert norm 3.541122
w2v_emb norm 4.782477
blasting
bert norm 3.7420068
w2v_emb norm 10.930681
ENTITY/Blizzard
bert norm 4.194598
w2v_emb norm 3.61634
orer
bert norm 3.6137288
w2v_emb norm 5.763979
chili
bert norm 3.7238307
w2v_emb norm 4.7383795
Sylvia
bert norm 3.5595994
w2v_emb norm 3.9483795
except
bert norm 4.0240645
w2v_emb norm 5.359581
tec
bert norm 3.8119812
w2v_emb norm 5.010371
Resistance
bert norm 3.6889951
w2v_emb norm 3.998397
young
bert norm 3.6097639
w2v_emb norm 4.508248
Dreams
bert norm 3.7631457
w2v_emb norm 4.955177
Archives
bert norm 3.5509164
w2v_emb norm 5.2382874
unleash
bert norm 3.7456813
w2v_emb norm 5.7056856
Pract
bert norm 3.497879
w2v_emb norm 4.2365856
likened
b

bert norm 3.4433594
w2v_emb norm 5.2083673
ravaged
bert norm 3.887733
w2v_emb norm 4.3528447
limited
bert norm 3.3127215
w2v_emb norm 5.028579
rituals
bert norm 3.670842
w2v_emb norm 10.280317
ENTITY/Knowledge
bert norm 3.8625271
w2v_emb norm 9.662294
ENTITY/Utility
bert norm 3.7191513
w2v_emb norm 5.1172934
doom
bert norm 3.8379292
w2v_emb norm 4.8735795
sheds
bert norm 4.0357866
w2v_emb norm 6.267608
Gael
bert norm 3.5039065
w2v_emb norm 8.517336
ENTITY/Millennials
bert norm 3.932412
w2v_emb norm 5.0641384
Monthly
bert norm 3.3777583
w2v_emb norm 4.9036655
domination
bert norm 3.8002675
w2v_emb norm 5.682343
rapport
bert norm 4.0667343
w2v_emb norm 4.5722747
spot
bert norm 4.003835
w2v_emb norm 5.9720874
Prest
bert norm 3.5870876
w2v_emb norm 5.7649274
HA
bert norm 3.734868
w2v_emb norm 5.2358522
tact
bert norm 3.2552276
w2v_emb norm 14.522749
ENTITY/Richard
bert norm 3.608889
w2v_emb norm 4.985634
gritty
bert norm 3.1879015
w2v_emb norm 4.147663
Does
bert norm 3.7881749
w2v_emb norm

w2v_emb norm 5.830069
payload
bert norm 3.6106713
w2v_emb norm 4.88554
ENTITY/227
bert norm 3.7800875
w2v_emb norm 5.52024
livestream
bert norm 4.2413387
w2v_emb norm 6.7009354
ORN
bert norm 3.7279851
w2v_emb norm 4.784196
Abel
bert norm 3.3289905
w2v_emb norm 4.705879
deception
bert norm 3.449129
w2v_emb norm 4.400207
Britain
bert norm 3.9428384
w2v_emb norm 5.431004
partisan
bert norm 3.546568
w2v_emb norm 5.6655617
browse
bert norm 3.7623124
w2v_emb norm 6.49551
melan
bert norm 3.5147123
w2v_emb norm 4.5033937
172
bert norm 3.7291024
w2v_emb norm 3.7273777
Numerous
bert norm 3.931646
w2v_emb norm 11.088363
ENTITY/Mansion
bert norm 3.616883
w2v_emb norm 5.151376
assailants
bert norm 3.7091196
w2v_emb norm 4.980937
directives
bert norm 3.8171785
w2v_emb norm 2.6791716
Integ
bert norm 4.032264
w2v_emb norm 2.1985765
zers
bert norm 4.0729733
w2v_emb norm 5.8612084
duct
bert norm 3.2997248
w2v_emb norm 4.481842
Honestly
bert norm 3.693274
w2v_emb norm 3.6913812
Immediately
bert norm 3.66

bert norm 4.020059
w2v_emb norm 5.4481354
RESP
bert norm 3.5866213
w2v_emb norm 4.768446
239
bert norm 4.118263
w2v_emb norm 2.4781713
fman
bert norm 3.4456747
w2v_emb norm 3.8834944
theoretically
bert norm 3.2143254
w2v_emb norm 4.997637
distraught
bert norm 3.3745742
w2v_emb norm 5.4683104
staircase
bert norm 3.761791
w2v_emb norm 5.295771
expel
bert norm 3.7248123
w2v_emb norm 5.135461
lord
bert norm 3.7209077
w2v_emb norm 5.1875114
behaviours
bert norm 3.895608
w2v_emb norm 5.205453
prescribing
bert norm 3.7141035
w2v_emb norm 3.9918427
Newly
bert norm 3.499971
w2v_emb norm 5.019069
patiently
bert norm 3.5460923
w2v_emb norm 5.0987344
skyline
bert norm 3.590108
w2v_emb norm 4.7948785
repertoire
bert norm 3.5841422
w2v_emb norm 5.1721573
hover
bert norm 4.185877
w2v_emb norm 5.611833
mint
bert norm 3.6458154
w2v_emb norm 5.285466
clears
bert norm 3.8598495
w2v_emb norm 5.9384356
kale
bert norm 3.7920215
w2v_emb norm 6.163053
Sco
bert norm 3.8120244
w2v_emb norm 4.767501
Coulter
bert

bert norm 3.7579594
w2v_emb norm 6.6754575
redirect
bert norm 3.5879784
w2v_emb norm 5.6204944
derogatory
bert norm 3.89113
w2v_emb norm 5.6479354
lateral
bert norm 3.8244166
w2v_emb norm 5.657072
ENTITY/495
bert norm 3.8202746
w2v_emb norm 5.230068
rolley
bert norm 4.116686
w2v_emb norm 5.049831
brew
bert norm 3.4158251
w2v_emb norm 6.4502335
babys
bert norm 3.7153077
w2v_emb norm 5.8824806
muff
bert norm 3.8792815
w2v_emb norm 5.655017
dime
bert norm 3.267987
w2v_emb norm 4.622881
wonderfully
bert norm 3.3646986
w2v_emb norm 5.1495376
treasures
bert norm 3.7587078
w2v_emb norm 6.154352
NES
bert norm 3.657565
w2v_emb norm 5.3390746
ponds
bert norm 3.727563
w2v_emb norm 5.421602
impulse
bert norm 3.5602727
w2v_emb norm 5.099479
detecting
bert norm 3.4038682
w2v_emb norm 5.9253273
grin
bert norm 3.802254
w2v_emb norm 6.59543
brid
bert norm 3.2220292
w2v_emb norm 5.428047
shoved
bert norm 3.6379898
w2v_emb norm 5.5902395
purge
bert norm 3.9829848
w2v_emb norm 3.099629
OTHER
bert norm 3.9

bert norm 3.6892128
w2v_emb norm 4.9765773
387
bert norm 3.8226867
w2v_emb norm 5.051731
productive
bert norm 3.766437
w2v_emb norm 4.086909
NEED
bert norm 4.00518
w2v_emb norm 4.4774976
minus
bert norm 3.7591462
w2v_emb norm 4.6994967
Pages
bert norm 3.9063876
w2v_emb norm 6.741299
cand
bert norm 3.7830057
w2v_emb norm 11.13398
ENTITY/Clover
bert norm 3.972165
w2v_emb norm 5.187622
Forensic
bert norm 3.967772
w2v_emb norm 7.1658397
ryn
bert norm 4.268625
w2v_emb norm 5.930155
ogle
bert norm 4.0112762
w2v_emb norm 6.4032702
ocr
bert norm 3.674334
w2v_emb norm 5.817058
vaccinations
bert norm 4.0532355
w2v_emb norm 5.8025846
cies
bert norm 3.9716613
w2v_emb norm 6.294643
Mek
bert norm 3.7884989
w2v_emb norm 4.483973
unaffected
bert norm 4.028434
w2v_emb norm 6.081608
fetal
bert norm 3.82662
w2v_emb norm 5.5370374
Dino
bert norm 3.998308
w2v_emb norm 5.5088544
hemisphere
bert norm 3.6459954
w2v_emb norm 5.112826
froze
bert norm 3.9822342
w2v_emb norm 5.0882945
Peg
bert norm 3.7687984
w2v_

bert norm 3.5972877
w2v_emb norm 4.8294888
318
bert norm 3.6485388
w2v_emb norm 5.0982924
Verge
bert norm 3.873565
w2v_emb norm 11.053531
ENTITY/Fin
bert norm 3.705043
w2v_emb norm 4.8850145
Mighty
bert norm 3.687136
w2v_emb norm 5.5621986
ENTITY/403
bert norm 3.7813687
w2v_emb norm 5.027909
bass
bert norm 3.9023762
w2v_emb norm 4.5929184
nice
bert norm 3.8805194
w2v_emb norm 4.9912853
sinks
bert norm 4.0231953
w2v_emb norm 4.9863153
Laugh
bert norm 3.620134
w2v_emb norm 5.0499935
367
bert norm 4.1362576
w2v_emb norm 6.5258327
Zur
bert norm 3.5373275
w2v_emb norm 5.007185
travers
bert norm 3.656322
w2v_emb norm 5.0301814
Mystery
bert norm 3.7625673
w2v_emb norm 10.349939
ENTITY/Monarch
bert norm 3.5153825
w2v_emb norm 5.241556
leapt
bert norm 4.100127
w2v_emb norm 4.1237903
ergy
bert norm 3.6558359
w2v_emb norm 2.6403048
porate
bert norm 3.8916972
w2v_emb norm 4.507589
display
bert norm 4.1848283
w2v_emb norm 5.237719
ilet
bert norm 3.5466096
w2v_emb norm 6.97637
endemic
bert norm 3.74

bert norm 3.7808022
w2v_emb norm 10.7384
ENTITY/Ether
bert norm 3.9476855
w2v_emb norm 6.5198913
proportional
bert norm 3.9114342
w2v_emb norm 5.377334
laund
bert norm 3.8860075
w2v_emb norm 10.014243
ENTITY/Rye
bert norm 3.7276278
w2v_emb norm 4.5570264
ambiguity
bert norm 3.6709204
w2v_emb norm 4.7164397
Terror
bert norm 3.7679164
w2v_emb norm 4.340152
Improved
bert norm 3.8882182
w2v_emb norm 5.748187
cooker
bert norm 4.200648
w2v_emb norm 5.8681226
elsen
bert norm 3.7074506
w2v_emb norm 5.290023
guerrilla
bert norm 4.0529065
w2v_emb norm 3.5203323
ATURE
bert norm 3.7741811
w2v_emb norm 4.9685435
unprepared
bert norm 3.8974526
w2v_emb norm 5.196015
camel
bert norm 3.9937124
w2v_emb norm 5.999382
fitt
bert norm 3.766492
w2v_emb norm 10.908274
ENTITY/Sex
bert norm 4.0403395
w2v_emb norm 5.423505
edged
bert norm 3.735171
w2v_emb norm 4.9154615
recurrent
bert norm 3.7623127
w2v_emb norm 3.9674852
Compare
bert norm 4.129556
w2v_emb norm 4.572107
Serving
bert norm 3.7054007
w2v_emb norm 5

bert norm 3.7223756
w2v_emb norm 5.078566
wallpaper
bert norm 3.8876483
w2v_emb norm 4.7625747
nurs
bert norm 3.6157112
w2v_emb norm 5.297506
subset
bert norm 3.8279247
w2v_emb norm 6.067047
ENTITY/703
bert norm 3.5511227
w2v_emb norm 4.637277
symbolism
bert norm 3.619854
w2v_emb norm 5.540793
dudes
bert norm 3.9017293
w2v_emb norm 5.391079
mismatch
bert norm 3.96451
w2v_emb norm 5.2698145
gans
bert norm 3.921666
w2v_emb norm 5.181736
please
bert norm 3.893823
w2v_emb norm 5.9340525
KE
bert norm 3.765894
w2v_emb norm 5.746739
atom
bert norm 3.7712967
w2v_emb norm 5.6440053
004
bert norm 4.1254573
w2v_emb norm 6.4911604
ionic
bert norm 3.4333851
w2v_emb norm 6.1668115
servings
bert norm 3.9930625
w2v_emb norm 5.423859
proxies
bert norm 3.963564
w2v_emb norm 5.935285
transcription
bert norm 4.044423
w2v_emb norm 6.509101
yx
bert norm 3.938193
w2v_emb norm 6.2072244
bowl
bert norm 3.8657258
w2v_emb norm 5.8600616
Scotch
bert norm 3.8853843
w2v_emb norm 5.2636795
brace
bert norm 4.0004025


bert norm 3.4316356
w2v_emb norm 5.297156
Hillary
bert norm 4.044502
w2v_emb norm 5.2737327
TAM
bert norm 3.8558712
w2v_emb norm 6.7044334
Hist
bert norm 3.7974312
w2v_emb norm 3.7055147
mechan
bert norm 3.8043737
w2v_emb norm 4.722522
Robots
bert norm 3.9972136
w2v_emb norm 4.715336
Leader
bert norm 3.8110487
w2v_emb norm 5.9255743
cartridges
bert norm 3.864079
w2v_emb norm 5.6605744
whistleblowers
bert norm 4.0550785
w2v_emb norm 6.5925016
SPL
bert norm 3.774537
w2v_emb norm 5.4204426
Labour
bert norm 4.023168
w2v_emb norm 6.5994005
unction
bert norm 3.9228985
w2v_emb norm 5.037591
faithfully
bert norm 3.729382
w2v_emb norm 5.1573915
coarse
bert norm 3.71381
w2v_emb norm 5.4790373
synth
bert norm 3.8228977
w2v_emb norm 5.5138636
LV
bert norm 3.7839172
w2v_emb norm 4.2472577
justifying
bert norm 3.6660097
w2v_emb norm 6.0813603
ENTITY/439
bert norm 3.554841
w2v_emb norm 4.851659
Victoria
bert norm 3.9718559
w2v_emb norm 9.34807
ENTITY/Proceedings
bert norm 3.615337
w2v_emb norm 5.1101

bert norm 3.5056186
w2v_emb norm 6.5717297
ENTITY/591
bert norm 3.7759745
w2v_emb norm 5.0681853
Pick
bert norm 3.6752605
w2v_emb norm 5.9791603
chords
bert norm 3.9430764
w2v_emb norm 9.493015
ENTITY/Hound
bert norm 3.9063506
w2v_emb norm 4.2264757
faces
bert norm 3.8756068
w2v_emb norm 5.036862
Yin
bert norm 4.287512
w2v_emb norm 6.8036056
ugi
bert norm 3.9796379
w2v_emb norm 5.066398
bows
bert norm 4.023266
w2v_emb norm 4.3774586
Forms
bert norm 3.722518
w2v_emb norm 7.5339785
ENTITY/886
bert norm 3.8785095
w2v_emb norm 11.317399
ENTITY/Ox
bert norm 3.7228956
w2v_emb norm 5.4382896
ENTITY/351
bert norm 3.6614187
w2v_emb norm 5.928186
mating
bert norm 3.8557403
w2v_emb norm 6.4185114
ENTITY/916
bert norm 3.8066604
w2v_emb norm 6.399634
expend
bert norm 3.724344
w2v_emb norm 4.627803
usefulness
bert norm 3.7232106
w2v_emb norm 5.938593
Marvel
bert norm 4.0122013
w2v_emb norm 4.35184
Stretch
bert norm 3.796949
w2v_emb norm 5.835443
JS
bert norm 3.839561
w2v_emb norm 5.085277
Hal
bert n

bert norm 3.8969297
w2v_emb norm 7.0757365
uart
bert norm 3.9265428
w2v_emb norm 5.1844935
Armory
bert norm 3.9413037
w2v_emb norm 5.0843563
orange
bert norm 3.8411572
w2v_emb norm 5.7104053
physiology
bert norm 3.8650837
w2v_emb norm 5.727416
Ut
bert norm 3.8970485
w2v_emb norm 6.5681624
parchment
bert norm 3.9621038
w2v_emb norm 5.0286665
Fired
bert norm 4.013361
w2v_emb norm 4.736895
trap
bert norm 4.069164
w2v_emb norm 2.9127264
mson
bert norm 3.9560723
w2v_emb norm 10.7925205
ENTITY/Poster
bert norm 3.7422829
w2v_emb norm 3.6872268
bount
bert norm 3.8693628
w2v_emb norm 4.863026
import
bert norm 3.9806151
w2v_emb norm 5.1215754
maximum
bert norm 3.6820414
w2v_emb norm 5.4547706
422
bert norm 3.7010262
w2v_emb norm 6.220102
nodding
bert norm 3.5604796
w2v_emb norm 5.3902307
inscription
bert norm 3.8067539
w2v_emb norm 4.507754
Results
bert norm 4.135841
w2v_emb norm 6.9791102
GRE
bert norm 3.5438728
w2v_emb norm 5.633056
cognition
bert norm 3.50688
w2v_emb norm 6.203584
ions
bert n

bert norm 3.8790843
w2v_emb norm 5.5442963
655
bert norm 3.9229536
w2v_emb norm 4.7545185
Conversion
bert norm 3.8840618
w2v_emb norm 5.875844
mL
bert norm 3.921146
w2v_emb norm 11.320548
ENTITY/Border
bert norm 3.7721298
w2v_emb norm 6.6544747
ENTITY/Ë
bert norm 3.945993
w2v_emb norm 5.4267526
Factor
bert norm 3.536741
w2v_emb norm 11.049405
ENTITY/Number
bert norm 3.687183
w2v_emb norm 2.4245598
ejac
bert norm 3.8746474
w2v_emb norm 5.614219
Cho
bert norm 3.7531042
w2v_emb norm 5.3889093
righteousness
bert norm 4.0394363
w2v_emb norm 4.486887
PATH
bert norm 3.8800035
w2v_emb norm 3.9434075
Elys
bert norm 3.9378026
w2v_emb norm 5.4608192
faculties
bert norm 3.778393
w2v_emb norm 10.874105
ENTITY/Earthquake
bert norm 3.8663201
w2v_emb norm 4.184355
References
bert norm 3.935175
w2v_emb norm 5.8514457
buff
bert norm 3.6170866
w2v_emb norm 13.909046
ENTITY/1895
bert norm 3.7571752
w2v_emb norm 6.325309
colo
bert norm 4.061725
w2v_emb norm 8.862666
ENTITY/Vi
bert norm 3.9029465
w2v_emb no

bert norm 3.8311903
w2v_emb norm 5.9206862
Gors
bert norm 3.815516
w2v_emb norm 11.720044
ENTITY/Duel
bert norm 3.923306
w2v_emb norm 5.4027314
admins
bert norm 3.9394686
w2v_emb norm 6.948442
ENTITY/Flor
bert norm 3.9410887
w2v_emb norm 9.43142
ENTITY/Deus
bert norm 4.130965
w2v_emb norm 6.2160206
cham
bert norm 3.7506492
w2v_emb norm 5.0317516
Rails
bert norm 3.9303875
w2v_emb norm 5.4977055
ceptor
bert norm 4.1352997
w2v_emb norm 3.4696736
naire
bert norm 3.9739335
w2v_emb norm 9.747762
ENTITY/Squid
bert norm 3.899261
w2v_emb norm 10.352557
ENTITY/Warranty
bert norm 3.7844043
w2v_emb norm 5.493911
SPEC
bert norm 3.9679778
w2v_emb norm 6.5837884
ensis
bert norm 4.07436
w2v_emb norm 4.424245
FUN
bert norm 3.7491865
w2v_emb norm 5.1682334
stellar
bert norm 3.8248994
w2v_emb norm 4.372141
Select
bert norm 3.8815868
w2v_emb norm 3.3677714
arget
bert norm 3.7910893
w2v_emb norm 10.012594
ENTITY/Uncharted
bert norm 3.754923
w2v_emb norm 4.281964
Details
bert norm 3.778931
w2v_emb norm 6.03

bert norm 3.8062665
w2v_emb norm 4.499876
arthy
bert norm 3.9228878
w2v_emb norm 10.211814
ENTITY/Psychic
bert norm 3.8006036
w2v_emb norm 7.015316
dorsal
bert norm 3.678271
w2v_emb norm 2.3675685
cember
bert norm 3.6922946
w2v_emb norm 6.2371497
joice
bert norm 3.909689
w2v_emb norm 3.849451
uint
bert norm 3.8510156
w2v_emb norm 3.9718673
derog
bert norm 4.142574
w2v_emb norm 3.8294382
Subject
bert norm 3.9405663
w2v_emb norm 3.7278402
hemat
bert norm 3.9922662
w2v_emb norm 5.6332207
meshes
bert norm 3.8269312
w2v_emb norm 5.7795725
Terran
bert norm 4.0787387
w2v_emb norm 5.040955
Load
bert norm 3.6705363
w2v_emb norm 5.2660456
goblins
bert norm 3.918046
w2v_emb norm 4.605116
Shattered
bert norm 3.8831456
w2v_emb norm 4.9881554
tests
bert norm 4.115881
w2v_emb norm 5.0853877
Spread
bert norm 3.7783196
w2v_emb norm 9.015059
ENTITY/Naruto
bert norm 3.5713556
w2v_emb norm 2.5441256
predic
bert norm 3.9297967
w2v_emb norm 5.420382
Hyp
bert norm 3.9164755
w2v_emb norm 8.390921
ENTITY/Arkha

bert norm 3.6811836
w2v_emb norm 6.8010654
integer
bert norm 3.8121836
w2v_emb norm 6.2708287
regex
bert norm 3.0098765
w2v_emb norm 6.5504246
nomine
bert norm 3.1529586
w2v_emb norm 5.777616
subparagraph
bert norm 4.0332465
w2v_emb norm 5.9146333
Header
bert norm 3.8797193
w2v_emb norm 5.241732
Spawn
bert norm 3.8609357
w2v_emb norm 5.9294143
toggle
bert norm 3.641218
w2v_emb norm 4.974119
Abyss
bert norm 3.5815713
w2v_emb norm 6.1967025
expr
bert norm 3.80759
w2v_emb norm 6.4719214
Zerg
bert norm 3.6652973
w2v_emb norm 9.214765
ENTITY/Grimoire
bert norm 4.194874
w2v_emb norm 4.532595
Contents
bert norm 3.762523
w2v_emb norm 3.0809548
Instance
bert norm 3.324495
w2v_emb norm 8.049925
cyclopedia
bert norm 3.3646529
w2v_emb norm 5.5112553
Takeru
bert norm 3.2571828
w2v_emb norm 6.764012
rgb
bert norm 3.1371999
w2v_emb norm 6.4929523
htt
bert norm 2.679919
w2v_emb norm 4.8394094
bryce
bert norm 2.9951315
w2v_emb norm 3.593342
livest
bert norm 3.3725355
w2v_emb norm 5.4912477
Annotations


45451

In [15]:
filename = 'data/mapping_train_data_entities_roberta.jsonl'
with open(filename, "w", encoding='utf-8') as fp:
        for row in mapping_train_data:
            fp.write(json.dumps(row) + "\n")

In [56]:
6.3212e-01

0.63212

In [63]:
2.3254e-01

0.23254

In [65]:
8.5548e-02

0.085548

In [66]:
5.7642e-04

0.00057642

In [67]:
one = torch.nn.Softmax()(torch.flip(torch.arange(1, 1000, 1), dims=[0]).float())

  one = torch.nn.Softmax()(torch.flip(torch.arange(1, 1000, 1), dims=[0]).float())


In [69]:
one.unsqueeze(dim=0).expand(128, -1).shape

torch.Size([128, 999])

In [81]:
with open("./aggregations.json") as f:
    aggregations = json.load(f)

In [82]:
for s in aggregations:
    for rel in aggregations[s]:
        if len(aggregations[s][rel]) == 1:
            continue
        
        ids = tokenizer.convert_tokens_to_ids(aggregations[s][rel])
        bert_average = torch.mean(torch.stack([i_embeddings[i] for i in ids], dim=1), dim=1)
        w2v_average = torch.mean(torch.stack([torch.tensor(entities[common_vocab[i]]) if common_vocab[i] in entities else torch.tensor(wordpiece[common_vocab[i].lower()]) for i in ids], dim=1), dim=1)
        data_point={'word':", ".join(aggregations[s][rel]), 'wikipedia2vec':w2v_average, 'bert': bert_average}
        mapping_train_data.append(data_point)

len(mapping_train_data)

23178

In [83]:
[w['word'] for w in mapping_train_data[22143:]]

['Syria, Bulgaria, Armenia',
 'Russia, Brazil, Australia, Germany, Mongolia, Canada, Greece, Italy',
 'Zambia, Angola',
 'Australia, Germany, Canada',
 'NATO, UNESCO',
 'Bavaria, Prussia',
 'Czechoslovakia, Switzerland, Poland, Luxembourg, Belgium',
 'Mali, Singapore, Cuba, Norway, Paraguay, Angola, Thailand, Azerbaijan, Botswana, Guyana, Peru, Cameroon, Jordan, Kenya, Australia, Ghana, Belgium, Bulgaria, Uganda, Iran, Algeria, Zambia, Hungary, Ireland, Turkey, Sweden, Uzbekistan, Benin, Serbia, Belarus, Bangladesh, Switzerland, Honduras, Indonesia, Israel, Madagascar, Somalia, Panama, Bhutan, Poland, Guatemala, Iraq, Colombia, Japan, Ukraine, Tunisia, Cyprus, Uruguay, Gabon, Rwanda, Slovenia, Mozambique, Ethiopia, Greece, Chad, Belize, Denmark, Chile, Brazil, Mongolia, Kazakhstan, Argentina, Mexico, Portugal, Finland, Bolivia, Italy, Albania, Moldova, Venezuela, Namibia, Senegal, Spain, Austria, Malta, Netherlands, Croatia, Luxembourg, Zimbabwe, Monaco, Egypt, Libya',
 'Europe, NATO',

In [84]:
filename = 'data/mapping_train_data_entites_words_averages.jsonl'
with open(filename, "w", encoding='utf-8') as fp:
        for row in mapping_train_data:
            fp.write(json.dumps(row) + "\n")

TypeError: Object of type Tensor is not JSON serializable