In [1]:
import sys
import os

# Get the absolute path of the parent directory.
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

# Add the parent directory to the system path to be able to import modules from 'lib.'
sys.path.append(parent_dir)

In [2]:
import datasets

import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from nltk.corpus import stopwords
import numpy as np
import random

import pandas as pd
import pathlib

import string
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm

### Package options ###
torch.set_printoptions(threshold=10_000)

[nltk_data] Downloading package punkt to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
### Utils ###
def plot_heatmap(x: np.array, labels: np.array) -> None:
    plt.figure(figsize=(15, 15))
    sns.heatmap(
        x,
        linewidth=0.5,
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
        fmt='.2f',
    )
    plt.title(f'Self-attention matrix: layer {layer}, head {head}', fontsize=15)
    
    plt.show()
    return

def average_out_and_remove_rows(t: torch.tensor, averages_idx, remove_idx):
    for average_idx in averages_idx:  # The nested lists can have different dimensions.
        # Replace the attention scores of the first token with the average of the token attention scores.
        t[min(average_idx)] = torch.mean(t[average_idx], dim=0, keepdim=True)
    return t[~remove_idx]


def preprocess_attention_scores(attention_scores, averages_idx, remove_idx):
    attention_scores = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    attention_scores = attention_scores.transpose(0, 1)
    attention_scores = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    return attention_scores.transpose(0, 1)
        
    

def backward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    in_nodes = np.array([edge[0] for edge in list(G.in_edges(current_node))])
    in_nodes = in_nodes[(in_nodes > left_edge) & (in_nodes < current_node)]
    for node in in_nodes:
        sequence[node] = 1
        sequences.append(sequence)
        mean += G[node][current_node]['weight']
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, left_edge, node, sequence.copy(), mean)
        forward_pass(G, node, left_edge, current_node, sequence.copy(), mean)
        
    return
    
    
def forward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    out_nodes = np.array([edge[1] for edge in list(G.out_edges(current_node))])
    out_nodes = out_nodes[(out_nodes > current_node) & (out_nodes < right_edge)]
    for node in out_nodes:
        sequence[node] = 1
        mean += G[current_node][node]['weight']
        sequences.append(sequence)
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, current_node, node, sequence.copy(), mean)
        forward_pass(G, node, node, right_edge, sequence.copy(), mean)
            
    return
    

def construct_sequences(G: nx.DiGraph, n_tokens):
    for node in G.nodes():
        sequence = np.zeros(n_tokens)
        mean = 0
        sequence[node] = 1
        #sequences.append(sequence) # Do not allow for 1-token sequences.
        forward_pass(G, node, node, n_tokens, sequence.copy(), mean)

In [4]:
# Load Wikipedia dataset.
# TODO: Split between server and local.
#wiki_dataset = datasets.load_dataset("wikipedia", "20220301.en")['train']
wiki_dataset = datasets.load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir="/nfs/data/projects/daniela")['train']

Found cached dataset wikipedia (/nfs/data/projects/daniela/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Set device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seed.
utils.fix_seed(41)

Using seed: 41

In [6]:
# Set DSDM hyperparameters.
address_size = 1000
ema_time_period = 7000 # 500
learning_rate_update = 0.5

temperature = 0.05

normalize = False

#chunk_sizes = [5]

prune_mode = "fixed-size"
max_size_address_space = 3000
chunk_score_threshold = 0.9

In [7]:
cleanup = cleanup.Cleanup(address_size)

In [8]:
model_name = "bert-base-uncased"  # Has 12 layers
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

MAXIMUM_SEQUENCE_LENGTH = 512

In [9]:
# Initialize memory.
memory = DSDM.DSDM(
    address_size=address_size,
    ema_time_period=ema_time_period,
    learning_rate_update=learning_rate_update,
    temperature=temperature,
    normalize=normalize,
    prune_mode=prune_mode,
    max_size_address_space=max_size_address_space,
    chunk_score_threshold=chunk_score_threshold,
) 

In [10]:
# Construct train set (texts) and inference set (sentences; in and out of train set text).
train_size = 10
test_size = 10

# Text indeces.
train_idx = np.random.randint(0, len(wiki_dataset), size=train_size)
#train_idx = np.append(np.append(np.append(train_idx[0], train_idx[0]), train_idx[0]), train_idx[0]) 

# Caclulate chosen text statistics.
# TODO

# Text indeces from which we extract sentences.
intest_idx = np.random.choice(train_idx, test_size)
outtest_idx = np.random.choice(np.setdiff1d(np.arange(len(wiki_dataset)), train_idx), test_size)

In [11]:
inference_sentences_in = []
inference_sentences_out = []

for idx_in, idx_out in zip(intest_idx, outtest_idx):
    # Get sentences.
    sentences_in = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_in)]['text'])
    sentences_out = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_out)]['text'])
    
    # Get sentence index.
    sentence_idx_in = int(
        np.random.randint(
            0,
            len(sentences_in),
            size=1
        )
    )
    sentence_idx_out = int(
        np.random.randint(
            0,
            len(sentences_out),
            size=1
        )
    )

    # Append sentence to list.
    inference_sentences_in.append(sentences_in[sentence_idx_in])
    inference_sentences_out.append(sentences_out[sentence_idx_out])

In [12]:
dups_found = 0

def remove_duplicates(memory):
    global dups_found
    global_keep_mask = torch.tensor([True] * len(memory.addresses)).to(device)
    
    for idx, address in enumerate(memory.addresses):
        if global_keep_mask[idx].item():
            cos = torch.nn.CosineSimilarity()
            keep_mask = cos(memory.addresses, address) < 0.95
            # Keep current address
            keep_mask[idx] = True
            global_keep_mask &= keep_mask

    if global_keep_mask.sum().item() > 0:
        dups_found += 1
        # Remove similar addresses
        memory.addresses = memory.addresses[global_keep_mask]
        # Remove bins
        memory.bins = memory.bins[global_keep_mask]
        # Remove chunk scores
        memory.chunk_scores = memory.chunk_scores[global_keep_mask]

In [13]:
# Training
for pos, i in enumerate(tqdm(train_idx)):
    text = wiki_dataset[int(i)]['text']
    
    # Preprocess data. 
    sentences = preprocess.split_text_into_sentences(text)
    
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt")
        if inputs['input_ids'].shape[1] > MAXIMUM_SEQUENCE_LENGTH:
            break
        
        outputs = model(**inputs, output_attentions=True)
        attention_matrix = outputs.attentions
        
        encoding = tokenizer.encode(sentence)
        labels = tokenizer.convert_ids_to_tokens(encoding)

        i = 0
        averages_idx = []
        while i < len(labels) - 1:
            j = i + 1
            average_idx = []
            while labels[j].startswith('#'):
                average_idx.append(j)
                labels[i] += labels[j].replace('#', '')
                j += 1
            if average_idx != []:
                average_idx.append(i)
                averages_idx.append(average_idx)
            i = j
        
        hashtag_idx = np.array([label.startswith("#") for label in labels])
        stopwords_idx = np.array([label in stopwords.words('english') for label in labels])
        punctuation_idx = np.array([label in string.punctuation for label in labels])
        remove_idx = hashtag_idx | punctuation_idx | stopwords_idx
        labels = np.array(labels)[~remove_idx]
        labels = labels[1:(len(labels) - 1)]

        layer = 0
        for head in range(12):
            head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()
        
            head_scores_raw_tensor = preprocess_attention_scores(head_scores_raw_tensor, averages_idx, remove_idx)
            
            head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()
            
            head_scores = head_scores_raw[1:(len(head_scores_raw) - 1), 1:(len(head_scores_raw) - 1)].copy()
        
            as_threshold = 0.5
            head_scores[head_scores < as_threshold] = 0
            
            G = nx.from_numpy_array(head_scores, create_using = nx.DiGraph())
        
            sequences = []
            means = []
            n_tokens = len(labels)
            construct_sequences(G, n_tokens)
                
            df = pd.DataFrame(data=[sequences, means]).T.rename(columns={0: 'seq',  1: 'score'})
            
            if len(df) > 0:
                df['len'] = df['seq'].map(sum)
                df['score'] = df['score'].astype('float64')
                df = df.sort_values(by=['score', 'len'], ascending=[False, False]).reset_index(drop=True)
                top3_df = df.head(3)
            
                for i in range(len(top3_df)):
                    memory.save(
                        inference.generate_query(
                            address_size,
                            cleanup,
                            labels[top3_df['seq'][i].astype(bool)]
                        ),
                        top3_df['score'][i]
                    )
        memory.prune()
#     if (pos + 1) % 50 == 0:
#         remove_duplicates(memory)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [04:55<00:00, 29.55s/it]


In [14]:
#inference_sentences_in = ["The Society convenes an annual conference, in locations across the United States and in Canada, usually in June, to convey the James Alice award."]
inference_sentences_in = ["Deputy director flys to the United States."]

In [15]:
len(memory.addresses)

314

In [16]:
retrieve_mode = "top_k"

# Get table with token similarities for each "out-of-train" sentence.
retrieved_contents = inference.infer(
    memory.address_size,
    cleanup,
    memory,
    inference_sentences_in,
    retrieve_mode=retrieve_mode,
    k=7, #TODO: What if index is out of range?
)

if retrieve_mode == "top_k":
    sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
    
    for s, addresses in zip(inference_sentences_in, retrieved_contents):
        display(s)
        out_tables = []
        for a in addresses:
            address_sims_df = inference.get_similarities_to_atomic_set(
                a, cleanup)
            out = widgets.Output()
            with out:
                display(address_sims_df)
            out_tables.append(out)
        display(widgets.HBox(out_tables))
elif retrieve_mode == "pooling":  
    sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
      
    for s, c in zip(inference_sentences_in, retrieved_contents):
        sentence_sims_df = inference.get_similarities_to_atomic_set(
            c, cleanup)
        sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
        sims_df = pd.concat([sims_df, sentence_sims_df])

    sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
                     .set_index(['sentence', 'token'])
    
    display(sims_df)
else:  # unrecognized
    pass

'Deputy director flys to the United States.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

In [17]:
len(memory.addresses)

314

In [18]:
memory.prune()
addresses = np.random.randint(0, len(memory.addresses), size=70)
#addresses = np.argwhere((memory.chunk_scores > 0.97).cpu().detach().numpy().flatten()).flatten()
#addresses = np.argwhere((memory.bins > 50).cpu().detach().numpy().flatten()).flatten()

for address in np.arange(0, len(memory.addresses)):
    display(md(f"### Address {address}"))
    address_sims_df = inference.get_similarities_to_atomic_set(
            memory.addresses[address],
            cleanup,
    )
    display(address_sims_df)

### Address 0

Unnamed: 0,token,similarity
0,name,0.70993
1,common,0.70993
2,goals,0.101419
3,people,0.084515
4,wounded,0.076064
5,anna,0.073247
6,clinic,0.073247
7,exams,0.07043
8,italian,0.067612
9,administration,0.067612


### Address 1

Unnamed: 0,token,similarity
0,snail,0.727324
1,sea,0.727324
2,shape,0.111367
3,issues,0.094868
4,trustees,0.094868
5,january,0.092119
6,youth,0.089369
7,1,0.089369
8,ended,0.078369
9,square,0.07287


### Address 2

Unnamed: 0,token,similarity
0,mollusk,0.705691
1,gastropod,0.705691
2,engagement,0.082189
3,’,0.079355
4,royal,0.079355
5,far,0.068018
6,player,0.068018
7,campaign,0.068018
8,free,0.068018
9,white,0.065184


### Address 3

Unnamed: 0,token,similarity
0,mm,0.69282
1,11,0.69282
2,business,0.075056
3,brown,0.072169
4,generally,0.069282
5,new,0.066395
6,impressed,0.063509
7,21st,0.060622
8,freetown,0.057735
9,organized,0.057735


### Address 4

Unnamed: 0,token,similarity
0,19,0.689202
1,mm,0.689202
2,professional,0.111723
3,battalion,0.094312
4,length,0.09141
5,financial,0.079802
6,freetown,0.073999
7,references,0.071097
8,economic,0.068195
9,uk,0.068195


### Address 5

Unnamed: 0,token,similarity
0,13,0.709225
1,mm,0.709225
2,various,0.111389
3,new,0.077549
4,championship,0.077549
5,president,0.074729
6,freetown,0.06627
7,100,0.06627
8,financial,0.06627
9,impressed,0.06627


### Address 6

Unnamed: 0,token,similarity
0,conical,0.691375
1,elongated,0.691375
2,australia,0.092569
3,youth,0.092569
4,premier,0.080998
5,saw,0.078105
6,would,0.075212
7,business,0.07232
8,commonwealth,0.066534
9,film,0.060748


### Address 7

Unnamed: 0,token,similarity
0,shape,0.608753
1,conical,0.580038
2,elongated,0.552472
3,would,0.08959
4,australia,0.082699
5,film,0.075807
6,premier,0.07351
7,commonwealth,0.072361
8,hampshire,0.071213
9,directed,0.070064


### Address 8

Unnamed: 0,token,similarity
0,solid,0.720417
1,somewhat,0.720417
2,awards,0.104106
3,ice,0.079121
4,championship,0.079121
5,generally,0.073569
6,whorls,0.068016
7,positions,0.068016
8,species,0.06524
9,football,0.06524


### Address 9

Unnamed: 0,token,similarity
0,stripes,0.714843
1,brown,0.714843
2,screenings,0.107716
3,july,0.07694
4,former,0.07694
5,’,0.074142
6,party,0.074142
7,university,0.068547
8,years,0.068547
9,little,0.068547


### Address 10

Unnamed: 0,token,similarity
0,stripes,0.597989
1,generally,0.573932
2,brown,0.573932
3,screenings,0.109975
4,people,0.0905
5,entity,0.083627
6,former,0.083627
7,garrison,0.076753
8,9,0.06988
9,university,0.067589


### Address 11

Unnamed: 0,token,similarity
0,stripes,0.52037
1,generally,0.518384
2,broken,0.498522
3,brown,0.476675
4,screenings,0.099307
5,entity,0.095335
6,people,0.085404
7,little,0.083418
8,university,0.079446
9,former,0.075474


### Address 12

Unnamed: 0,token,similarity
0,moderately,0.721803
1,impressed,0.721803
2,entity,0.087281
3,21,0.078969
4,ceo,0.073427
5,intact,0.073427
6,mollusk,0.073427
7,website,0.067886
8,screenings,0.067886
9,signatures,0.065115


### Address 13

Unnamed: 0,token,similarity
0,7,0.705691
1,whorls,0.705691
2,facility,0.085023
3,broken,0.085023
4,australia,0.076521
5,alberto,0.068018
6,list,0.065184
7,actually,0.06235
8,mayor,0.06235
9,center,0.059516


### Address 14

Unnamed: 0,token,similarity
0,less,0.702851
1,measures,0.702851
2,former,0.099594
3,anna,0.088212
4,within,0.085367
5,society,0.085367
6,square,0.07683
7,puerto,0.065448
8,health,0.062602
9,attacked,0.062602


Unnamed: 0,token,similarity
0,less,0.702851
1,measures,0.702851
2,former,0.099594
3,anna,0.088212
4,within,0.085367
5,society,0.085367
6,square,0.07683
7,puerto,0.065448
8,health,0.062602
9,attacked,0.062602


Unnamed: 0,token,similarity
0,total,0.720417
1,length,0.720417
2,independent,0.120763
3,served,0.098554
4,exams,0.084673
5,measures,0.076345
6,issues,0.073569
7,history,0.070792
8,alumni,0.06524
9,australia,0.062464


### Address 16

Unnamed: 0,token,similarity
0,within,0.686294
1,distance,0.686294
2,luciano,0.094712
3,18,0.091797
4,approximately,0.083055
5,inequality,0.080141
6,elections,0.074312
7,references,0.074312
8,positions,0.074312
9,external,0.074312


### Address 17

Unnamed: 0,token,similarity
0,little,0.578778
1,within,0.558575
2,distance,0.545502
3,18,0.08438
4,positions,0.079627
5,website,0.079627
6,luciano,0.07725
7,partition,0.074873
8,running,0.074873
9,sound,0.074873


### Address 18

Unnamed: 0,token,similarity
0,little,0.69857
1,distance,0.69857
2,partition,0.094479
3,species,0.088753
4,step,0.080164
5,iraq,0.071575
6,running,0.071575
7,regiment,0.065849
8,royal,0.062986
9,’,0.062986


### Address 19

Unnamed: 0,token,similarity
0,outer,0.707814
1,base,0.707814
2,professional,0.103135
3,street,0.097483
4,currently,0.083355
5,400,0.083355
6,rican,0.077704
7,mollusk,0.072053
8,decides,0.069227
9,winners,0.063576


### Address 20

Unnamed: 0,token,similarity
0,south,0.7
1,australia,0.7
2,elongated,0.111429
3,founded,0.077143
4,400,0.074286
5,positions,0.071429
6,mm,0.068571
7,examinations,0.065714
8,de,0.062857
9,high,0.062857


### Address 21

Unnamed: 0,token,similarity
0,marine,0.713442
1,species,0.713442
2,born,0.082698
3,annual,0.082698
4,olympics,0.068681
5,business,0.065878
6,whorls,0.060271
7,campaign,0.060271
8,june,0.060271
9,hospitals,0.057468


### Address 22

Unnamed: 0,token,similarity
0,links,0.712039
1,external,0.712039
2,everyday,0.085669
3,1946,0.082861
4,rican,0.082861
5,party,0.080052
6,puerto,0.077243
7,100,0.074434
8,ii,0.068816
9,mba,0.068816


### Address 23

Unnamed: 0,token,similarity
0,society,0.726636
1,royal,0.726636
2,diverse,0.09083
3,commerce,0.07982
4,leone,0.07982
5,step,0.071563
6,summer,0.06881
7,kingdom,0.066058
8,indian,0.066058
9,generally,0.063305


### Address 24

Unnamed: 0,token,similarity
0,world,0.705707
1,register,0.705675
2,candidate,0.119034
3,struggles,0.102027
4,government,0.08219
5,last,0.06802
6,79,0.068018
7,player,0.062351
8,businesses,0.062351
9,anna,0.06235


### Address 25

Unnamed: 0,token,similarity
0,label,0.69282
1,record,0.69282
2,ukrainian,0.109697
3,barriers,0.072169
4,battalion,0.069282
5,wide,0.069282
6,rico,0.066395
7,clinic,0.066395
8,4th,0.063509
9,british,0.063509


### Address 26

Unnamed: 0,token,similarity
0,based,0.598934
1,label,0.567715
2,record,0.563091
3,barriers,0.106374
4,ukrainian,0.098281
5,battalion,0.089031
6,neutral,0.077468
7,high,0.077468
8,register,0.074
9,april,0.071687


### Address 27

Unnamed: 0,token,similarity
0,italian,0.594385
1,label,0.582776
2,record,0.545627
3,massachusetts,0.090551
4,neutral,0.088229
5,barriers,0.087068
6,ukrainian,0.078942
7,wide,0.075459
8,away,0.073137
9,18th,0.073137


### Address 28

Unnamed: 0,token,similarity
0,soundtrack,0.707107
1,film,0.707107
2,inequality,0.093338
3,young,0.093338
4,breaking,0.082024
5,league,0.073539
6,economic,0.073539
7,high,0.070711
8,name,0.062225
9,broken,0.062225


### Address 29

Unnamed: 0,token,similarity
0,soundtrack,0.587478
1,issues,0.584496
2,film,0.57842
3,inequality,0.107457
4,young,0.096134
5,breaking,0.089217
6,second,0.079724
7,economic,0.078933
8,electoral,0.075226
9,name,0.07428


### Address 30

Unnamed: 0,token,similarity
0,soundtrack,0.715551
1,issues,0.715533
2,total,0.114598
3,electoral,0.086648
4,second,0.086648
5,breaking,0.078263
6,april,0.078262
7,chief,0.075466
8,ii,0.072673
9,england,0.072672


### Address 31

Unnamed: 0,token,similarity
0,labels,0.712741
1,record,0.712741
2,men,0.106631
3,clinic,0.086988
4,screenings,0.075764
5,21,0.075764
6,away,0.070152
7,boston,0.067346
8,life,0.067346
9,name,0.067346


### Address 32

Unnamed: 0,token,similarity
0,released,0.700714
1,79,0.700714
2,runners,0.098471
3,examinations,0.087054
4,register,0.0842
5,gonzalez,0.078491
6,government,0.072783
7,state,0.072783
8,external,0.067074
9,alumni,0.067074


### Address 33

Unnamed: 0,token,similarity
0,79,0.577385
1,released,0.544974
2,titles,0.543774
3,register,0.102033
4,directed,0.09243
5,drama,0.074424
6,examinations,0.072023
7,state,0.070823
8,mexican,0.068422
9,electoral,0.067222


### Address 34

Unnamed: 0,token,similarity
0,fernandez,0.722496
1,fernando,0.722496
2,mental,0.074741
3,2nd,0.071973
4,entity,0.071973
5,votes,0.071973
6,president,0.069205
7,hockey,0.069205
8,decides,0.069205
9,since,0.066436


### Address 35

Unnamed: 0,token,similarity
0,film,0.703562
1,directed,0.703562
2,competed,0.1066
3,inequality,0.1066
4,commission,0.081016
5,range,0.081016
6,titles,0.078174
7,recommendations,0.072488
8,shape,0.069646
9,society,0.069646


### Address 36

Unnamed: 0,token,similarity
0,film,0.599922
1,drama,0.590746
2,directed,0.552892
3,titles,0.105531
4,inequality,0.099796
5,competed,0.084884
6,government,0.075707
7,elongated,0.071119
8,recommendations,0.068825
9,board,0.066531


### Address 37

Unnamed: 0,token,similarity
0,torres,0.683374
1,fernando,0.683374
2,equity,0.103896
3,2017,0.077556
4,need,0.071703
5,far,0.068776
6,competed,0.06585
7,originally,0.059996
8,registered,0.059996
9,football,0.05707


### Address 38

Unnamed: 0,token,similarity
0,alberto,0.704982
1,gonzalez,0.704982
2,participation,0.08369
3,service,0.080853
4,79,0.069505
5,full,0.069505
6,real,0.063831
7,total,0.063831
8,men,0.060994
9,8,0.060994


### Address 39

Unnamed: 0,token,similarity
0,de,0.697145
1,luciano,0.697129
2,facility,0.106148
3,club,0.100411
4,organized,0.086065
5,full,0.07459
6,executives,0.065984
7,health,0.065984
8,distance,0.065984
9,5th,0.063115


### Address 40

Unnamed: 0,token,similarity
0,white,0.714143
1,black,0.714143
2,conical,0.086817
3,1,0.086817
4,acting,0.075615
5,label,0.072815
6,competed,0.067213
7,alberto,0.067213
8,business,0.064413
9,law,0.061612


### Address 41

Unnamed: 0,token,similarity
0,films,0.70214
1,white,0.70214
2,summer,0.095423
3,two,0.089726
4,kingdom,0.075484
5,maria,0.075484
6,debut,0.069787
7,sub,0.066938
8,18,0.06409
9,whorls,0.061241


### Address 42

Unnamed: 0,token,similarity
0,films,0.710634
1,mexican,0.710634
2,brown,0.091468
3,iihf,0.088653
4,sub,0.085839
5,two,0.068953
6,british,0.066138
7,shape,0.063324
8,may,0.063324
9,extension,0.060509


### Address 43

Unnamed: 0,token,similarity
0,football,0.727326
1,association,0.727322
2,puerto,0.083869
3,citizen,0.078369
4,marine,0.07562
5,2017,0.07012
6,torres,0.06737
7,impressed,0.06737
8,states,0.06737
9,color,0.061871


### Address 44

Unnamed: 0,token,similarity
0,association,0.645492
1,football,0.589431
2,club,0.561902
3,de,0.081667
4,school,0.070997
5,marine,0.068949
6,2017,0.066512
7,league,0.062856
8,puerto,0.06097
9,hampshire,0.06035


### Address 45

Unnamed: 0,token,similarity
0,football,0.614611
1,association,0.611378
2,italian,0.548602
3,marine,0.073994
4,citizen,0.070911
5,school,0.065435
6,black,0.063282
7,torres,0.062791
8,barriers,0.060252
9,would,0.060053


### Address 46

Unnamed: 0,token,similarity
0,last,0.694262
1,played,0.694262
2,university,0.106588
3,british,0.092184
4,administration,0.086423
5,rican,0.086423
6,film,0.0749
7,1,0.072019
8,barriers,0.069138
9,battalion,0.063377


### Address 47

Unnamed: 0,token,similarity
0,club,0.704982
1,history,0.704982
2,association,0.089364
3,player,0.08369
4,alumni,0.080853
5,luciano,0.078016
6,sound,0.075179
7,common,0.072342
8,outer,0.063831
9,sierra,0.060994


### Address 48

Unnamed: 0,token,similarity
0,club,0.717635
1,founded,0.717635
2,sound,0.079428
3,australia,0.079428
4,000,0.076641
5,based,0.071067
6,school,0.06828
7,remove,0.06828
8,luciano,0.062706
9,association,0.062706


### Address 49

Unnamed: 0,token,similarity
0,successive,0.710634
1,years,0.710634
2,partition,0.083025
3,dorchester,0.077396
4,uk,0.074581
5,trustees,0.068953
6,18,0.068953
7,remain,0.063324
8,2020,0.060509
9,moderately,0.060509


### Address 50

Unnamed: 0,token,similarity
0,4,0.707107
1,years,0.707107
2,18,0.096167
3,background,0.084853
4,directors,0.084853
5,kingdom,0.079196
6,campaign,0.076368
7,21st,0.076368
8,key,0.070711
9,ended,0.067882


### Address 51

Unnamed: 0,token,similarity
0,championships,0.69857
1,successive,0.69857
2,24,0.074438
3,stripes,0.068712
4,top,0.065849
5,released,0.065849
6,shape,0.065849
7,saw,0.062986
8,serie,0.060123
9,manager,0.060123


### Address 52

Unnamed: 0,token,similarity
0,league,0.709962
1,lowest,0.709897
2,director,0.0817
3,mission,0.081693
4,gave,0.078879
5,soundtrack,0.076065
6,name,0.076065
7,professional,0.076064
8,problems,0.070433
9,houses,0.067616


### Address 53

Unnamed: 0,token,similarity
0,4,0.712039
1,successive,0.712039
2,uk,0.082861
3,executives,0.077243
4,towards,0.071625
5,november,0.068816
6,21st,0.068816
7,states,0.066008
8,south,0.066008
9,group,0.066008


### Address 54

Unnamed: 0,token,similarity
0,runners,0.691376
1,winners,0.691376
2,facto,0.083891
3,league,0.083891
4,1,0.080998
5,background,0.075212
6,india,0.075212
7,record,0.069427
8,regiment,0.069427
9,certification,0.060748


### Address 55

Unnamed: 0,token,similarity
0,group,0.7
1,champions,0.7
2,partition,0.085714
3,commission,0.077143
4,housing,0.074286
5,football,0.071429
6,stripes,0.071429
7,regiment,0.068571
8,system,0.068571
9,ceo,0.065714


### Address 56

Unnamed: 0,token,similarity
0,ended,0.728011
1,second,0.728011
2,indian,0.090658
3,may,0.087911
4,april,0.085164
5,career,0.074175
6,role,0.071427
7,4,0.06868
8,people,0.065933
9,1,0.060439


### Address 57

Unnamed: 0,token,similarity
0,champions,0.712741
1,e,0.712741
2,gave,0.106631
3,belong,0.084182
4,approximately,0.075764
5,england,0.072958
6,broken,0.070152
7,within,0.067346
8,business,0.061733
9,diverse,0.061733


### Address 58

Unnamed: 0,token,similarity
0,serie,0.707814
1,return,0.707814
2,list,0.097483
3,struggles,0.094658
4,top,0.072053
5,social,0.072053
6,designated,0.066402
7,full,0.06075
8,soundtrack,0.057925
9,street,0.057925


### Address 59

Unnamed: 0,token,similarity
0,senior,0.72111
1,side,0.72111
2,saw,0.091526
3,citizen,0.080432
4,maria,0.080432
5,team,0.077658
6,international,0.074885
7,external,0.072111
8,directed,0.069338
9,central,0.066564


### Address 60

Unnamed: 0,token,similarity
0,football,0.709972
1,youth,0.709888
2,electoral,0.078882
3,mba,0.073245
4,iraq,0.073244
5,elongated,0.070426
6,puerto,0.064801
7,association,0.062061
8,serie,0.061978
9,full,0.061976


### Address 61

Unnamed: 0,token,similarity
0,colors,0.722496
1,team,0.722496
2,olympics,0.116264
3,2017,0.105191
4,actually,0.083045
5,less,0.080277
6,october,0.074741
7,vision,0.071973
8,2008,0.071973
9,sub,0.0609


### Address 62

Unnamed: 0,token,similarity
0,website,0.69642
1,official,0.69642
2,step,0.10195
3,dorchester,0.093335
4,certification,0.078975
5,winners,0.064616
6,13,0.064616
7,one,0.064616
8,moderately,0.064616
9,mexican,0.061744


### Address 63

Unnamed: 0,token,similarity
0,health,0.694315
1,center,0.694209
2,actually,0.089287
3,mm,0.074873
4,running,0.072045
5,decides,0.072001
6,7,0.071966
7,gave,0.069093
8,within,0.06628
9,british,0.066226


### Address 64

Unnamed: 0,token,similarity
0,born,0.697137
1,1958,0.697137
2,business,0.086066
3,annual,0.086066
4,signatures,0.080329
5,passed,0.07746
6,service,0.071722
7,leone,0.068853
8,de,0.068853
9,free,0.060246


### Address 65

Unnamed: 0,token,similarity
0,health,0.570295
1,center,0.556233
2,street,0.541756
3,commissioner,0.078934
4,base,0.073503
5,running,0.073495
6,within,0.067475
7,take,0.067142
8,three,0.06564
9,step,0.062966


### Address 66

Unnamed: 0,token,similarity
0,ceo,0.722496
1,president,0.722496
2,change,0.099655
3,common,0.080277
4,initial,0.077509
5,fernandez,0.077509
6,group,0.074741
7,exams,0.071973
8,free,0.066436
9,base,0.063668


### Address 67

Unnamed: 0,token,similarity
0,sierra,0.722501
1,leone,0.72249
2,currently,0.077506
3,exams,0.077503
4,goals,0.072007
5,belong,0.071963
6,partition,0.066479
7,iraq,0.063677
8,problems,0.060936
9,free,0.060923


### Address 68

Unnamed: 0,token,similarity
0,family,0.736893
1,background,0.736878
2,iraq,0.118066
3,runners,0.080065
4,wide,0.074639
5,current,0.069211
6,february,0.06921
7,21,0.06921
8,4,0.066495
9,need,0.063783


### Address 69

Unnamed: 0,token,similarity
0,sierra,0.628909
1,leone,0.627238
2,freetown,0.513375
3,partition,0.083686
4,goals,0.083298
5,problems,0.074978
6,various,0.07177
7,political,0.069433
8,free,0.06712
9,acting,0.065811


### Address 70

Unnamed: 0,token,similarity
0,leone,0.710079
1,freetown,0.709756
2,political,0.076136
3,june,0.075991
4,various,0.073366
5,chamber,0.073274
6,championship,0.070008
7,partition,0.067905
8,manager,0.067348
9,mental,0.064749


### Address 71

Unnamed: 0,token,similarity
0,ethnic,0.706399
1,group,0.706399
2,executive,0.080691
3,attacked,0.07786
4,extension,0.072197
5,maria,0.069366
6,2013,0.069366
7,directors,0.066535
8,ceo,0.066535
9,harvard,0.066535


### Address 72

Unnamed: 0,token,similarity
0,family,0.703562
1,belong,0.703562
2,iraq,0.115128
3,e,0.103758
4,need,0.098072
5,system,0.086702
6,4th,0.081016
7,house,0.078174
8,gave,0.072488
9,partition,0.069646


### Address 73

Unnamed: 0,token,similarity
0,manager,0.696419
1,general,0.696419
2,2012,0.093335
3,championships,0.081847
4,system,0.078975
5,79,0.076104
6,leone,0.076104
7,wounded,0.07036
8,february,0.07036
9,failing,0.067488


### Address 74

Unnamed: 0,token,similarity
0,manager,0.570833
1,deputy,0.561218
2,general,0.55553
3,championships,0.093676
4,institute,0.077895
5,gonzalez,0.077077
6,titles,0.06999
7,leone,0.065377
8,serie,0.064008
9,second,0.06396


### Address 75

Unnamed: 0,token,similarity
0,passed,0.693549
1,examinations,0.693535
2,1st,0.076419
3,79,0.076419
4,school,0.073537
5,officials,0.073535
6,iraq,0.064885
7,labels,0.064885
8,clinic,0.064884
9,e,0.062001


### Address 76

Unnamed: 0,token,similarity
0,school,0.699304
1,london,0.699267
2,primary,0.092952
3,mba,0.078653
4,3rd,0.078651
5,war,0.075792
6,candidates,0.075792
7,passed,0.070072
8,references,0.067212
9,england,0.067212


### Address 77

Unnamed: 0,token,similarity
0,united,0.68775
1,kingdom,0.68775
2,less,0.082879
3,2012,0.071247
4,rico,0.065431
5,co,0.062523
6,annual,0.059615
7,bigger,0.059615
8,foot,0.059615
9,club,0.056707


### Address 78

Unnamed: 0,token,similarity
0,business,0.712039
1,administration,0.712039
2,player,0.096905
3,1958,0.094096
4,role,0.082861
5,towards,0.074434
6,name,0.074434
7,commissioner,0.071625
8,time,0.068816
9,england,0.063199


### Address 79

Unnamed: 0,token,similarity
0,maria,0.69857
1,anna,0.69857
2,outer,0.077301
3,common,0.074438
4,including,0.065849
5,services,0.065849
6,intact,0.062986
7,bigger,0.062986
8,senior,0.060123
9,mission,0.060123


### Address 80

Unnamed: 0,token,similarity
0,extension,0.698565
1,school,0.698565
2,italian,0.065833
3,mba,0.065802
4,shape,0.065696
5,would,0.062939
6,citizen,0.057266
7,sound,0.057167
8,within,0.054387
9,services,0.054349


### Address 81

Unnamed: 0,token,similarity
0,exams,0.713442
1,passed,0.713442
2,school,0.093911
3,organizations,0.091108
4,far,0.088304
5,sierra,0.082698
6,iraq,0.082698
7,total,0.082698
8,ceo,0.079894
9,university,0.079894


### Address 82

Unnamed: 0,token,similarity
0,business,0.693542
1,management,0.693542
2,player,0.093722
3,titles,0.093722
4,commissioner,0.090838
5,screenings,0.079303
6,1,0.076419
7,following,0.076419
8,mm,0.073536
9,conical,0.067768


### Address 83

Unnamed: 0,token,similarity
0,school,0.713454
1,business,0.713431
2,1958,0.082697
3,association,0.077091
4,university,0.074288
5,titles,0.071484
6,marine,0.071483
7,mental,0.068681
8,candidates,0.065878
9,passed,0.065878


### Address 84

Unnamed: 0,token,similarity
0,university,0.544732
1,business,0.515494
2,school,0.513586
3,harvard,0.478504
4,played,0.082833
5,effects,0.075039
6,primary,0.075038
7,exams,0.075036
8,3rd,0.073087
9,1958,0.06724


### Address 85

Unnamed: 0,token,similarity
0,university,0.613686
1,school,0.595934
2,business,0.592601
3,shape,0.08878
4,passed,0.081011
5,played,0.081011
6,candidates,0.079902
7,exams,0.075462
8,3rd,0.074353
9,kingdom,0.067694


### Address 86

Unnamed: 0,token,similarity
0,management,0.703562
1,institute,0.703562
2,serie,0.078174
3,get,0.075331
4,player,0.072488
5,flys,0.072488
6,2nd,0.072488
7,2013,0.069646
8,financial,0.066803
9,screenings,0.066803


### Address 87

Unnamed: 0,token,similarity
0,london,0.713442
1,uk,0.713442
2,successive,0.085501
3,economic,0.082698
4,primary,0.074288
5,england,0.065878
6,stripes,0.057468
7,fifa,0.057468
8,kingdom,0.054665
9,000,0.054665


### Address 88

Unnamed: 0,token,similarity
0,hospitals,0.724569
1,various,0.724569
2,name,0.097989
3,label,0.081428
4,army,0.075907
5,june,0.075907
6,ice,0.075907
7,general,0.067626
8,24,0.067626
9,dorchester,0.062106


### Address 89

Unnamed: 0,token,similarity
0,health,0.681197
1,care,0.681154
2,less,0.093956
3,professional,0.082212
4,maria,0.079274
5,registered,0.073402
6,active,0.070464
7,management,0.06753
8,neutral,0.067529
9,acting,0.064593


### Address 90

Unnamed: 0,token,similarity
0,financial,0.719027
1,sound,0.719027
2,played,0.084837
3,businesses,0.076492
4,society,0.062585
5,new,0.059803
6,task,0.059803
7,extension,0.059803
8,served,0.057021
9,time,0.057021


### Address 91

Unnamed: 0,token,similarity
0,sound,0.599282
1,brought,0.591426
2,financial,0.591426
3,task,0.080802
4,designated,0.070702
5,chief,0.06958
6,health,0.068457
7,played,0.068457
8,deputy,0.066213
9,new,0.066213


### Address 92

Unnamed: 0,token,similarity
0,5,0.73212
1,000,0.73212
2,species,0.081954
3,october,0.081954
4,commission,0.079222
5,equity,0.07649
6,businesses,0.073758
7,director,0.073758
8,summer,0.073758
9,current,0.071027


### Address 93

Unnamed: 0,token,similarity
0,served,0.684105
1,people,0.684105
2,common,0.078935
3,championships,0.078935
4,public,0.076012
5,attacked,0.076012
6,generally,0.073088
7,10th,0.073088
8,community,0.073088
9,high,0.067241


### Address 94

Unnamed: 0,token,similarity
0,increased,0.596205
1,served,0.560996
2,people,0.546912
3,alumni,0.100932
4,public,0.092717
5,mm,0.083328
6,real,0.082154
7,community,0.080981
8,total,0.078633
9,generally,0.075112


### Address 95

Unnamed: 0,token,similarity
0,new,0.680447
1,england,0.680435
2,professional,0.095527
3,would,0.07789
4,24,0.074951
5,fernandez,0.072012
6,economic,0.069073
7,issues,0.066133
8,13,0.063195
9,rico,0.063194


### Address 96

Unnamed: 0,token,similarity
0,health,0.697937
1,screenings,0.697771
2,running,0.093149
3,2007,0.087413
4,stripes,0.084542
5,2008,0.078802
6,mm,0.075956
7,within,0.073086
8,senior,0.07308
9,step,0.073076


### Address 97

Unnamed: 0,token,similarity
0,foot,0.727324
1,square,0.727324
2,belong,0.078369
3,current,0.078369
4,january,0.07562
5,royal,0.07012
6,recommendations,0.06737
7,released,0.06462
8,hampshire,0.061871
9,gonzalez,0.061871


### Address 98

Unnamed: 0,token,similarity
0,health,0.716309
1,facility,0.716171
2,commissioner,0.093541
3,mayor,0.085166
4,de,0.085163
5,luciano,0.082376
6,professional,0.079586
7,daily,0.079582
8,website,0.071204
9,neutral,0.062827


### Address 99

Unnamed: 0,token,similarity
0,2017,0.69642
1,june,0.69642
2,part,0.093335
3,various,0.087591
4,torres,0.084719
5,uk,0.064616
6,need,0.064616
7,wounded,0.061744
8,organizations,0.061744
9,positions,0.058873


### Address 100

Unnamed: 0,token,similarity
0,’,0.699285
1,vision,0.699285
2,annual,0.075792
3,role,0.072932
4,5th,0.067211
5,first,0.064351
6,new,0.064351
7,garrison,0.064351
8,gastropod,0.061491
9,away,0.061491


### Address 101

Unnamed: 0,token,similarity
0,north,0.697137
1,dorchester,0.697137
2,run,0.083197
3,designated,0.07746
4,battalions,0.07746
5,21st,0.074591
6,social,0.071722
7,international,0.071722
8,team,0.068853
9,sierra,0.065984


### Address 102

Unnamed: 0,token,similarity
0,housing,0.71624
1,building,0.71624
2,exams,0.107506
3,group,0.087959
4,partition,0.085167
5,4th,0.082375
6,made,0.07679
7,1939,0.073998
8,deputy,0.073998
9,commonwealth,0.073998


### Address 103

Unnamed: 0,token,similarity
0,housing,0.592361
1,building,0.57735
2,senior,0.562339
3,group,0.099304
4,partition,0.087757
5,british,0.086603
6,approximately,0.07852
7,commonwealth,0.077365
8,exams,0.073901
9,ukrainian,0.069282


### Address 104

Unnamed: 0,token,similarity
0,high,0.717635
1,need,0.717635
2,alumni,0.087788
3,australia,0.087788
4,competed,0.082214
5,career,0.071067
6,parties,0.071067
7,team,0.06828
8,inequality,0.06828
9,elections,0.065493


### Address 105

Unnamed: 0,token,similarity
0,health,0.691413
1,clinic,0.691338
2,running,0.089679
3,mm,0.083894
4,neutral,0.083889
5,within,0.078106
6,reaching,0.075212
7,2007,0.069429
8,approximately,0.069427
9,financial,0.066537


### Address 106

Unnamed: 0,token,similarity
0,center,0.706404
1,fitness,0.706394
2,external,0.083522
3,british,0.07786
4,gave,0.075029
5,increased,0.075029
6,take,0.075029
7,business,0.072197
8,effects,0.069365
9,legislative,0.060872


### Address 107

Unnamed: 0,token,similarity
0,board,0.717635
1,directors,0.717635
2,ethnic,0.085001
3,goals,0.076641
4,cup,0.06828
5,iihf,0.06828
6,16,0.062706
7,summer,0.059919
8,mexican,0.059919
9,shape,0.057132


### Address 108

Unnamed: 0,token,similarity
0,social,0.70852
1,justice,0.70852
2,rican,0.095975
3,leaders,0.090329
4,debut,0.087506
5,color,0.084684
6,serie,0.081861
7,label,0.076215
8,links,0.07057
9,moving,0.07057


### Address 109

Unnamed: 0,token,similarity
0,life,0.723878
1,mission,0.723878
2,league,0.118804
3,executives,0.088413
4,20,0.080124
5,state,0.077361
6,green,0.074598
7,professional,0.071835
8,measures,0.071835
9,reaching,0.066309


### Address 110

Unnamed: 0,token,similarity
0,extension,0.566202
1,school,0.566202
2,harvard,0.562272
3,anna,0.071931
4,war,0.071907
5,ethnic,0.06353
6,effects,0.062482
7,return,0.061236
8,soundtrack,0.060212
9,olympics,0.058769


### Address 111

Unnamed: 0,token,similarity
0,24,0.720417
1,november,0.720417
2,regiment,0.087449
3,sub,0.084673
4,state,0.076345
5,hospitals,0.068016
6,successive,0.068016
7,2nd,0.062464
8,uk,0.062464
9,houses,0.059688


### Address 112

Unnamed: 0,token,similarity
0,social,0.720441
1,change,0.720392
2,debut,0.090229
3,garrison,0.090226
4,current,0.079122
5,flys,0.079118
6,3rd,0.076345
7,dorchester,0.070793
8,alberto,0.062466
9,hockey,0.062465


### Address 113

Unnamed: 0,token,similarity
0,awards,0.707107
1,alumni,0.707107
2,need,0.096167
3,snail,0.093338
4,increased,0.087681
5,candidate,0.084853
6,colors,0.082024
7,iihf,0.079196
8,club,0.079196
9,released,0.076368


### Address 114

Unnamed: 0,token,similarity
0,20,0.705691
1,february,0.705691
2,engagement,0.079355
3,life,0.079355
4,wide,0.076521
5,moderately,0.070853
6,chamber,0.070853
7,1958,0.068018
8,hampshire,0.06235
9,manager,0.059516


### Address 115

Unnamed: 0,token,similarity
0,young,0.709225
1,children,0.709225
2,second,0.071909
3,olympics,0.06909
4,film,0.06627
5,2015,0.06627
6,player,0.06063
7,youth,0.06063
8,board,0.06063
9,served,0.06063


### Address 116

Unnamed: 0,token,similarity
0,social,0.597006
1,justice,0.567368
2,addressing,0.560613
3,debut,0.088426
4,moving,0.086939
5,leaders,0.081644
6,serie,0.081471
7,rican,0.07264
8,1958,0.072451
9,color,0.070129


### Address 117

Unnamed: 0,token,similarity
0,health,0.717656
1,equity,0.717614
2,mm,0.107297
3,fernando,0.087787
4,running,0.082216
5,luciano,0.073853
6,positions,0.073853
7,increased,0.068281
8,2007,0.06828
9,4,0.06828


### Address 118

Unnamed: 0,token,similarity
0,health,0.703594
1,services,0.703531
2,change,0.081014
3,commissioner,0.078172
4,running,0.075333
5,president,0.075331
6,anna,0.07533
7,mission,0.072485
8,24,0.063959
9,designated,0.063958


### Address 119

Unnamed: 0,token,similarity
0,remove,0.712741
1,barriers,0.712741
2,campaign,0.07857
3,goals,0.072958
4,full,0.070152
5,university,0.070152
6,9,0.070152
7,running,0.067346
8,black,0.067346
9,base,0.067346


### Address 120

Unnamed: 0,token,similarity
0,care,0.689215
1,primary,0.68919
2,school,0.091409
3,opinions,0.085607
4,active,0.082705
5,screenings,0.082704
6,training,0.079801
7,maria,0.076902
8,impressed,0.073999
9,registered,0.071098


### Address 121

Unnamed: 0,token,similarity
0,wide,0.73212
1,range,0.73212
2,get,0.109272
3,currently,0.079222
4,neutral,0.07649
5,facto,0.07649
6,20,0.07649
7,italian,0.073758
8,label,0.071027
9,competed,0.071027


### Address 122

Unnamed: 0,token,similarity
0,public,0.71345
1,school,0.713435
2,mba,0.082697
3,served,0.079894
4,championships,0.074288
5,far,0.074287
6,primary,0.063075
7,candidates,0.063073
8,national,0.063073
9,shape,0.060271


### Address 123

Unnamed: 0,token,similarity
0,color,0.703562
1,black,0.703562
2,within,0.09523
3,label,0.083859
4,played,0.078174
5,initial,0.078174
6,international,0.075331
7,1,0.072488
8,bigger,0.072488
9,hockey,0.072488


### Address 124

Unnamed: 0,token,similarity
0,social,0.702145
1,services,0.702134
2,serie,0.092575
3,change,0.092574
4,effects,0.08118
5,hockey,0.078332
6,solid,0.075484
7,community,0.072635
8,3rd,0.066938
9,side,0.066938


### Address 125

Unnamed: 0,token,similarity
0,june,0.697137
1,21,0.697137
2,positions,0.088935
3,part,0.086066
4,leaders,0.071722
5,legislative,0.071722
6,color,0.068853
7,family,0.065984
8,director,0.065984
9,distance,0.063115


### Address 126

Unnamed: 0,token,similarity
0,21st,0.585379
1,annual,0.585379
2,men,0.581955
3,labels,0.079876
4,professional,0.076453
5,serie,0.075312
6,’,0.074171
7,massachusetts,0.07303
8,born,0.069606
9,harvard,0.067324


### Address 127

Unnamed: 0,token,similarity
0,annual,0.712741
1,21st,0.712741
2,foot,0.072958
3,drama,0.072958
4,9,0.070152
5,marine,0.070152
6,1939,0.067346
7,field,0.06454
8,house,0.06454
9,’,0.061733


### Address 128

Unnamed: 0,token,similarity
0,reaching,0.725948
1,far,0.725948
2,2007,0.106068
3,italian,0.097803
4,public,0.095048
5,estate,0.084028
6,including,0.075763
7,exams,0.070253
8,gastropod,0.067498
9,field,0.067498


### Address 129

Unnamed: 0,token,similarity
0,life,0.70993
1,everyday,0.70993
2,clinic,0.084515
3,olympics,0.081698
4,professional,0.078881
5,hockey,0.067612
6,caribbean,0.067612
7,20,0.067612
8,approximately,0.064795
9,followers,0.061978


### Address 130

Unnamed: 0,token,similarity
0,far,0.606564
1,reaching,0.580042
2,effects,0.547753
3,clinic,0.08764
4,2007,0.085334
5,rican,0.081875
6,public,0.077262
7,including,0.074956
8,estate,0.072649
9,commissioner,0.064577


### Address 131

Unnamed: 0,token,similarity
0,health,0.695735
1,mental,0.695667
2,initial,0.080494
3,actually,0.074744
4,freetown,0.071871
5,decided,0.071868
6,running,0.068998
7,financial,0.068998
8,fernando,0.068997
9,within,0.068996


### Address 132

Unnamed: 0,token,similarity
0,task,0.706399
1,force,0.706399
2,brought,0.083522
3,wounded,0.080691
4,government,0.069366
5,wide,0.066535
6,mexican,0.060872
7,impressed,0.060872
8,certification,0.058041
9,battalions,0.058041


### Address 133

Unnamed: 0,token,similarity
0,former,0.726636
1,mayor,0.726636
2,team,0.085325
3,recommendations,0.07982
4,away,0.077067
5,addressing,0.074315
6,barriers,0.071563
7,luciano,0.071563
8,olympics,0.071563
9,april,0.06881


### Address 134

Unnamed: 0,token,similarity
0,april,0.692098
1,2020,0.692098
2,commissioner,0.091028
3,mayor,0.085248
4,breaking,0.079468
5,attacked,0.076579
6,key,0.067909
7,professional,0.067909
8,based,0.06502
9,led,0.06502


### Address 135

Unnamed: 0,token,similarity
0,health,0.710643
1,public,0.710624
2,participation,0.083024
3,south,0.074581
4,less,0.071768
5,financial,0.071768
6,professional,0.066139
7,mm,0.066139
8,16,0.066138
9,commissioner,0.063323


### Address 136

Unnamed: 0,token,similarity
0,health,0.599349
1,public,0.582677
2,issues,0.581896
3,participation,0.094223
4,chief,0.091391
5,total,0.088965
6,iihf,0.087014
7,far,0.085699
8,co,0.08341
9,board,0.067854


### Address 137

Unnamed: 0,token,similarity
0,july,0.715542
1,2021,0.715542
2,brown,0.095033
3,facto,0.081057
4,broken,0.067082
5,elongated,0.064287
6,signatures,0.061492
7,solid,0.061492
8,hampshire,0.061492
9,films,0.058697


### Address 138

Unnamed: 0,token,similarity
0,towards,0.710634
1,step,0.710634
2,commonwealth,0.108354
3,following,0.097096
4,1939,0.083025
5,website,0.083025
6,history,0.083025
7,university,0.074581
8,opinions,0.071767
9,fifa,0.071767


### Address 139

Unnamed: 0,token,similarity
0,initial,0.594445
1,step,0.589916
2,towards,0.58199
3,commonwealth,0.108699
4,1939,0.086053
5,website,0.082656
6,history,0.081524
7,following,0.078127
8,university,0.076995
9,royal,0.073598


### Address 140

Unnamed: 0,token,similarity
0,step,0.718331
1,initial,0.718331
2,luciano,0.114153
3,service,0.077958
4,history,0.077958
5,university,0.075174
6,street,0.075174
7,1939,0.07239
8,run,0.07239
9,commonwealth,0.07239


### Address 141

Unnamed: 0,token,similarity
0,key,0.697137
1,recommendations,0.697137
2,18,0.088935
3,attacked,0.086066
4,400,0.080329
5,followers,0.07746
6,businesses,0.074591
7,april,0.071722
8,mayor,0.071722
9,label,0.068853


### Address 142

Unnamed: 0,token,similarity
0,18,0.629122
1,key,0.580986
2,recommendations,0.576509
3,label,0.080599
4,years,0.07836
5,governor,0.07836
6,examinations,0.076122
7,attacked,0.076122
8,april,0.075002
9,business,0.073883


### Address 143

Unnamed: 0,token,similarity
0,8,0.68775
1,goals,0.68775
2,sierra,0.106143
3,issues,0.082879
4,running,0.079971
5,generally,0.077063
6,breaking,0.074155
7,freetown,0.071247
8,committee,0.071247
9,ethnic,0.065431


### Address 144

Unnamed: 0,token,similarity
0,community,0.710634
1,engagement,0.710634
2,february,0.091468
3,july,0.085839
4,organized,0.071767
5,served,0.071767
6,total,0.071767
7,footballer,0.068953
8,sea,0.068953
9,gastropod,0.068953


### Address 145

Unnamed: 0,token,similarity
0,full,0.713442
1,list,0.713442
2,remain,0.085501
3,return,0.082698
4,serie,0.074288
5,problems,0.074288
6,1,0.071484
7,houses,0.068681
8,state,0.068681
9,mm,0.063074


### Address 146

Unnamed: 0,token,similarity
0,list,0.700714
1,detailed,0.700714
2,return,0.0842
3,belong,0.072783
4,problems,0.067074
5,brought,0.067074
6,certified,0.06422
7,italian,0.06422
8,training,0.058512
9,79,0.058512


### Address 147

Unnamed: 0,token,similarity
0,boston,1.0
1,trustees,0.082
2,3rd,0.078
3,training,0.076
4,goals,0.074
5,london,0.072
6,society,0.068
7,little,0.068
8,cup,0.068
9,length,0.066


### Address 148

Unnamed: 0,token,similarity
0,part,0.707814
1,take,0.707814
2,royal,0.091832
3,flys,0.086181
4,three,0.083355
5,primary,0.072053
6,olympics,0.072053
7,barriers,0.066402
8,last,0.063576
9,national,0.063576


### Address 149

Unnamed: 0,token,similarity
0,free,0.695701
1,boston,0.695701
2,training,0.080494
3,goals,0.06612
4,recommendations,0.06612
5,trustees,0.06612
6,little,0.063246
7,battalion,0.063246
8,cup,0.060371
9,outer,0.060371


### Address 150

Unnamed: 0,token,similarity
0,racism,0.714843
1,addressing,0.714843
2,approximately,0.096525
3,moving,0.07694
4,new,0.071344
5,government,0.071344
6,years,0.068547
7,foot,0.065749
8,awards,0.060153
9,children,0.060153


### Address 151

Unnamed: 0,token,similarity
0,boston,0.704982
1,mayor,0.704982
2,olympics,0.103549
3,people,0.086527
4,goals,0.080853
5,april,0.072342
6,governor,0.072342
7,7,0.072342
8,commissioner,0.069505
9,president,0.066668


### Address 152

Unnamed: 0,token,similarity
0,sub,0.715542
1,committee,0.715542
2,colors,0.114598
3,drama,0.086648
4,following,0.083853
5,november,0.075467
6,6th,0.069877
7,belong,0.067082
8,player,0.067082
9,facto,0.064287


### Address 153

Unnamed: 0,token,similarity
0,acting,0.694262
1,mayor,0.694262
2,health,0.080661
3,olympics,0.0749
4,1939,0.072019
5,hospitals,0.069138
6,key,0.069138
7,high,0.066257
8,take,0.066257
9,flys,0.066257


### Address 154

Unnamed: 0,token,similarity
0,position,0.71624
1,acting,0.71624
2,government,0.085167
3,served,0.079582
4,regiment,0.079582
5,external,0.07679
6,white,0.073998
7,battalions,0.071205
8,step,0.071205
9,high,0.071205


### Address 155

Unnamed: 0,token,similarity
0,extension,0.7
1,harvard,0.7
2,ethnic,0.12
3,dorchester,0.085714
4,anna,0.068571
5,infantry,0.065714
6,community,0.065714
7,representatives,0.062857
8,olympics,0.06
9,entity,0.06


### Address 156

Unnamed: 0,token,similarity
0,top,0.70214
1,100,0.70214
2,2015,0.084029
3,links,0.08118
4,fernando,0.08118
5,actually,0.072635
6,return,0.069787
7,made,0.06409
8,1958,0.061241
9,luciano,0.061241


### Address 157

Unnamed: 0,token,similarity
0,commonwealth,0.707814
1,institute,0.707814
2,deputy,0.097483
3,houses,0.094658
4,belong,0.094658
5,equity,0.089006
6,fernandez,0.083355
7,2013,0.077704
8,towards,0.074878
9,2nd,0.074878


### Address 158

Unnamed: 0,token,similarity
0,led,0.709225
1,businesses,0.709225
2,reaching,0.094469
3,recommendations,0.088829
4,inequality,0.086009
5,training,0.083189
6,july,0.077549
7,000,0.077549
8,january,0.077549
9,2020,0.07191


### Address 159

Unnamed: 0,token,similarity
0,real,0.713442
1,estate,0.713442
2,gonzalez,0.085501
3,moderately,0.071484
4,made,0.071484
5,trustees,0.071484
6,co,0.065878
7,chamber,0.065878
8,increased,0.065878
9,within,0.065878


### Address 160

Unnamed: 0,token,similarity
0,association,0.69356
1,mba,0.693523
2,2017,0.105258
3,school,0.105257
4,football,0.096641
5,puerto,0.082189
6,square,0.073536
7,24,0.067767
8,failing,0.062001
9,equity,0.062


### Address 161

Unnamed: 0,token,similarity
0,journal,0.716938
1,business,0.716938
2,commissioner,0.108796
3,running,0.0809
4,18,0.0809
5,governor,0.072531
6,e,0.069741
7,based,0.069741
8,conical,0.066951
9,role,0.066951


### Address 162

Unnamed: 0,token,similarity
0,chamber,0.718332
1,commerce,0.718332
2,independent,0.103017
3,primary,0.094664
4,distance,0.09188
5,political,0.086311
6,inequality,0.069606
7,diverse,0.069606
8,20,0.069606
9,freetown,0.069606


### Address 163

Unnamed: 0,token,similarity
0,director,0.719722
1,executive,0.719722
2,house,0.086144
3,solid,0.075029
4,participation,0.066692
5,executives,0.061135
6,ethnic,0.061135
7,financial,0.058356
8,governor,0.058356
9,signatures,0.058356


### Address 164

Unnamed: 0,token,similarity
0,hampshire,0.692098
1,new,0.692098
2,national,0.088138
3,financial,0.076579
4,shape,0.076579
5,2013,0.070799
6,racism,0.070799
7,running,0.067909
8,decides,0.067909
9,moderately,0.06502


### Address 165

Unnamed: 0,token,similarity
0,member,0.7
1,board,0.7
2,remove,0.08
3,iihf,0.08
4,service,0.074286
5,film,0.071429
6,following,0.068571
7,goals,0.062857
8,sea,0.06
9,houses,0.057143


### Address 166

Unnamed: 0,token,similarity
0,business,0.711337
1,leaders,0.711337
2,time,0.106841
3,center,0.081537
4,running,0.078725
5,designated,0.075913
6,1958,0.067479
7,commissioner,0.064667
8,outer,0.061855
9,marine,0.061855


### Address 167

Unnamed: 0,token,similarity
0,league,0.724615
1,massachusetts,0.724523
2,green,0.100751
3,professional,0.084188
4,men,0.073149
5,soundtrack,0.067629
6,passed,0.067626
7,diverse,0.064862
8,estate,0.064861
9,problems,0.062111


### Address 168

Unnamed: 0,token,similarity
0,trustees,0.691375
1,board,0.691375
2,mexican,0.098355
3,website,0.095462
4,campaign,0.092569
5,real,0.086784
6,moderately,0.078105
7,sea,0.07232
8,boston,0.07232
9,battalions,0.066534


### Address 169

Unnamed: 0,token,similarity
0,executives,0.71624
1,chief,0.71624
2,issues,0.07679
3,run,0.073998
4,bigger,0.073998
5,mission,0.073998
6,lowest,0.071205
7,moving,0.071205
8,remain,0.062828
9,4,0.062828


### Address 170

Unnamed: 0,token,similarity
0,national,0.707107
1,international,0.707107
2,player,0.093338
3,screenings,0.084853
4,runners,0.082024
5,time,0.082024
6,infantry,0.076368
7,hampshire,0.070711
8,primary,0.070711
9,south,0.067882


### Address 171

Unnamed: 0,token,similarity
0,hockey,0.696429
1,field,0.69641
2,10th,0.104822
3,clinic,0.090462
4,attacked,0.087591
5,fernando,0.076104
6,high,0.073232
7,directed,0.07036
8,north,0.067487
9,real,0.064617


### Address 172

Unnamed: 0,token,similarity
0,team,0.602116
1,hockey,0.583655
2,field,0.57362
3,attacked,0.091222
4,north,0.086293
5,high,0.07974
6,far,0.076127
7,real,0.073965
8,fernando,0.072004
9,10th,0.070446


### Address 173

Unnamed: 0,token,similarity
0,national,0.716938
1,team,0.716938
2,2017,0.097637
3,uk,0.083689
4,actually,0.0809
5,former,0.072531
6,ukrainian,0.069741
7,votes,0.064162
8,screenings,0.061372
9,member,0.061372


### Address 174

Unnamed: 0,token,similarity
0,–,0.707107
1,2011,0.707107
2,current,0.082024
3,part,0.076368
4,record,0.076368
5,chamber,0.073539
6,battalions,0.073539
7,november,0.073539
8,annual,0.067882
9,remove,0.065054


### Address 175

Unnamed: 0,token,similarity
0,olympics,0.72111
1,summer,0.72111
2,recommendations,0.108167
3,central,0.077658
4,chamber,0.074885
5,primary,0.072111
6,team,0.066564
7,mayor,0.066564
8,side,0.066564
9,royal,0.066564


### Address 176

Unnamed: 0,token,similarity
0,caribbean,0.673795
1,central,0.673795
2,housing,0.100921
3,olympics,0.092016
4,public,0.083111
5,problems,0.080143
6,gastropod,0.077175
7,detailed,0.074207
8,side,0.071238
9,april,0.06827


### Address 177

Unnamed: 0,token,similarity
0,political,0.71136
1,party,0.711313
2,ethnic,0.084346
3,2013,0.081538
4,olympics,0.081536
5,outer,0.073104
6,elections,0.070289
7,brown,0.067479
8,leone,0.067478
9,13,0.064668


### Address 178

Unnamed: 0,token,similarity
0,party,0.521409
1,puerto,0.514233
2,political,0.503608
3,rican,0.484607
4,links,0.085016
5,outer,0.084973
6,law,0.063258
7,external,0.061302
8,association,0.061296
9,goals,0.061293


### Address 179

Unnamed: 0,token,similarity
0,party,0.609213
1,rican,0.573513
2,political,0.553937
3,outer,0.092131
4,olympics,0.084069
5,since,0.078311
6,links,0.074856
7,brown,0.067947
8,ethnic,0.067946
9,caribbean,0.059885


### Address 180

Unnamed: 0,token,similarity
0,electoral,0.714867
1,state,0.714819
2,football,0.085335
3,get,0.079739
4,november,0.079736
5,released,0.076937
6,–,0.06855
7,commissioner,0.06855
8,would,0.068548
9,soundtrack,0.068547


### Address 181

Unnamed: 0,token,similarity
0,commission,0.697137
1,electoral,0.697137
2,neutral,0.109017
3,issues,0.088935
4,housing,0.088935
5,footballer,0.080329
6,–,0.07746
7,commissioner,0.07746
8,torres,0.074591
9,competed,0.074591


### Address 182

Unnamed: 0,token,similarity
0,2007,0.712039
1,may,0.712039
2,regiment,0.088478
3,kingdom,0.085669
4,2013,0.074434
5,various,0.068816
6,caribbean,0.068816
7,foot,0.063199
8,rican,0.063199
9,champions,0.06039


### Address 183

Unnamed: 0,token,similarity
0,time,0.705691
1,first,0.705691
2,certified,0.093525
3,leaders,0.085023
4,men,0.079355
5,mental,0.076521
6,within,0.070853
7,outer,0.068018
8,national,0.059516
9,partition,0.056682


### Address 184

Unnamed: 0,token,similarity
0,electoral,0.700735
1,signatures,0.700693
2,india,0.095616
3,football,0.081347
4,issues,0.072786
5,intact,0.069928
6,born,0.069928
7,neutral,0.067079
8,care,0.064222
9,australia,0.064222


### Address 185

Unnamed: 0,token,similarity
0,2007,0.70993
1,april,0.70993
2,key,0.087333
3,university,0.073247
4,9,0.07043
5,attacked,0.067612
6,21,0.067612
7,reaching,0.064795
8,commissioner,0.064795
9,based,0.061978


### Address 186

Unnamed: 0,token,similarity
0,electoral,0.585133
1,state,0.570339
2,commission,0.562413
3,belong,0.103047
4,neutral,0.090684
5,get,0.084617
6,–,0.082938
7,housing,0.08214
8,commissioner,0.07953
9,6th,0.077294


### Address 187

Unnamed: 0,token,similarity
0,founder,0.714143
1,co,0.714143
2,13,0.084017
3,followers,0.075615
4,institute,0.072815
5,torres,0.072815
6,officials,0.067213
7,battalion,0.067213
8,parties,0.067213
9,screenings,0.067213


### Address 188

Unnamed: 0,token,similarity
0,party,0.708525
1,founder,0.708515
2,parties,0.084684
3,flys,0.081861
4,13,0.079038
5,brown,0.07057
6,elections,0.067747
7,equity,0.064924
8,2020,0.064924
9,get,0.062101


### Address 189

Unnamed: 0,token,similarity
0,may,0.706399
1,9,0.706399
2,second,0.106172
3,remove,0.086353
4,votes,0.086353
5,director,0.083522
6,detailed,0.083522
7,diverse,0.080691
8,registered,0.075028
9,ethnic,0.072197


### Address 190

Unnamed: 0,token,similarity
0,gave,0.588511
1,commission,0.580819
2,electoral,0.569513
3,champions,0.09068
4,neutral,0.089306
5,issues,0.077586
6,internal,0.073365
7,belong,0.07259
8,torres,0.070506
9,get,0.070151


### Address 191

Unnamed: 0,token,similarity
0,parties,0.705
1,registered,0.704964
2,following,0.109222
3,may,0.080852
4,economic,0.069504
5,key,0.066666
6,elected,0.063833
7,fernando,0.063831
8,originally,0.063831
9,name,0.058157


### Address 192

Unnamed: 0,token,similarity
0,three,0.712741
1,two,0.712741
2,label,0.081376
3,examinations,0.07857
4,5th,0.075764
5,gave,0.067346
6,goals,0.067346
7,executives,0.061733
8,anna,0.061733
9,executive,0.061733


### Address 193

Unnamed: 0,token,similarity
0,parties,0.706399
1,political,0.706399
2,designated,0.07786
3,followers,0.075028
4,outer,0.075028
5,led,0.066535
6,puerto,0.066535
7,elected,0.060872
8,need,0.058041
9,player,0.058041


### Address 194

Unnamed: 0,token,similarity
0,registered,0.68775
1,currently,0.68775
2,get,0.082879
3,1,0.071247
4,awards,0.068339
5,outer,0.065431
6,battalions,0.065431
7,running,0.062523
8,somewhat,0.059615
9,current,0.059615


### Address 195

Unnamed: 0,token,similarity
0,political,0.515581
1,parties,0.499399
2,registered,0.483483
3,currently,0.475363
4,outer,0.099309
5,following,0.070932
6,8,0.07091
7,facto,0.068895
8,acting,0.060789
9,association,0.056743


### Address 196

Unnamed: 0,token,similarity
0,puerto,0.70145
1,rican,0.701405
2,businesses,0.088393
3,links,0.088392
4,vision,0.074135
5,external,0.074134
6,association,0.068435
7,diverse,0.068431
8,goals,0.065582
9,youth,0.065582


### Address 197

Unnamed: 0,token,similarity
0,commission,0.570327
1,decides,0.554007
2,electoral,0.553743
3,neutral,0.118098
4,competed,0.088223
5,commissioner,0.083507
6,torres,0.0823
7,issues,0.070564
8,indian,0.069052
9,base,0.06653


### Address 198

Unnamed: 0,token,similarity
0,commissioner,0.729383
1,resident,0.729383
2,founder,0.082261
3,april,0.082261
4,officials,0.079519
5,100,0.074035
6,hockey,0.065809
7,intact,0.063067
8,health,0.060325
9,total,0.060325


### Address 199

Unnamed: 0,token,similarity
0,including,0.716938
1,governor,0.716938
2,puerto,0.07532
3,indian,0.072531
4,equity,0.072531
5,luciano,0.064162
6,2013,0.064162
7,recommendations,0.061372
8,reaching,0.058582
9,somewhat,0.058582


### Address 200

Unnamed: 0,token,similarity
0,legislative,0.697854
1,houses,0.697854
2,ukrainian,0.090277
3,green,0.081679
4,background,0.075947
5,commonwealth,0.073081
6,colors,0.067349
7,league,0.067349
8,1,0.064483
9,serie,0.061617


### Address 201

Unnamed: 0,token,similarity
0,house,0.714143
1,representatives,0.714143
2,5,0.106421
3,key,0.070014
4,premier,0.064413
5,6th,0.064413
6,group,0.061612
7,whorls,0.061612
8,campaign,0.058812
9,internal,0.058812


### Address 202

Unnamed: 0,token,similarity
0,positions,0.69857
1,elected,0.69857
2,maria,0.077301
3,distance,0.074438
4,commonwealth,0.071575
5,–,0.071575
6,equity,0.068712
7,iihf,0.065849
8,square,0.065849
9,status,0.062986


### Address 203

Unnamed: 0,token,similarity
0,candidate,0.721803
1,votes,0.721803
2,world,0.092823
3,awards,0.087281
4,bigger,0.078969
5,engagement,0.078969
6,belong,0.073427
7,ukrainian,0.073427
8,2012,0.073427
9,fernando,0.073427


### Address 204

Unnamed: 0,token,similarity
0,elections,0.703562
1,2008,0.703562
2,000,0.089544
3,board,0.078174
4,championship,0.072488
5,wide,0.061118
6,record,0.061118
7,sub,0.061118
8,take,0.058275
9,screenings,0.058275


### Address 205

Unnamed: 0,token,similarity
0,next,0.697854
1,elections,0.697854
2,party,0.081679
3,passed,0.075947
4,inequality,0.073081
5,within,0.064483
6,19,0.064483
7,18,0.061617
8,ceo,0.058752
9,struggles,0.058752


### Address 206

Unnamed: 0,token,similarity
0,electoral,0.709934
1,law,0.709926
2,commissioner,0.084516
3,’,0.078881
4,football,0.076064
5,party,0.067612
6,19,0.064795
7,black,0.064795
8,titles,0.064795
9,snail,0.064795


### Address 207

Unnamed: 0,token,similarity
0,law,0.58501
1,since,0.580422
2,electoral,0.578128
3,ii,0.111267
4,party,0.095208
5,based,0.083737
6,issues,0.078001
7,19,0.076854
8,titles,0.076854
9,dorchester,0.073413


### Address 208

Unnamed: 0,token,similarity
0,party,0.707113
1,votes,0.7071
2,2013,0.101824
3,elections,0.098995
4,mba,0.096167
5,may,0.093338
6,entity,0.084853
7,brown,0.07354
8,external,0.070711
9,fernando,0.067882


### Address 209

Unnamed: 0,token,similarity
0,1946,0.715542
1,since,0.715542
2,ii,0.100623
3,designated,0.089443
4,professional,0.086648
5,party,0.078262
6,fernando,0.078262
7,snail,0.075467
8,gastropod,0.072672
9,england,0.072672


### Address 210

Unnamed: 0,token,similarity
0,1946,0.587285
1,since,0.587285
2,running,0.575859
3,ii,0.094834
4,executives,0.083408
5,business,0.082266
6,distance,0.077695
7,3rd,0.076553
8,professional,0.071982
9,gastropod,0.07084


### Address 211

Unnamed: 0,token,similarity
0,running,0.70852
1,since,0.70852
2,ii,0.098798
3,remove,0.081861
4,business,0.07057
5,titles,0.067747
6,health,0.067747
7,increased,0.064924
8,–,0.064924
9,outer,0.062101


### Address 212

Unnamed: 0,token,similarity
0,candidates,0.717635
1,party,0.717635
2,2013,0.10451
3,mba,0.079428
4,since,0.079428
5,brown,0.073854
6,elections,0.071067
7,world,0.071067
8,economic,0.071067
9,released,0.06828


### Address 213

Unnamed: 0,token,similarity
0,party,0.88426
1,candidates,0.493313
2,2013,0.110385
3,elections,0.085541
4,since,0.081474
5,brown,0.078543
6,mba,0.076638
7,links,0.071324
8,parties,0.068649
9,world,0.068613


### Address 214

Unnamed: 0,token,similarity
0,away,0.689202
1,breaking,0.689202
2,state,0.100116
3,mayor,0.097214
4,designated,0.082704
5,6th,0.082704
6,garrison,0.079802
7,vision,0.073999
8,war,0.073999
9,battalions,0.071097


### Address 215

Unnamed: 0,token,similarity
0,officials,0.711337
1,party,0.711337
2,1946,0.078725
3,get,0.075913
4,law,0.073102
5,internal,0.07029
6,may,0.067479
7,elections,0.064667
8,july,0.064667
9,background,0.064667


### Address 216

Unnamed: 0,token,similarity
0,struggles,0.730753
1,internal,0.730753
2,officials,0.093055
3,return,0.082107
4,three,0.076633
5,based,0.071159
6,competed,0.071159
7,2021,0.068423
8,training,0.068423
9,running,0.065686


### Address 217

Unnamed: 0,token,similarity
0,electoral,0.723194
1,party,0.723181
2,2013,0.112004
3,mba,0.095411
4,get,0.076053
5,approximately,0.073287
6,housing,0.064991
7,journal,0.06499
8,neutral,0.062226
9,football,0.062225


### Address 218

Unnamed: 0,token,similarity
0,entity,0.724569
1,independent,0.724569
2,length,0.117311
3,competed,0.084188
4,2020,0.081428
5,generally,0.081428
6,impressed,0.073147
7,chamber,0.070387
8,led,0.067626
9,79,0.064866


### Address 219

Unnamed: 0,token,similarity
0,commission,0.58645
1,certification,0.573523
2,electoral,0.541791
3,neutral,0.083443
4,australia,0.081092
5,torres,0.078742
6,background,0.076391
7,footballer,0.070515
8,issues,0.066989
9,elongated,0.063464


### Address 220

Unnamed: 0,token,similarity
0,2012,0.710634
1,elections,0.710634
2,candidate,0.091468
3,inequality,0.085839
4,18,0.085839
5,system,0.083025
6,party,0.063324
7,ended,0.063324
8,impressed,0.063324
9,need,0.060509


### Address 221

Unnamed: 0,token,similarity
0,run,0.709935
1,governor,0.709925
2,colors,0.070429
3,recommendations,0.067613
4,olympics,0.067613
5,executive,0.067613
6,iihf,0.067612
7,18,0.064795
8,made,0.064795
9,parties,0.064795


### Address 222

Unnamed: 0,token,similarity
0,would,0.70993
1,run,0.70993
2,conical,0.09015
3,freetown,0.078881
4,senior,0.078881
5,fernandez,0.073247
6,housing,0.073247
7,player,0.07043
8,university,0.07043
9,football,0.07043


### Address 223

Unnamed: 0,token,similarity
0,campaign,0.719027
1,attacked,0.719027
2,measures,0.0904
3,towards,0.084837
4,co,0.084837
5,representatives,0.079274
6,deputy,0.076492
7,labels,0.076492
8,april,0.073711
9,failing,0.070929


### Address 224

Unnamed: 0,token,similarity
0,government,0.714843
1,current,0.714843
2,5,0.085333
3,position,0.085333
4,drama,0.07694
5,ii,0.074142
6,white,0.071344
7,currently,0.068547
8,one,0.068547
9,18,0.065749


### Address 225

Unnamed: 0,token,similarity
0,inequality,0.706399
1,social,0.706399
2,businesses,0.080691
3,film,0.07786
4,serie,0.075028
5,directed,0.075028
6,mayor,0.072197
7,initial,0.066535
8,16,0.063703
9,history,0.063703


### Address 226

Unnamed: 0,token,similarity
0,de,0.704982
1,facto,0.704982
2,club,0.080853
3,white,0.072342
4,side,0.069505
5,range,0.069505
6,political,0.069505
7,runners,0.066668
8,committee,0.066668
9,2012,0.066668


### Address 227

Unnamed: 0,token,similarity
0,certified,0.707107
1,party,0.707107
2,elections,0.084853
3,executives,0.070711
4,drama,0.067882
5,get,0.065054
6,broken,0.059397
7,mayor,0.059397
8,brown,0.059397
9,board,0.059397


### Address 228

Unnamed: 0,token,similarity
0,get,0.70993
1,failing,0.70993
2,goals,0.092967
3,world,0.073247
4,full,0.07043
5,1958,0.064795
6,foot,0.064795
7,january,0.064795
8,state,0.064795
9,time,0.059161


### Address 229

Unnamed: 0,token,similarity
0,parties,0.711358
1,green,0.711315
2,united,0.075911
3,young,0.0731
4,ice,0.073099
5,legislative,0.07029
6,followers,0.06467
7,white,0.064665
8,list,0.061856
9,party,0.059045


### Address 230

Unnamed: 0,token,similarity
0,organized,0.687023
1,originally,0.687023
2,made,0.096067
3,engagement,0.093156
4,legislative,0.069867
5,campaign,0.066956
6,australia,0.066956
7,management,0.064044
8,failing,0.064044
9,club,0.055311


### Address 231

Unnamed: 0,token,similarity
0,puerto,0.701428
1,rico,0.701426
2,association,0.08554
3,businesses,0.074135
4,kingdom,0.074135
5,governor,0.071283
6,flys,0.068432
7,label,0.065581
8,outer,0.062729
9,cup,0.062729


### Address 232

Unnamed: 0,token,similarity
0,participation,0.684836
1,citizen,0.684836
2,championship,0.080311
3,role,0.077391
4,titles,0.07447
5,gastropod,0.07447
6,public,0.07447
7,royal,0.07447
8,massachusetts,0.06863
9,side,0.06863


### Address 233

Unnamed: 0,token,similarity
0,economic,0.719027
1,issues,0.719027
2,uk,0.0904
3,4th,0.079274
4,central,0.079274
5,england,0.076492
6,8,0.076492
7,black,0.076492
8,based,0.073711
9,july,0.070929


### Address 234

Unnamed: 0,token,similarity
0,political,0.712751
1,status,0.712731
2,chamber,0.095405
3,outer,0.075765
4,puerto,0.075765
5,ethnic,0.070151
6,elected,0.067346
7,boston,0.064539
8,issues,0.058928
9,length,0.058927


### Address 235

Unnamed: 0,token,similarity
0,neutral,0.690652
1,one,0.690652
2,italian,0.09701
3,clinic,0.091218
4,commission,0.073843
5,olympics,0.065156
6,commissioner,0.06226
7,boston,0.06226
8,team,0.06226
9,building,0.059364


### Address 236

Unnamed: 0,token,similarity
0,neutral,0.715542
1,position,0.715542
2,fifa,0.086648
3,–,0.078262
4,electoral,0.072672
5,reaching,0.067082
6,21,0.067082
7,international,0.064287
8,mayor,0.064287
9,step,0.064287


### Address 237

Unnamed: 0,token,similarity
0,actually,0.68775
1,people,0.68775
2,moving,0.094511
3,breaking,0.079971
4,labels,0.077063
5,management,0.077063
6,citizen,0.071247
7,health,0.068339
8,species,0.068339
9,white,0.068339


### Address 238

Unnamed: 0,token,similarity
0,opinions,0.713442
1,diverse,0.713442
2,served,0.088304
3,base,0.085501
4,e,0.077091
5,broken,0.074288
6,management,0.071484
7,indian,0.071484
8,remove,0.071484
9,films,0.068681


### Address 239

Unnamed: 0,token,similarity
0,party,0.704982
1,followers,0.704982
2,parties,0.106386
3,executives,0.066668
4,signatures,0.063831
5,society,0.060994
6,get,0.060994
7,2020,0.060994
8,hampshire,0.060994
9,governor,0.058157


### Address 240

Unnamed: 0,token,similarity
0,daily,0.692098
1,life,0.692098
2,league,0.073689
3,runners,0.073689
4,business,0.070799
5,partition,0.070799
6,measures,0.067909
7,2011,0.06502
8,examinations,0.06213
9,including,0.06213


### Address 241

Unnamed: 0,token,similarity
0,bigger,0.724569
1,problems,0.724569
2,breaking,0.086948
3,battalion,0.070387
4,garrison,0.067626
5,annual,0.067626
6,external,0.067626
7,lowest,0.062106
8,league,0.062106
9,freetown,0.062106


### Address 242

Unnamed: 0,token,similarity
0,political,0.714157
1,organizations,0.714128
2,freetown,0.067214
3,facto,0.067214
4,outer,0.064415
5,issues,0.064413
6,committee,0.061613
7,chamber,0.061612
8,executives,0.05881
9,soundtrack,0.05881


### Address 243

Unnamed: 0,token,similarity
0,16,0.70852
1,january,0.70852
2,ukrainian,0.087506
3,equity,0.07057
4,within,0.067747
5,candidate,0.064924
6,failing,0.062101
7,bigger,0.062101
8,8,0.059279
9,mollusk,0.059279


### Address 244

Unnamed: 0,token,similarity
0,ice,0.697137
1,hockey,0.697137
2,mental,0.086066
3,color,0.086066
4,life,0.083197
5,general,0.074591
6,various,0.071722
7,career,0.071722
8,iraq,0.068853
9,south,0.065984


### Address 245

Unnamed: 0,token,similarity
0,player,0.578131
1,hockey,0.576841
2,ice,0.567704
3,color,0.086219
4,time,0.08337
5,mental,0.080508
6,actually,0.071721
7,life,0.063318
8,high,0.060582
9,administration,0.058592


### Address 246

Unnamed: 0,token,similarity
0,hockey,0.707108
1,professional,0.707106
2,men,0.130108
3,2020,0.079196
4,life,0.076368
5,everyday,0.070711
6,massachusetts,0.070711
7,freetown,0.067882
8,base,0.067881
9,ethnic,0.067881


### Address 247

Unnamed: 0,token,similarity
0,world,0.681186
1,championship,0.681165
2,career,0.093955
3,candidate,0.091021
4,government,0.079276
5,one,0.073403
6,battalion,0.070467
7,effects,0.070466
8,economic,0.067531
9,13,0.06753


### Address 248

Unnamed: 0,token,similarity
0,hockey,0.723884
1,team,0.723873
2,actually,0.093938
3,mexican,0.074598
4,iraq,0.071836
5,served,0.071835
6,former,0.071835
7,2017,0.071834
8,real,0.069073
9,life,0.06631


### Address 249

Unnamed: 0,token,similarity
0,hockey,0.593309
1,team,0.584479
2,ice,0.547295
3,actually,0.095915
4,life,0.080165
5,color,0.073291
6,iraq,0.071976
7,high,0.070744
8,caribbean,0.070744
9,october,0.068416


### Address 250

Unnamed: 0,token,similarity
0,competed,0.704982
1,iihf,0.704982
2,commission,0.117733
3,need,0.106386
4,film,0.097875
5,chief,0.086527
6,north,0.086527
7,000,0.069505
8,run,0.066668
9,belong,0.066668


### Address 251

Unnamed: 0,token,similarity
0,iihf,0.617544
1,2013,0.600516
2,competed,0.543757
3,need,0.08741
4,000,0.082869
5,commission,0.077193
6,film,0.074923
7,hospitals,0.068111
8,positions,0.068111
9,13,0.066976


### Address 252

Unnamed: 0,token,similarity
0,iihf,0.739595
1,2013,0.739595
2,mba,0.082478
3,public,0.082478
4,wounded,0.079773
5,positions,0.077069
6,000,0.074365
7,party,0.074365
8,ethnic,0.074365
9,issues,0.074365


### Address 253

Unnamed: 0,token,similarity
0,–,0.689202
1,2015,0.689202
2,electoral,0.09141
3,part,0.082704
4,barriers,0.068195
5,followers,0.068195
6,sierra,0.062391
7,men,0.062391
8,position,0.059489
9,base,0.059489


### Address 254

Unnamed: 0,token,similarity
0,external,0.601664
1,links,0.56861
2,references,0.554712
3,top,0.090149
4,party,0.088274
5,states,0.087811
6,distance,0.086671
7,puerto,0.082517
8,5th,0.079659
9,footballer,0.076471


### Address 255

Unnamed: 0,token,similarity
0,footballer,0.705698
1,professional,0.705685
2,community,0.116198
3,league,0.09636
4,men,0.096359
5,commission,0.082189
6,gastropod,0.082189
7,since,0.082189
8,mission,0.079354
9,founded,0.076521


### Address 256

Unnamed: 0,token,similarity
0,footballer,0.717638
1,ukrainian,0.717632
2,legislative,0.093362
3,league,0.090576
4,external,0.07664
5,community,0.073854
6,state,0.071066
7,official,0.06828
8,passed,0.06828
9,housing,0.06828


### Address 257

Unnamed: 0,token,similarity
0,january,0.604702
1,21,0.589925
2,born,0.564919
3,impressed,0.097753
4,positions,0.081839
5,color,0.081839
6,8,0.081839
7,outer,0.077293
8,’,0.071609
9,active,0.065926


### Address 258

Unnamed: 0,token,similarity
0,21,0.701427
1,born,0.701427
2,positions,0.08554
3,impressed,0.082689
4,color,0.079837
5,ii,0.079837
6,labels,0.071283
7,england,0.068432
8,8,0.068432
9,mexican,0.065581


### Address 259

Unnamed: 0,token,similarity
0,team,0.706399
1,youth,0.706399
2,olympics,0.089185
3,street,0.086353
4,puerto,0.083522
5,2017,0.075028
6,full,0.066535
7,freetown,0.058041
8,former,0.058041
9,army,0.058041


### Address 260

Unnamed: 0,token,similarity
0,team,0.573502
1,youth,0.572331
2,system,0.562968
3,elongated,0.079588
4,family,0.074906
5,2017,0.073736
6,diverse,0.073736
7,manager,0.064373
8,mission,0.064373
9,executives,0.063202


### Address 261

Unnamed: 0,token,similarity
0,career,0.712741
1,club,0.712741
2,de,0.098212
3,20,0.095406
4,remove,0.072958
5,economic,0.070152
6,24,0.067346
7,india,0.067346
8,gonzalez,0.067346
9,kingdom,0.06454


### Address 262

Unnamed: 0,token,similarity
0,premier,0.713442
1,league,0.713442
2,originally,0.088304
3,houses,0.077091
4,active,0.074288
5,life,0.074288
6,conical,0.074288
7,problems,0.068681
8,association,0.065878
9,mayor,0.065878


### Address 263

Unnamed: 0,token,similarity
0,debut,0.711337
1,made,0.711337
2,rico,0.081537
3,chief,0.081537
4,struggles,0.07029
5,saw,0.07029
6,governor,0.07029
7,brought,0.067479
8,status,0.067479
9,000,0.059044


### Address 264

Unnamed: 0,token,similarity
0,october,0.697137
1,1,0.697137
2,currently,0.083197
3,army,0.083197
4,last,0.080329
5,black,0.07746
6,label,0.07746
7,united,0.074591
8,colors,0.071722
9,active,0.068853


### Address 265

Unnamed: 0,token,similarity
0,team,0.718331
1,fifa,0.718331
2,uk,0.094664
3,actually,0.094664
4,real,0.07239
5,2008,0.07239
6,position,0.069606
7,white,0.069606
8,attacked,0.064037
9,side,0.064037


### Address 266

Unnamed: 0,token,similarity
0,football,0.690657
1,team,0.690647
2,2017,0.108593
3,former,0.091218
4,iraq,0.076739
5,puerto,0.070948
6,solid,0.068052
7,marine,0.065156
8,north,0.065156
9,79,0.06226


### Address 267

Unnamed: 0,token,similarity
0,world,0.710655
1,cup,0.710612
2,candidate,0.088657
3,businesses,0.074582
4,000,0.071766
5,part,0.066136
6,october,0.063325
7,elections,0.063322
8,1st,0.060511
9,party,0.060508


### Address 268

Unnamed: 0,token,similarity
0,career,0.704982
1,international,0.704982
2,time,0.092201
3,side,0.089364
4,1st,0.08369
5,broken,0.080853
6,fifa,0.075179
7,approximately,0.075179
8,years,0.066668
9,gonzalez,0.066668


### Address 269

Unnamed: 0,token,similarity
0,first,0.718331
1,second,0.718331
2,moving,0.09188
3,debut,0.083527
4,may,0.083527
5,young,0.083527
6,conical,0.07239
7,5th,0.069606
8,soundtrack,0.066822
9,clinic,0.066822


### Address 270

Unnamed: 0,token,similarity
0,indian,0.723879
1,army,0.723879
2,diverse,0.107753
3,1,0.091176
4,based,0.074598
5,second,0.071835
6,people,0.069072
7,north,0.069072
8,senior,0.069072
9,common,0.069072


### Address 271

Unnamed: 0,token,similarity
0,army,0.623152
1,indian,0.588093
2,british,0.556972
3,diverse,0.097167
4,senior,0.088047
5,common,0.083664
6,people,0.077091
7,recommendations,0.07678
8,19,0.076676
9,1,0.074633


### Address 272

Unnamed: 0,token,similarity
0,royal,0.710634
1,18th,0.710634
2,kingdom,0.085839
3,generally,0.08021
4,initial,0.074581
5,participation,0.074581
6,young,0.071767
7,leone,0.066138
8,free,0.060509
9,mexican,0.060509


### Address 273

Unnamed: 0,token,similarity
0,moving,0.694262
1,away,0.694262
2,first,0.100826
3,role,0.089303
4,former,0.086423
5,6th,0.080661
6,since,0.080661
7,state,0.07778
8,green,0.07778
9,may,0.0749


### Address 274

Unnamed: 0,token,similarity
0,indian,0.70287
1,government,0.702833
2,second,0.085368
3,diverse,0.079679
4,references,0.079676
5,common,0.076831
6,campaign,0.073985
7,film,0.071139
8,new,0.068292
9,recommendations,0.065449


### Address 275

Unnamed: 0,token,similarity
0,indian,0.576635
1,decided,0.567256
2,government,0.562568
3,film,0.094933
4,competed,0.075009
5,detailed,0.071493
6,united,0.069149
7,decides,0.067977
8,common,0.065633
9,register,0.065633


### Address 276

Unnamed: 0,token,similarity
0,remain,0.711337
1,intact,0.711337
2,stripes,0.129334
3,government,0.092783
4,financial,0.089971
5,position,0.075913
6,moderately,0.073102
7,–,0.073102
8,links,0.064667
9,square,0.061855


### Address 277

Unnamed: 0,token,similarity
0,indian,0.719062
1,infantry,0.718992
2,based,0.090403
3,current,0.082056
4,second,0.073714
5,2nd,0.073711
6,rico,0.06815
7,massachusetts,0.065367
8,winners,0.065366
9,recommendations,0.062587


### Address 278

Unnamed: 0,token,similarity
0,infantry,0.59455
1,indian,0.584246
2,regiment,0.567056
3,current,0.085918
4,second,0.083627
5,5th,0.079044
6,film,0.074462
7,ended,0.072171
8,league,0.068735
9,titles,0.066443


### Address 279

Unnamed: 0,token,similarity
0,battalion,0.694272
1,4th,0.694252
2,professional,0.095065
3,problems,0.089304
4,hospitals,0.086423
5,mm,0.083542
6,official,0.069139
7,record,0.069139
8,participation,0.066257
9,label,0.063377


### Address 280

Unnamed: 0,token,similarity
0,battalion,0.712766
1,training,0.712717
2,increased,0.098211
3,alumni,0.084181
4,primary,0.08418
5,boston,0.081375
6,problems,0.072961
7,recommendations,0.072956
8,19,0.067348
9,institute,0.067346


### Address 281

Unnamed: 0,token,similarity
0,battalions,0.722501
1,active,0.72249
2,acting,0.069205
3,links,0.069204
4,organizations,0.066436
5,within,0.063668
6,designated,0.0609
7,society,0.058132
8,care,0.058131
9,daily,0.052596


### Address 282

Unnamed: 0,token,similarity
0,india,0.704982
1,partition,0.704982
2,sierra,0.078016
3,daily,0.078016
4,winners,0.078016
5,remain,0.075179
6,school,0.072342
7,2021,0.072342
8,little,0.069505
9,april,0.069505


### Address 283

Unnamed: 0,token,similarity
0,indian,0.608443
1,army,0.597373
2,new,0.564898
3,diverse,0.094457
4,based,0.081487
5,1,0.080777
6,north,0.078034
7,task,0.077836
8,rico,0.073408
9,financial,0.062322


### Address 284

Unnamed: 0,token,similarity
0,indian,0.714164
1,new,0.714122
2,diverse,0.08122
3,based,0.078418
4,management,0.078415
5,4th,0.078414
6,rico,0.067215
7,massachusetts,0.067214
8,would,0.064411
9,task,0.061614


### Address 285

Unnamed: 0,token,similarity
0,war,0.720429
1,1939,0.720404
2,fifa,0.081899
3,saw,0.079121
4,8,0.073569
5,step,0.073569
6,following,0.073569
7,13,0.070792
8,common,0.070791
9,mayor,0.070789


### Address 286

Unnamed: 0,token,similarity
0,world,0.70214
1,war,0.70214
2,away,0.103968
3,candidate,0.101119
4,debut,0.089726
5,fifa,0.08118
6,candidates,0.075484
7,failing,0.075484
8,government,0.072635
9,facility,0.069787


### Address 287

Unnamed: 0,token,similarity
0,ii,0.579293
1,world,0.572328
2,war,0.571167
3,government,0.098677
4,october,0.092873
5,candidate,0.085907
6,away,0.084746
7,fifa,0.082425
8,failing,0.07662
9,11,0.069655


### Address 288

Unnamed: 0,token,similarity
0,world,0.707107
1,ii,0.707107
2,government,0.113137
3,candidate,0.093338
4,october,0.093338
5,fitness,0.082024
6,key,0.076368
7,gastropod,0.076368
8,failing,0.073539
9,since,0.067882


### Address 289

Unnamed: 0,token,similarity
0,10th,0.687023
1,designated,0.687023
2,field,0.1048
3,states,0.081511
4,issues,0.072778
5,made,0.072778
6,north,0.072778
7,films,0.069867
8,–,0.066956
9,business,0.064044


### Address 290

Unnamed: 0,token,similarity
0,battalion,0.566584
1,10th,0.562979
2,designated,0.53657
3,return,0.087628
4,field,0.081626
5,’,0.078025
6,everyday,0.072023
7,league,0.070823
8,problems,0.064821
9,measures,0.064821


### Address 291

Unnamed: 0,token,similarity
0,war,0.700714
1,iraq,0.700714
2,italian,0.092763
3,debut,0.092763
4,moderately,0.081346
5,engagement,0.078491
6,sierra,0.078491
7,background,0.078491
8,candidates,0.072783
9,campaign,0.072783


### Address 292

Unnamed: 0,token,similarity
0,service,0.679706
1,active,0.679706
2,labels,0.088273
3,alumni,0.085331
4,director,0.082389
5,ii,0.073561
6,gonzalez,0.070619
7,second,0.067676
8,19,0.067676
9,base,0.067676


### Address 293

Unnamed: 0,token,similarity
0,saw,0.581713
1,service,0.560168
2,active,0.529047
3,director,0.081392
4,intact,0.078998
5,19,0.077801
6,base,0.070619
7,labels,0.069422
8,born,0.068226
9,links,0.065832


### Address 294

Unnamed: 0,token,similarity
0,active,0.69282
1,saw,0.69282
2,intact,0.092376
3,1,0.072169
4,stripes,0.069282
5,italian,0.066395
6,care,0.063509
7,october,0.060622
8,1946,0.060622
9,director,0.060622


### Address 295

Unnamed: 0,token,similarity
0,2nd,0.593158
1,battalions,0.592193
2,5th,0.585484
3,north,0.09721
4,executives,0.084607
5,daily,0.084607
6,indian,0.084472
7,colors,0.062276
8,brown,0.062243
9,background,0.061023


### Address 296

Unnamed: 0,token,similarity
0,battalions,0.719723
1,2nd,0.719721
2,daily,0.080587
3,executives,0.077808
4,north,0.07503
5,institute,0.072251
6,fernandez,0.069471
7,indian,0.066694
8,whorls,0.061136
9,group,0.058356


### Address 297

Unnamed: 0,token,similarity
0,battalions,0.70993
1,5th,0.70993
2,north,0.087333
3,two,0.076064
4,references,0.07043
5,board,0.067612
6,society,0.067612
7,colors,0.064795
8,21st,0.064795
9,brown,0.061978


### Address 298

Unnamed: 0,token,similarity
0,wounded,0.719027
1,400,0.719027
2,massachusetts,0.0904
3,1st,0.076492
4,record,0.076492
5,championships,0.076492
6,general,0.073711
7,elongated,0.073711
8,2017,0.068148
9,australia,0.065366


### Address 299

Unnamed: 0,token,similarity
0,approximately,0.702851
1,1,0.702851
2,legislative,0.093903
3,within,0.088212
4,wide,0.079675
5,register,0.07683
6,belong,0.073984
7,premier,0.073984
8,runners,0.071139
9,racism,0.071139


### Address 300

Unnamed: 0,token,similarity
0,wounded,0.53197
1,approximately,0.50027
2,400,0.50027
3,1,0.486401
4,runners,0.084204
5,elongated,0.07826
6,wide,0.074297
7,register,0.068354
8,legislative,0.068354
9,within,0.068354


### Address 301

Unnamed: 0,token,similarity
0,wounded,0.712741
1,war,0.712741
2,common,0.0926
3,1st,0.084182
4,cup,0.075764
5,away,0.070152
6,2017,0.061733
7,white,0.061733
8,record,0.061733
9,harvard,0.061733


### Address 302

Unnamed: 0,token,similarity
0,battalions,0.702851
1,1st,0.702851
2,links,0.085367
3,born,0.073984
4,legislative,0.073984
5,colors,0.062602
6,leone,0.062602
7,next,0.059757
8,de,0.056911
9,field,0.054065


### Address 303

Unnamed: 0,token,similarity
0,3rd,0.709225
1,battalions,0.709225
2,social,0.083189
3,2011,0.080369
4,8,0.077549
5,board,0.074729
6,wide,0.071909
7,organizations,0.071909
8,society,0.071909
9,sea,0.071909


### Address 304

Unnamed: 0,token,similarity
0,role,0.69282
1,garrison,0.69282
2,participation,0.101036
3,away,0.09815
4,gonzalez,0.095263
5,administration,0.075056
6,bigger,0.075056
7,ended,0.069282
8,’,0.069282
9,vision,0.069282


### Address 305

Unnamed: 0,token,similarity
0,war,0.718331
1,following,0.718331
2,commissioner,0.100232
3,1939,0.077958
4,website,0.077958
5,registered,0.075174
6,step,0.075174
7,fernando,0.07239
8,debut,0.07239
9,member,0.069606


### Address 306

Unnamed: 0,token,similarity
0,battalion,0.705699
1,2nd,0.705684
2,19,0.110529
3,institute,0.093524
4,official,0.079355
5,everyday,0.079354
6,record,0.070853
7,ii,0.070853
8,measures,0.070852
9,whorls,0.070852


### Address 307

Unnamed: 0,token,similarity
0,may,0.707107
1,1946,0.707107
2,diverse,0.093338
3,officials,0.082024
4,street,0.076368
5,detailed,0.073539
6,daily,0.070711
7,links,0.067882
8,party,0.065054
9,external,0.065054


### Address 308

Unnamed: 0,token,similarity
0,battalion,0.694272
1,6th,0.694253
2,professional,0.086423
3,alumni,0.080661
4,increased,0.08066
5,official,0.077781
6,signatures,0.07778
7,sub,0.074899
8,problems,0.07202
9,life,0.072018


### Address 309

Unnamed: 0,token,similarity
0,battalion,0.716949
1,5th,0.716927
2,external,0.111585
3,two,0.083688
4,kingdom,0.072531
5,mm,0.069741
6,problems,0.066952
7,mexican,0.066951
8,champions,0.064162
9,league,0.064161


### Address 310

Unnamed: 0,token,similarity
0,battalion,0.712741
1,1st,0.712741
2,wounded,0.07857
3,examinations,0.075764
4,official,0.072958
5,international,0.070152
6,2012,0.070152
7,key,0.067346
8,somewhat,0.06454
9,india,0.061733


### Address 311

Unnamed: 0,token,similarity
0,battalion,0.69642
1,3rd,0.69642
2,professional,0.084719
3,championship,0.084719
4,social,0.084719
5,boston,0.084719
6,status,0.076104
7,based,0.076104
8,men,0.073232
9,approximately,0.07036


### Address 312

Unnamed: 0,token,similarity
0,army,0.709238
1,infantry,0.709212
2,votes,0.091647
3,co,0.083188
4,national,0.083188
5,people,0.071911
6,training,0.06627
7,based,0.060631
8,indian,0.057836
9,1,0.057812


### Address 313

Unnamed: 0,token,similarity
0,indian,0.609749
1,army,0.594192
2,infantry,0.578152
3,based,0.090892
4,second,0.080859
5,current,0.076185
6,1,0.074421
7,rico,0.074061
8,people,0.070825
9,north,0.069714


In [19]:
memory.bins[np.argwhere((memory.chunk_scores > 0.97).cpu().detach().numpy().flatten()).flatten()]

MAPTensor([1.9413e-07, 9.9996e-01, 7.3651e-08], device='cuda:0')

In [20]:
np.argwhere((memory.bins > 120 ).cpu().detach().numpy().flatten()).flatten()

array([], dtype=int64)

## Appendix

In [21]:
text = "couldn't."
inputs = tokenizer(text, return_tensors="pt")

In [22]:
outputs = model(**inputs, output_attentions=True)
attention_matrix = outputs.attentions

In [23]:
encoding = tokenizer.encode(text)
labels = tokenizer.convert_ids_to_tokens(encoding)

In [24]:
i = 0
averages_idx = []
while i < len(labels) - 1:
    j = i + 1
    average_idx = []
    while labels[j].startswith('#'):
        average_idx.append(j)
        labels[i] += labels[j].replace('#', '')
        j += 1
    if average_idx != []:
        average_idx.append(i)
        averages_idx.append(average_idx)
    i = j

hashtag_idx = np.array([label.startswith("#") for label in labels])
punctuation_idx = np.array([label in string.punctuation for label in labels])
remove_idx = hashtag_idx | punctuation_idx
labels = np.array(labels)[~remove_idx]
labels = labels[1:(len(labels) - 1)]
print(labels)

['couldn' 't']


In [25]:
layer = 0

for head in range(12):
    head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()
    
    head_scores_raw_tensor = preprocess_attention_scores(head_scores_raw_tensor, averages_idx, remove_idx)s
        
    head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()
    
    head_scores = head_scores_raw[1:(len(head_scores_raw) - 1), 1:(len(head_scores_raw) - 1)].copy()s

    as_threshold = 0.4
    head_scores[head_scores < as_threshold] = 0
    plot_heatmap(head_scores, labels)
    
    G = nx.from_numpy_array(head_scores, create_using = nx.DiGraph())
    G.edges.data()

    sequences = []
    #mean_scores = []
    n_tokens = len(labels)
    construct_sequences(G, n_tokens)
    # for seq in sequences:
    #     idx = list(itertools.chain(*np.argwhere(seq == 1)))
    #     mean = 0
    #     for i, j in zip(idx[:-1],  idx[1:]):
    #         mean += G[i][j]['weight']
    #     mean /= (len(idx) - 1)
    #     mean_scores.append(round(mean, 2))
        
    # df = pd.DataFrame(data=[sequences, mean_scores]).T.rename(columns={0: 'seq',  1: 'score'})
    # if len(df) > 0:
    #     df['len'] = df['seq'].map(sum)
    #     df['score'] = df['score'].astype('float64')
    #     df = df.sort_values(by=['score', 'len'], ascending=[False, False]).reset_index(drop=True)
    #     top3_df = df.head(3)
    #     display(df)
    
    #     for i in range(len(top3_df)):
    #         print(labels[top3_df['seq'][i].astype(bool)], top3_df['score'][i])
    
    #if sequences != []:
        #layer_sequences.append(sequences)
    if sequences != []:
        print(head)
        for seq in sequences:
            print(labels[seq.astype(bool)])

SyntaxError: invalid syntax (135269879.py, line 6)

In [None]:
# text = "Firenze firenze"
# encoding = tokenizer.encode(text)
# labels = tokenizer.convert_ids_to_tokens(encoding)

In [None]:
# i = 0
# averages_idx = []
# while i < len(labels) - 1:
#     j = i + 1
#     average_idx = []
#     while labels[j].startswith('#'):
#         average_idx.append(j)
#         labels[i] += labels[j].replace('#', '')
#         j += 1
#     if average_idx != []:
#         average_idx.append(i)
#         averages_idx.append(average_idx)
#     i = j

# hashtag_idx = np.array([label.startswith("#") for label in labels])
# labels = np.array(labels)[~hashtag_idx]

In [None]:
# Torch implementation.

# t = torch.tensor(head_scores_raw)
# i = torch.tensor(averages_idx)

# t[i] = torch.mean(t[i], dim=1, keepdim=True)
# t = torch.unique_consecutive(t, dim=0)
# t = torch.transpose(t, 0, 1)
# t[i] = torch.mean(t[i], dim=1, keepdim=True)
# t = torch.unique_consecutive(t, dim=0)

# t = torch.transpose(t, 0, 1)