In [1]:
import sys
import os

# Get the absolute path of the parent directory.
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

# Add the parent directory to the system path to be able to import modules from 'lib.'
sys.path.append(parent_dir)

In [2]:
%%capture
import datasets

import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, utils 

import math
import numpy as np
import random

import pandas as pd
import pathlib
import pickle

import string


### Package options ###
pd.set_option('display.max_rows', 500)

In [3]:
# Set seed.
utils.fix_seed(41)

Using seed: 41

In [4]:
### Utils ###
def get_results(
    sentences: list[str],
    retrieve_mode: str,
    remove_stopwords_inference: bool = True
) -> None:
    
    # Retrieve content from memory.
    retrieved_contents = inference.infer(
        memory.address_size,
        cleanup,
        memory,
        sentences,
        retrieve_mode=retrieve_mode,
        remove_stopwords=remove_stopwords_inference,
        k=7, 
    )
    
    if retrieve_mode == "top_k":
        sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
    
        for s, addresses in zip(sentences, retrieved_contents):
            display(md(f"<ins>**Sentence:**</ins> {s}"))
#             out_tables = []
            display(md("<ins>**Addresses:**</ins>"))
            for a in addresses:
                address_sims_df = inference.get_similarities_to_atomic_set(
                    a, cleanup, k=11)
#                 out = widgets.Output()
#                 with out:
#                     display(address_sims_df)
#                 out_tables.append(out)
#             display(widgets.HBox(out_tables))
                display(address_sims_df)
        return
    elif retrieve_mode == "pooling":  
        sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 

        for s, c in zip(sentences, retrieved_contents):
            sentence_sims_df = inference.get_similarities_to_atomic_set(
                c, cleanup)
            sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
            sims_df = pd.concat([sims_df, sentence_sims_df])

        sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
                         .set_index(['sentence', 'token'])

        display(sims_df)
        return
    else:  # unrecognized
        display(md("Unrecognized retrieval mode."))
        return

In [5]:
in_sentences = [
    """Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.""",
    """In 1910, she was elected to the position of organizer and lecturer of the National WCTU.""",
    """Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.""",
    """With all other games played, a victory over Everton had put United top of the group on nine points.""",
    """The 2022 FA Women's League Cup Final was the 11th final of the FA Women's League Cup, England's secondary cup competition for women's football teams and its primary league cup tournament.""",
    """In 2020 Mico's single 'igare' awarded as the best song of the summer in Kiss Summer Awards.""",
    """She collected the speech and words of Dublin city and donated her collection to the Department of Irish Folklore at University College, Dublin.""",
    """Traditional palyanytsya was baked from yeast dough.""",
   """First, hops were boiled in a pot, which was then poured into a makitra, to which sifted wheat flour was added.""",
     """ Jonathan Holland of ScreenDaily deemed the film to be "superbly directed by Palomero, who seems to have a special gift for seeing the world through children's eyes." """   
]

In [6]:
out_sentences = [
    """As the population of all of the towns grew, the need for better transportation between them also grew.""",
    """The construction of the line was the subject of a legal challenge.""",
    """The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.""",
    """Dangerous heat stress events will spread rapidly across the world as global heating continues.""",
    """Whether or not history will determine that we are living in an ever more divided culture, it certainly feels that way."""
]

In [22]:
# Parametrized cell.
filename = "2023-09-10 17-02-12-946618.pkl"
experiment_no = 3

In [23]:
# Load memory and associated cleanup.
cleanup = pickle.load(open(f"cleanups/method2/cleanup_{filename}",'rb'))
memory = pickle.load(open(f"memories/method2/memory_{filename}",'rb'))

In [24]:
display(md(f"# Mining Transfomer attention - Experiment {experiment_no}"))

# Mining Transfomer attention - Experiment 3

## Memory visualiztion
### Statistics

In [9]:
display(md(f"Number of trained articles: {len(memory.wiki_articles)}"))
display(md(f"Number of memory updates: {memory.n_updates}"))
display(md(f"Number of memory expansions: {memory.n_expansions}"))
display(md(f"Updates percentage: {round(memory.n_updates / (memory.n_updates + memory.n_expansions), 3)}%"))
display(md(f"Number of existing memory addresses: {len(memory.addresses)}"))

Number of trained articles: 12

Number of memory updates: 1673

Number of memory expansions: 3159

Updates percentage: 0.346%

Number of existing memory addresses: 3159

### Addresses

In [None]:
display(md("Visualize 30 randomly sampled addresses."))

In [10]:
addresses = np.random.randint(0, len(memory.addresses), size=30)

for address in addresses:
    display(md(f"### <ins>Address {address}</ins>"))
    display(md(f"Address **chunk score:** {memory.scores[address][0]}, **bin score:** {memory.scores[address][1]}"))
    address_sims_df = inference.get_similarities_to_atomic_set(
            memory.addresses[address],
            cleanup,
    )
    display(address_sims_df)

### <ins>Address 1984</ins>

Address **chunk score:** 0.57, **bin score:** 0.2349659478641115

Unnamed: 0,token,similarity
0,last,0.51
1,as,0.51
2,s,0.49
3,game,0.47
4,removed,0.11
5,feature,0.1
6,mm,0.09
7,intention,0.08
8,beach,0.08
9,1932,0.08


### <ins>Address 931</ins>

Address **chunk score:** 1.0, **bin score:** 0.7685002927901223

Unnamed: 0,token,similarity
0,publish,0.36
1,had,0.34
2,the,0.32
3,to,0.32
4,her,0.32
5,in,0.31
6,collection,0.3
7,intention,0.3
8,book,0.28
9,been,0.27


### <ins>Address 1104</ins>

Address **chunk score:** 0.63, **bin score:** 0.5246204100985778

Unnamed: 0,token,similarity
0,acquired,0.71
1,rights,0.71
2,header,0.11
3,above,0.1
4,references,0.09
5,gifted,0.08
6,summer,0.08
7,connected,0.08
8,september,0.08
9,24,0.08


### <ins>Address 321</ins>

Address **chunk score:** 0.68, **bin score:** 1.3291769462521188

Unnamed: 0,token,similarity
0,as,0.46
1,president,0.45
2,served,0.45
3,the,0.44
4,of,0.44
5,reform,0.1
6,mary,0.1
7,lived,0.09
8,person,0.09
9,should,0.09


### <ins>Address 243</ins>

Address **chunk score:** 0.83, **bin score:** 1.2388165019947337

Unnamed: 0,token,similarity
0,york,0.61
1,new,0.59
2,varick,0.58
3,21,0.1
4,sophie,0.09
5,stages,0.09
6,19,0.08
7,bhradaigh,0.08
8,leaf,0.08
9,debut,0.08


### <ins>Address 1625</ins>

Address **chunk score:** 0.63, **bin score:** 0.30127643622108735

Unnamed: 0,token,similarity
0,history,0.58
1,club,0.56
2,most,0.56
3,penalties,0.11
4,5,0.1
5,frame,0.09
6,perfect,0.09
7,rounds,0.08
8,1910,0.08
9,author,0.08


### <ins>Address 407</ins>

Address **chunk score:** 0.51, **bin score:** 0.9187316500319866

Unnamed: 0,token,similarity
0,production,0.71
1,cast,0.71
2,added,0.1
3,bread,0.1
4,girls,0.1
5,apex,0.1
6,is,0.1
7,filipino,0.1
8,tracks,0.09
9,straight,0.09


### <ins>Address 282</ins>

Address **chunk score:** 0.7, **bin score:** 1.2407241313776467

Unnamed: 0,token,similarity
0,she,0.49
1,of,0.43
2,officer,0.43
3,an,0.42
4,became,0.41
5,fine,0.09
6,0,0.09
7,world,0.09
8,competed,0.09
9,brazier,0.08


### <ins>Address 2925</ins>

Address **chunk score:** 0.56, **bin score:** 0.04813802003627643

Unnamed: 0,token,similarity
0,yellow,0.7
1,two,0.7
2,geneva,0.12
3,added,0.09
4,railway,0.09
5,mico,0.09
6,ran,0.09
7,if,0.09
8,references,0.08
9,legion,0.08


### <ins>Address 1260</ins>

Address **chunk score:** 0.77, **bin score:** 0.47501515498152

Unnamed: 0,token,similarity
0,were,0.71
1,and,0.71
2,commission,0.1
3,either,0.09
4,university,0.09
5,chattanooga,0.08
6,baldridge,0.08
7,1903,0.07
8,before,0.07
9,penelope,0.07


### <ins>Address 2488</ins>

Address **chunk score:** 0.72, **bin score:** 0.13380783527099993

Unnamed: 0,token,similarity
0,tottenham,0.7
1,city,0.7
2,meetings,0.11
3,goal,0.11
4,lane,0.1
5,tie,0.1
6,keeper,0.1
7,secretary,0.1
8,yellow,0.1
9,imperforate,0.08


### <ins>Address 2179</ins>

Address **chunk score:** 0.56, **bin score:** 0.23277345680980943

Unnamed: 0,token,similarity
0,shot,0.47
1,ball,0.47
2,city,0.45
3,demi,0.44
4,the,0.4
5,elimination,0.1
6,interpreted,0.09
7,april,0.09
8,tomas,0.09
9,tottenham,0.09


### <ins>Address 2723</ins>

Address **chunk score:** 0.73, **bin score:** 0.12188693202915601

Unnamed: 0,token,similarity
0,halfway,0.72
1,the,0.72
2,1932,0.09
3,however,0.09
4,capitol,0.08
5,trustee,0.08
6,off,0.08
7,january,0.08
8,civil,0.08
9,julian,0.08


### <ins>Address 2246</ins>

Address **chunk score:** 0.83, **bin score:** 0.1831312241993146

Unnamed: 0,token,similarity
0,goal,0.72
1,on,0.72
2,professional,0.09
3,dry,0.09
4,constructed,0.08
5,share,0.08
6,aperture,0.08
7,saloon,0.08
8,interpreted,0.08
9,doubled,0.08


### <ins>Address 3056</ins>

Address **chunk score:** 0.73, **bin score:** 0.020962219452485442

Unnamed: 0,token,similarity
0,the,0.59
1,aperture,0.58
2,rhomboidal,0.57
3,8,0.1
4,convex,0.09
5,boys,0.08
6,halle,0.08
7,international,0.08
8,days,0.08
9,near,0.07


### <ins>Address 3156</ins>

Address **chunk score:** 0.64, **bin score:** 0.0

Unnamed: 0,token,similarity
0,italian,0.72
1,dagored,0.72
2,sister,0.1
3,youthful,0.09
4,him,0.09
5,pulling,0.09
6,worked,0.09
7,2009,0.08
8,shootout,0.08
9,send,0.08


### <ins>Address 2902</ins>

Address **chunk score:** 0.55, **bin score:** 0.052408357194508426

Unnamed: 0,token,similarity
0,clearing,0.72
1,for,0.72
2,scoring,0.09
3,scored,0.08
4,doing,0.08
5,written,0.08
6,away,0.08
7,sacked,0.08
8,1661,0.08
9,fumbled,0.08


### <ins>Address 957</ins>

Address **chunk score:** 0.83, **bin score:** 0.8153053901914973

Unnamed: 0,token,similarity
0,of,0.58
1,the,0.56
2,society,0.55
3,knew,0.1
4,fa,0.09
5,mother,0.09
6,living,0.08
7,8,0.08
8,mary,0.08
9,wta,0.08


### <ins>Address 1129</ins>

Address **chunk score:** 0.65, **bin score:** 0.5084389583207667

Unnamed: 0,token,similarity
0,see,0.59
1,references,0.58
2,list,0.58
3,europe,0.09
4,links,0.09
5,1915,0.09
6,central,0.09
7,so,0.08
8,places,0.08
9,move,0.08


### <ins>Address 179</ins>

Address **chunk score:** 0.73, **bin score:** 1.289033649925841

Unnamed: 0,token,similarity
0,has,0.59
1,only,0.57
2,remained,0.56
3,rise,0.12
4,burn,0.1
5,scenes,0.09
6,surroundings,0.08
7,deaths,0.08
8,personal,0.08
9,diameter,0.08


### <ins>Address 476</ins>

Address **chunk score:** 0.56, **bin score:** 1.0343044641049346

Unnamed: 0,token,similarity
0,was,0.73
1,in,0.73
2,finding,0.1
3,defence,0.1
4,shape,0.09
5,illness,0.09
6,frequent,0.09
7,obliged,0.09
8,post,0.08
9,world,0.08


### <ins>Address 2904</ins>

Address **chunk score:** 0.57, **bin score:** 0.06191318383207545

Unnamed: 0,token,similarity
0,at,0.69
1,half,0.69
2,near,0.1
3,activists,0.09
4,government,0.09
5,winners,0.09
6,than,0.09
7,plot,0.08
8,recruiting,0.07
9,ovid,0.07


### <ins>Address 1739</ins>

Address **chunk score:** 0.78, **bin score:** 0.296409165035584

Unnamed: 0,token,similarity
0,in,0.57
1,awards,0.57
2,2020,0.57
3,acquired,0.1
4,bid,0.09
5,halle,0.09
6,right,0.09
7,to,0.09
8,wheaton,0.08
9,moved,0.08


### <ins>Address 2038</ins>

Address **chunk score:** 0.75, **bin score:** 0.2829624205187429

Unnamed: 0,token,similarity
0,the,0.54
1,minute,0.5
2,in,0.49
3,87th,0.49
4,bruno,0.11
5,language,0.11
6,illness,0.1
7,8,0.1
8,additional,0.09
9,placed,0.09


### <ins>Address 122</ins>

Address **chunk score:** 0.52, **bin score:** 1.6765581252693664

Unnamed: 0,token,similarity
0,in,0.71
1,by,0.71
2,acquired,0.1
3,titles,0.09
4,frequent,0.09
5,plot,0.09
6,with,0.08
7,move,0.08
8,writers,0.08
9,her,0.08


### <ins>Address 2786</ins>

Address **chunk score:** 0.61, **bin score:** 0.07033686971408315

Unnamed: 0,token,similarity
0,harder,0.71
1,her,0.71
2,straight,0.1
3,built,0.09
4,fran,0.09
5,points,0.09
6,bottom,0.09
7,easter,0.09
8,children,0.09
9,university,0.08


### <ins>Address 796</ins>

Address **chunk score:** 1.0, **bin score:** 0.802039471513126

Unnamed: 0,token,similarity
0,they,0.35
1,near,0.35
2,moved,0.34
3,to,0.34
4,dublin,0.34
5,lived,0.33
6,before,0.32
7,family,0.31
8,the,0.3
9,where,0.29


### <ins>Address 309</ins>

Address **chunk score:** 0.64, **bin score:** 1.1543519304832444

Unnamed: 0,token,similarity
0,married,0.72
1,and,0.72
2,edged,0.1
3,anti,0.09
4,organizing,0.09
5,occurs,0.08
6,number,0.08
7,super,0.08
8,commission,0.08
9,baldridge,0.08


### <ins>Address 1555</ins>

Address **chunk score:** 0.65, **bin score:** 0.3213849641615525

Unnamed: 0,token,similarity
0,2004,0.68
1,births,0.68
2,facilities,0.11
3,opening,0.1
4,shovel,0.09
5,encircled,0.09
6,children,0.09
7,grown,0.08
8,competed,0.08
9,similar,0.08


### <ins>Address 2749</ins>

Address **chunk score:** 0.88, **bin score:** 0.08983558841282502

Unnamed: 0,token,similarity
0,and,0.71
1,shooting,0.71
2,can,0.14
3,finding,0.11
4,sifted,0.09
5,again,0.09
6,well,0.08
7,commission,0.08
8,union,0.08
9,numerous,0.08


## In-sample sentences
### W/ stop words in inference sentence
#### Closest addresses

In [11]:
get_results(in_sentences[:3], "top_k", False)

<ins>**Sentence:**</ins> Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,she,0.39
1,participant,0.36
2,at,0.36
3,meetings,0.35
4,temperance,0.32
5,active,0.32
6,became,0.31
7,a,0.31
8,very,0.3
9,outer,0.11


Unnamed: 0,token,similarity
0,a,0.47
1,and,0.45
2,still,0.44
3,while,0.43
4,young,0.43
5,february,0.11
6,3rd,0.1
7,during,0.1
8,when,0.09
9,la,0.09


Unnamed: 0,token,similarity
0,girl,0.42
1,a,0.41
2,still,0.41
3,and,0.4
4,while,0.4
5,young,0.4
6,february,0.11
7,cutting,0.1
8,during,0.09
9,pictures,0.09


Unnamed: 0,token,similarity
0,and,0.72
1,a,0.72
2,september,0.1
3,formats,0.09
4,corner,0.08
5,football,0.08
6,again,0.08
7,during,0.08
8,longer,0.08
9,penelope,0.08


Unnamed: 0,token,similarity
0,she,0.52
1,young,0.51
2,a,0.48
3,became,0.48
4,september,0.1
5,inside,0.09
6,3rd,0.09
7,should,0.09
8,delegate,0.08
9,control,0.08


Unnamed: 0,token,similarity
0,she,0.71
1,a,0.71
2,corner,0.12
3,hearth,0.1
4,creative,0.09
5,reading,0.09
6,children,0.09
7,fewer,0.09
8,length,0.08
9,carried,0.08


Unnamed: 0,token,similarity
0,she,0.46
1,2011,0.45
2,a,0.43
3,for,0.42
4,in,0.41
5,creative,0.1
6,obliged,0.1
7,now,0.09
8,having,0.09
9,office,0.09


<ins>**Sentence:**</ins> In 1910, she was elected to the position of organizer and lecturer of the National WCTU.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,of,0.52
1,the,0.47
2,national,0.29
3,and,0.27
4,was,0.26
5,position,0.26
6,elected,0.25
7,organizer,0.24
8,she,0.23
9,lecturer,0.21


Unnamed: 0,token,similarity
0,of,0.5
1,the,0.46
2,national,0.27
3,and,0.27
4,position,0.26
5,obliged,0.25
6,was,0.25
7,organizer,0.24
8,she,0.23
9,resign,0.22


Unnamed: 0,token,similarity
0,was,0.43
1,elected,0.42
2,of,0.42
3,the,0.41
4,position,0.41
5,usually,0.38
6,to,0.38
7,name,0.1
8,roof,0.1
9,angela,0.08


Unnamed: 0,token,similarity
0,of,0.53
1,position,0.52
2,organizer,0.5
3,the,0.47
4,wrote,0.1
5,convex,0.09
6,attendance,0.09
7,roof,0.09
8,mary,0.08
9,competed,0.08


Unnamed: 0,token,similarity
0,and,0.59
1,of,0.58
2,the,0.58
3,anti,0.1
4,halle,0.09
5,halfway,0.09
6,fa,0.09
7,sliced,0.09
8,during,0.09
9,knew,0.08


Unnamed: 0,token,similarity
0,the,0.59
1,was,0.58
2,of,0.57
3,caroline,0.1
4,national,0.09
5,fails,0.09
6,roof,0.09
7,messages,0.08
8,living,0.08
9,south,0.08


Unnamed: 0,token,similarity
0,the,0.71
1,of,0.71
2,convex,0.09
3,fa,0.09
4,living,0.09
5,mary,0.08
6,roof,0.08
7,anti,0.08
8,11th,0.08
9,brazier,0.08


<ins>**Sentence:**</ins> Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,of,0.54
1,another,0.31
2,mass,0.31
3,organization,0.29
4,work,0.29
5,the,0.29
6,her,0.27
7,temperance,0.27
8,feature,0.27
9,was,0.26


Unnamed: 0,token,similarity
0,organization,0.4
1,her,0.4
2,work,0.4
3,the,0.38
4,feature,0.37
5,was,0.35
6,of,0.35
7,one,0.1
8,off,0.09
9,see,0.09


Unnamed: 0,token,similarity
0,her,0.45
1,another,0.45
2,work,0.44
3,feature,0.41
4,of,0.37
5,was,0.35
6,off,0.11
7,name,0.1
8,rtve,0.09
9,bottom,0.09


Unnamed: 0,token,similarity
0,children,0.48
1,of,0.44
2,meetings,0.43
3,school,0.43
4,sunday,0.43
5,circular,0.09
6,unusual,0.09
7,pregnant,0.08
8,fell,0.08
9,uncle,0.08


Unnamed: 0,token,similarity
0,of,0.89
1,the,0.45
2,mary,0.1
3,brazier,0.08
4,roof,0.08
5,halfway,0.08
6,mothers,0.08
7,anti,0.08
8,living,0.08
9,convex,0.08


Unnamed: 0,token,similarity
0,meetings,0.52
1,of,0.51
2,children,0.5
3,organization,0.48
4,park,0.08
5,both,0.08
6,member,0.08
7,surrendered,0.08
8,t,0.08
9,mills,0.07


Unnamed: 0,token,similarity
0,the,0.51
1,organization,0.51
2,temperance,0.5
3,of,0.49
4,convex,0.11
5,roof,0.09
6,appearance,0.09
7,relation,0.08
8,snail,0.08
9,matches,0.08


#### Pooled address space

In [12]:
get_results(in_sentences[:3], "pooling", False)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",the,0.76
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",of,0.37
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",in,0.32
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",and,0.25
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",to,0.21
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",usually,0.21
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",a,0.18
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",was,0.14
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",with,0.13
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",bruno,0.11


### W/o stop words in inference sentence

In [13]:
get_results(in_sentences[:3], "top_k", True)

<ins>**Sentence:**</ins> Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,she,0.39
1,participant,0.36
2,at,0.36
3,meetings,0.35
4,temperance,0.32
5,active,0.32
6,became,0.31
7,a,0.31
8,very,0.3
9,outer,0.11


Unnamed: 0,token,similarity
0,blaine,0.72
1,prohibition,0.72
2,ovid,0.1
3,turn,0.09
4,christian,0.09
5,born,0.09
6,tracks,0.09
7,rules,0.09
8,dough,0.08
9,watford,0.08


Unnamed: 0,token,similarity
0,prohibition,0.6
1,was,0.6
2,reared,0.56
3,ovid,0.12
4,125,0.1
5,two,0.1
6,connected,0.1
7,if,0.1
8,i,0.09
9,would,0.09


Unnamed: 0,token,similarity
0,girl,0.42
1,a,0.41
2,still,0.41
3,and,0.4
4,while,0.4
5,young,0.4
6,february,0.11
7,cutting,0.1
8,during,0.09
9,pictures,0.09


Unnamed: 0,token,similarity
0,favor,0.58
1,great,0.56
2,won,0.54
3,instead,0.1
4,dublin,0.1
5,risen,0.1
6,runners,0.09
7,did,0.09
8,2019,0.09
9,interpreted,0.09


Unnamed: 0,token,similarity
0,she,0.52
1,young,0.51
2,a,0.48
3,became,0.48
4,september,0.1
5,inside,0.09
6,3rd,0.09
7,should,0.09
8,delegate,0.08
9,control,0.08


Unnamed: 0,token,similarity
0,she,0.51
1,favor,0.5
2,great,0.5
3,won,0.46
4,did,0.09
5,risen,0.09
6,deal,0.09
7,dublin,0.09
8,until,0.09
9,20th,0.09


<ins>**Sentence:**</ins> In 1910, she was elected to the position of organizer and lecturer of the National WCTU.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,national,0.7
1,organizer,0.7
2,doing,0.09
3,surroundings,0.09
4,debut,0.09
5,civil,0.09
6,affected,0.09
7,increase,0.08
8,don,0.08
9,lives,0.07


Unnamed: 0,token,similarity
0,position,0.7
1,elected,0.7
2,website,0.09
3,1917,0.09
4,name,0.09
5,dough,0.09
6,common,0.08
7,eight,0.08
8,media,0.08
9,weir,0.08


Unnamed: 0,token,similarity
0,of,0.61
1,position,0.59
2,lecturer,0.57
3,struck,0.1
4,brazier,0.08
5,mothers,0.08
6,served,0.08
7,primary,0.08
8,started,0.08
9,snail,0.08


Unnamed: 0,token,similarity
0,of,0.52
1,the,0.47
2,national,0.29
3,and,0.27
4,was,0.26
5,position,0.26
6,elected,0.25
7,organizer,0.24
8,she,0.23
9,lecturer,0.21


Unnamed: 0,token,similarity
0,of,0.53
1,position,0.52
2,organizer,0.5
3,the,0.47
4,wrote,0.1
5,convex,0.09
6,attendance,0.09
7,roof,0.09
8,mary,0.08
9,competed,0.08


Unnamed: 0,token,similarity
0,position,0.58
1,usually,0.57
2,to,0.57
3,elected,0.55
4,weir,0.1
5,name,0.1
6,racing,0.09
7,near,0.08
8,measures,0.08
9,included,0.08


Unnamed: 0,token,similarity
0,of,0.5
1,the,0.46
2,national,0.27
3,and,0.27
4,position,0.26
5,obliged,0.25
6,was,0.25
7,organizer,0.24
8,she,0.23
9,resign,0.22


<ins>**Sentence:**</ins> Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,children,0.48
1,of,0.44
2,meetings,0.43
3,school,0.43
4,sunday,0.43
5,circular,0.09
6,unusual,0.09
7,pregnant,0.08
8,fell,0.08
9,uncle,0.08


Unnamed: 0,token,similarity
0,sunday,0.6
1,children,0.58
2,school,0.57
3,fell,0.1
4,team,0.09
5,leave,0.09
6,pregnant,0.08
7,25th,0.08
8,ridge,0.08
9,semi,0.08


Unnamed: 0,token,similarity
0,of,0.54
1,another,0.31
2,mass,0.31
3,organization,0.29
4,work,0.29
5,the,0.29
6,her,0.27
7,temperance,0.27
8,feature,0.27
9,was,0.26


Unnamed: 0,token,similarity
0,meetings,0.52
1,of,0.51
2,children,0.5
3,organization,0.48
4,park,0.08
5,both,0.08
6,member,0.08
7,surrendered,0.08
8,t,0.08
9,mills,0.07


Unnamed: 0,token,similarity
0,school,0.7
1,children,0.7
2,surrendered,0.09
3,gone,0.09
4,team,0.08
5,ranking,0.08
6,her,0.08
7,facilities,0.08
8,often,0.08
9,specialise,0.08


Unnamed: 0,token,similarity
0,her,0.45
1,another,0.45
2,work,0.44
3,feature,0.41
4,of,0.37
5,was,0.35
6,off,0.11
7,name,0.1
8,rtve,0.09
9,bottom,0.09


Unnamed: 0,token,similarity
0,usually,1.0
1,to,1.0
2,meetings,0.12
3,temperance,0.09
4,opened,0.09
5,significant,0.09
6,bid,0.09
7,cut,0.08
8,lives,0.08
9,park,0.08


In [14]:
get_results(in_sentences[:3], "pooling", True)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",the,0.69
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",in,0.34
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",of,0.33
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",and,0.29
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",a,0.26
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",usually,0.23
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",to,0.23
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",with,0.15
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",was,0.15
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",bruno,0.11


## Out-of-sample sentences
### W/ stop words in inference sentence
#### Closest addresses

In [15]:
get_results(out_sentences[:3], "top_k", False)

<ins>**Sentence:**</ins> As the population of all of the towns grew, the need for better transportation between them also grew.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,the,0.71
1,of,0.71
2,better,0.71
3,transportation,0.71
4,convex,0.09
5,fa,0.09
6,living,0.09
7,mary,0.08
8,roof,0.08
9,anti,0.08


Unnamed: 0,token,similarity
0,the,0.81
1,transportation,0.81
2,better,0.43
3,of,0.43
4,evolution,0.41
5,convex,0.1
6,fa,0.09
7,sliced,0.09
8,roof,0.08
9,living,0.08


Unnamed: 0,token,similarity
0,the,0.82
1,transportation,0.82
2,secretary,0.42
3,better,0.41
4,of,0.41
5,fa,0.1
6,11th,0.09
7,january,0.08
8,off,0.08
9,sliced,0.08


Unnamed: 0,token,similarity
0,transportation,0.75
1,the,0.75
2,in,0.39
3,six,0.39
4,as,0.39
5,of,0.38
6,better,0.38
7,convex,0.11
8,sliced,0.09
9,formal,0.09


Unnamed: 0,token,similarity
0,the,0.82
1,transportation,0.82
2,ussr,0.41
3,better,0.4
4,of,0.4
5,scheduled,0.09
6,fa,0.09
7,placed,0.08
8,living,0.08
9,reform,0.08


Unnamed: 0,token,similarity
0,better,0.89
1,of,0.89
2,the,0.45
3,transportation,0.45
4,mary,0.1
5,brazier,0.08
6,halfway,0.08
7,roof,0.08
8,mothers,0.08
9,living,0.08


Unnamed: 0,token,similarity
0,the,0.76
1,transportation,0.76
2,frame,0.38
3,better,0.38
4,of,0.38
5,goal,0.38
6,convex,0.1
7,roof,0.1
8,11th,0.1
9,during,0.09


<ins>**Sentence:**</ins> The construction of the line was the subject of a legal challenge.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,the,0.71
1,of,0.71
2,better,0.71
3,transportation,0.71
4,convex,0.09
5,fa,0.09
6,living,0.09
7,mary,0.08
8,roof,0.08
9,anti,0.08


Unnamed: 0,token,similarity
0,the,0.59
1,transportation,0.59
2,was,0.58
3,better,0.57
4,of,0.57
5,caroline,0.1
6,national,0.09
7,fails,0.09
8,roof,0.09
9,messages,0.08


Unnamed: 0,token,similarity
0,better,0.6
1,of,0.6
2,a,0.59
3,fatal,0.59
4,transportation,0.56
5,the,0.56
6,during,0.1
7,11th,0.08
8,should,0.08
9,national,0.08


Unnamed: 0,token,similarity
0,province,0.5
1,legal,0.5
2,construction,0.49
3,dutch,0.49
4,better,0.49
5,of,0.49
6,the,0.48
7,transportation,0.48
8,scheduled,0.1
9,sources,0.09


Unnamed: 0,token,similarity
0,the,0.81
1,transportation,0.81
2,better,0.43
3,of,0.43
4,evolution,0.41
5,convex,0.1
6,fa,0.09
7,sliced,0.09
8,roof,0.08
9,living,0.08


Unnamed: 0,token,similarity
0,the,0.82
1,transportation,0.82
2,secretary,0.42
3,better,0.41
4,of,0.41
5,fa,0.1
6,11th,0.09
7,january,0.08
8,off,0.08
9,sliced,0.08


Unnamed: 0,token,similarity
0,the,0.82
1,transportation,0.82
2,ussr,0.41
3,better,0.4
4,of,0.4
5,scheduled,0.09
6,fa,0.09
7,placed,0.08
8,living,0.08
9,reform,0.08


<ins>**Sentence:**</ins> The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,better,0.65
1,of,0.65
2,municipality,0.36
3,affect,0.36
4,is,0.32
5,extremes,0.32
6,transportation,0.32
7,the,0.32
8,millions,0.31
9,it,0.31


Unnamed: 0,token,similarity
0,better,0.46
1,of,0.46
2,is,0.45
3,extremes,0.45
4,part,0.45
5,could,0.45
6,fatal,0.44
7,a,0.44
8,millions,0.44
9,it,0.44


Unnamed: 0,token,similarity
0,better,0.48
1,of,0.48
2,municipality,0.47
3,affect,0.47
4,a,0.45
5,fatal,0.45
6,transportation,0.45
7,the,0.45
8,could,0.43
9,part,0.43


Unnamed: 0,token,similarity
0,better,0.53
1,of,0.53
2,municipality,0.51
3,affect,0.51
4,a,0.51
5,fatal,0.51
6,could,0.49
7,part,0.49
8,which,0.1
9,identify,0.09


Unnamed: 0,token,similarity
0,of,0.58
1,better,0.58
2,descent,0.57
3,people,0.55
4,whitish,0.1
5,varies,0.1
6,last,0.1
7,position,0.09
8,local,0.09
9,bottom,0.08


Unnamed: 0,token,similarity
0,legal,0.37
1,in,0.37
2,province,0.37
3,six,0.37
4,the,0.34
5,transportation,0.34
6,healthy,0.34
7,village,0.34
8,hours,0.34
9,groningen,0.34


Unnamed: 0,token,similarity
0,municipality,0.54
1,affect,0.54
2,of,0.51
3,better,0.51
4,the,0.51
5,transportation,0.51
6,could,0.48
7,part,0.48
8,convex,0.09
9,2011,0.08


#### Pooled address space

In [16]:
get_results(out_sentences[:3], "pooling", False)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",transportation,0.68
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",the,0.68
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",of,0.37
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",better,0.37
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",six,0.34
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",in,0.34
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",to,0.28
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",usually,0.28
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",a,0.26
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",fatal,0.26


### W/o stop words in inference sentence
#### Closest addresses

In [17]:
get_results(out_sentences[:3], "top_k", True)

<ins>**Sentence:**</ins> As the population of all of the towns grew, the need for better transportation between them also grew.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,towns,0.7
1,an,0.7
2,derivative,0.7
3,grew,0.7
4,capitol,0.1
5,so,0.09
6,straight,0.09
7,chelsea,0.09
8,riding,0.09
9,run,0.08


Unnamed: 0,token,similarity
0,formal,0.6
1,8,0.6
2,derivative,0.56
3,grew,0.56
4,population,0.55
5,car,0.55
6,brown,0.09
7,capitol,0.09
8,creative,0.09
9,went,0.09


Unnamed: 0,token,similarity
0,the,0.71
1,of,0.71
2,better,0.71
3,transportation,0.71
4,convex,0.09
5,fa,0.09
6,living,0.09
7,mary,0.08
8,roof,0.08
9,anti,0.08


Unnamed: 0,token,similarity
0,capitol,0.6
1,transportation,0.59
2,the,0.59
3,better,0.58
4,of,0.58
5,living,0.1
6,halfway,0.1
7,11th,0.09
8,co,0.09
9,roof,0.09


Unnamed: 0,token,similarity
0,the,0.81
1,transportation,0.81
2,better,0.43
3,of,0.43
4,evolution,0.41
5,convex,0.1
6,fa,0.09
7,sliced,0.09
8,roof,0.08
9,living,0.08


Unnamed: 0,token,similarity
0,better,0.89
1,of,0.89
2,the,0.45
3,transportation,0.45
4,mary,0.1
5,brazier,0.08
6,halfway,0.08
7,roof,0.08
8,mothers,0.08
9,living,0.08


Unnamed: 0,token,similarity
0,the,0.82
1,transportation,0.82
2,secretary,0.42
3,better,0.41
4,of,0.41
5,fa,0.1
6,11th,0.09
7,january,0.08
8,off,0.08
9,sliced,0.08


<ins>**Sentence:**</ins> The construction of the line was the subject of a legal challenge.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,dutch,1.0
1,construction,1.0
2,released,0.09
3,screenplay,0.09
4,sources,0.09
5,makes,0.09
6,1911,0.08
7,whorls,0.08
8,uses,0.08
9,dots,0.08


Unnamed: 0,token,similarity
0,province,0.5
1,legal,0.5
2,construction,0.49
3,dutch,0.49
4,better,0.49
5,of,0.49
6,the,0.48
7,transportation,0.48
8,scheduled,0.1
9,sources,0.09


Unnamed: 0,token,similarity
0,legal,0.47
1,province,0.47
2,village,0.46
3,healthy,0.46
4,dutch,0.46
5,construction,0.46
6,hours,0.43
7,groningen,0.43
8,of,0.41
9,better,0.41


Unnamed: 0,token,similarity
0,in,0.72
1,province,0.72
2,legal,0.72
3,six,0.72
4,member,0.11
5,mills,0.1
6,eventually,0.09
7,drawn,0.09
8,produced,0.09
9,acquired,0.08


Unnamed: 0,token,similarity
0,groningen,0.71
1,province,0.71
2,legal,0.71
3,hours,0.71
4,rising,0.09
5,anna,0.09
6,proclamation,0.09
7,matches,0.09
8,eventually,0.09
9,office,0.09


Unnamed: 0,token,similarity
0,construction,0.73
1,dutch,0.73
2,in,0.73
3,six,0.73
4,sun,0.11
5,took,0.09
6,move,0.08
7,career,0.08
8,knew,0.08
9,usually,0.08


Unnamed: 0,token,similarity
0,halfway,0.73
1,line,0.73
2,la,0.09
3,riding,0.08
4,cutting,0.08
5,italy,0.08
6,players,0.08
7,january,0.08
8,shape,0.08
9,media,0.07


<ins>**Sentence:**</ins> The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.

<ins>**Addresses:**</ins>

Unnamed: 0,token,similarity
0,people,1.0
1,hon,0.1
2,qualifying,0.09
3,capitol,0.09
4,perfect,0.09
5,which,0.09
6,2021,0.08
7,pageant,0.08
8,served,0.08
9,varies,0.08


Unnamed: 0,token,similarity
0,better,0.46
1,of,0.46
2,is,0.45
3,extremes,0.45
4,part,0.45
5,could,0.45
6,fatal,0.44
7,a,0.44
8,millions,0.44
9,it,0.44


Unnamed: 0,token,similarity
0,extremes,0.59
1,is,0.59
2,part,0.58
3,could,0.58
4,affect,0.58
5,municipality,0.58
6,italian,0.1
7,receiving,0.1
8,folklore,0.09
9,retaining,0.09


Unnamed: 0,token,similarity
0,a,0.58
1,fatal,0.58
2,healthy,0.57
3,village,0.57
4,extremes,0.56
5,is,0.56
6,hearth,0.1
7,production,0.1
8,rehearsals,0.09
9,outside,0.09


Unnamed: 0,token,similarity
0,municipality,0.58
1,affect,0.58
2,is,0.58
3,extremes,0.58
4,fatal,0.57
5,a,0.57
6,identify,0.1
7,standardized,0.09
8,carried,0.09
9,icaa,0.08


Unnamed: 0,token,similarity
0,part,0.59
1,could,0.59
2,is,0.58
3,extremes,0.58
4,fatal,0.57
5,a,0.57
6,folklore,0.1
7,not,0.1
8,outright,0.09
9,italian,0.09


Unnamed: 0,token,similarity
0,legal,0.37
1,in,0.37
2,province,0.37
3,six,0.37
4,the,0.34
5,transportation,0.34
6,healthy,0.34
7,village,0.34
8,hours,0.34
9,groningen,0.34


#### Pooled address space

In [18]:
get_results(out_sentences[:3], "pooling", True)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",transportation,0.65
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",the,0.65
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",in,0.39
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",six,0.39
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",a,0.32
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",fatal,0.32
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",better,0.31
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",of,0.31
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",and,0.29
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",to,0.24


In [19]:
# # Set retrieve mode.
# retrieve_mode = "top_k"

# # Get table with token similarities for each "out-of-train" sentence.
# retrieved_contents = inference.infer(
#     memory.address_size,
#     cleanup,
#     memory,
#     sentences,
#     retrieve_mode=retrieve_mode,
#     remove_stopwords=True,
#     k=7, 
# )


# sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 

# for s, addresses in zip(sentences, retrieved_contents):
#     display(s)
#     out_tables = []
#     for a in addresses:
#         address_sims_df = inference.get_similarities_to_atomic_set(
#             a, cleanup, k=11)
#         out = widgets.Output()
#         with out:
#             display(address_sims_df)
#         out_tables.append(out)
#     display(widgets.HBox(out_tables))

In [20]:
# retrieve_mode = "pooling"

# # Get table with token similarities for each "out-of-train" sentence.
# retrieved_contents = inference.infer(
#     memory.address_size,
#     cleanup,
#     memory,
#     sentences,
#     retrieve_mode=retrieve_mode,
#     remove_stopwords=True,
#     k=7, 
# )

# sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
      
# for s, c in zip(sentences, retrieved_contents):
#     sentence_sims_df = inference.get_similarities_to_atomic_set(
#         c, cleanup)
#     sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
#     sims_df = pd.concat([sims_df, sentence_sims_df])

# sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
#                  .set_index(['sentence', 'token'])

# display(sims_df)