In [1]:
import sys
import os

# Get the absolute path of the parent directory.
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

# Add the parent directory to the system path to be able to import modules from 'lib.'
sys.path.append(parent_dir)

In [2]:
%%capture
import datasets

import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, utils 

import math
import numpy as np
import random

import pandas as pd
import pathlib
import pickle

import string


### Package options ###
pd.set_option('display.max_rows', 500)

In [3]:
# Set seed.
utils.fix_seed(41)

Using seed: 41

In [4]:
### Utils ###
def get_results(
    sentences: list[str],
    retrieve_mode: str,
    remove_stopwords_inference: bool = True
) -> None:
    
    # Retrieve content from memory.
    retrieved_contents = inference.infer(
        memory.address_size,
        cleanup,
        memory,
        sentences,
        retrieve_mode=retrieve_mode,
        remove_stopwords=remove_stopwords_inference,
        k=7, 
    )
    
    if retrieve_mode == "top_k":
        sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
    
        for s, addresses in zip(sentences, retrieved_contents):
            display(md(f"<ins>**Sentence:**</ins> {s}"))
            out_tables = []
            for a in addresses:
                address_sims_df = inference.get_similarities_to_atomic_set(
                    a, cleanup, k=11)
                out = widgets.Output()
                with out:
                    display(address_sims_df)
                out_tables.append(out)
            display(widgets.HBox(out_tables))
        return
    elif retrieve_mode == "pooling":  
        sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 

        for s, c in zip(sentences, retrieved_contents):
            sentence_sims_df = inference.get_similarities_to_atomic_set(
                c, cleanup)
            sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
            sims_df = pd.concat([sims_df, sentence_sims_df])

        sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
                         .set_index(['sentence', 'token'])

        display(sims_df)
        return
    else:  # unrecognized
        display(md("Unrecognized retrieval mode."))
        return

In [5]:
in_sentences = [
    """Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.""",
    """In 1910, she was elected to the position of organizer and lecturer of the National WCTU.""",
    """Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.""",
    """With all other games played, a victory over Everton had put United top of the group on nine points.""",
    """The 2022 FA Women's League Cup Final was the 11th final of the FA Women's League Cup, England's secondary cup competition for women's football teams and its primary league cup tournament.""",
    """In 2020 Mico's single 'igare' awarded as the best song of the summer in Kiss Summer Awards.""",
    """She collected the speech and words of Dublin city and donated her collection to the Department of Irish Folklore at University College, Dublin.""",
    """Traditional palyanytsya was baked from yeast dough.""",
   """First, hops were boiled in a pot, which was then poured into a makitra, to which sifted wheat flour was added.""",
     """ Jonathan Holland of ScreenDaily deemed the film to be "superbly directed by Palomero, who seems to have a special gift for seeing the world through children's eyes." """   
]

In [6]:
out_sentences = [
    """As the population of all of the towns grew, the need for better transportation between them also grew.""",
    """The construction of the line was the subject of a legal challenge.""",
    """The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.""",
    """Dangerous heat stress events will spread rapidly across the world as global heating continues.""",
    """Whether or not history will determine that we are living in an ever more divided culture, it certainly feels that way."""
]

In [7]:
# Parametrized cell.
filename = "2023-09-10 13-01-52-622408.pkl"
experiment_no = 1

In [8]:
# Load memory and associated cleanup.
cleanup = pickle.load(open(f"cleanups/method2/cleanup_{filename}",'rb'))
memory = pickle.load(open(f"memories/method2/memory_{filename}",'rb'))

In [9]:
display(md(f"# Mining Transfomer attention - Experiment {experiment_no}"))

# Mining Transfomer attention - Experiment 1

## Memory visualiztion
### Statistics

In [10]:
display(md(f"Number of trained articles: {len(memory.wiki_articles)}"))
display(md(f"Number of memory updates: {memory.n_updates}"))
display(md(f"Number of memory expansions: {memory.n_expansions}"))
display(md(f"Updates percentage: {round(memory.n_updates / (memory.n_updates + memory.n_expansions), 3)}%"))
display(md(f"Number of existing memory addresses: {len(memory.addresses)}"))

Number of trained articles: 5010

Number of memory updates: 234448

Number of memory expansions: 136896

Updates percentage: 0.631%

Number of existing memory addresses: 136896

### Addresses

In [11]:
# addresses = np.random.randint(0, len(memory.addresses), size=30)

# for address in addresses:
#     display(md(f"### <ins>Address {address}</ins>"))
#     display(md(f"Address **chunk score:** {memory.scores[address][0]}, **bin score:** {memory.scores[address][1]}"))
#     address_sims_df = inference.get_similarities_to_atomic_set(
#             memory.addresses[address],
#             cleanup,
#     )
#     display(address_sims_df)

## In-sample sentences
### W/ stop words in inference sentence
#### Closest addresses

In [12]:
get_results(in_sentences[:3], "top_k", False)

<ins>**Sentence:**</ins> Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> In 1910, she was elected to the position of organizer and lecturer of the National WCTU.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

#### Pooled address space

In [13]:
get_results(in_sentences[:3], "pooling", False)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",the,0.6
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",of,0.51
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",to,0.26
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",wctu,0.26
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",in,0.24
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",and,0.2
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",was,0.19
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",a,0.16
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",claiming,0.12
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",diagnosed,0.12


### W/o stop words in inference sentence

In [14]:
get_results(in_sentences[:3], "top_k", True)

<ins>**Sentence:**</ins> Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> In 1910, she was elected to the position of organizer and lecturer of the National WCTU.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

In [15]:
get_results(in_sentences[:3], "pooling", True)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",the,0.47
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",of,0.4
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",to,0.39
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",wctu,0.39
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",in,0.24
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",a,0.22
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",and,0.2
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",was,0.18
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",is,0.15
"In 1910, she was elected to the position of organizer and lecturer of the National WCTU.",on,0.14


## Out-of-sample sentences
### W/ stop words in inference sentence
#### Closest addresses

In [16]:
get_results(out_sentences[:3], "top_k", False)

<ins>**Sentence:**</ins> As the population of all of the towns grew, the need for better transportation between them also grew.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> The construction of the line was the subject of a legal challenge.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

#### Pooled address space

In [17]:
get_results(out_sentences[:3], "pooling", False)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",of,0.53
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",the,0.5
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",to,0.29
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",wctu,0.29
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",a,0.27
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",extremes,0.27
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",in,0.21
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",and,0.18
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",was,0.17
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",an,0.14


### W/o stop words in inference sentence
#### Closest addresses

In [18]:
get_results(out_sentences[:3], "top_k", True)

<ins>**Sentence:**</ins> As the population of all of the towns grew, the need for better transportation between them also grew.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> The construction of the line was the subject of a legal challenge.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

<ins>**Sentence:**</ins> The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

#### Pooled address space

In [19]:
get_results(out_sentences[:3], "pooling", True)

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",the,0.47
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",of,0.42
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",a,0.33
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",extremes,0.33
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",in,0.26
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",to,0.21
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",wctu,0.21
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",and,0.2
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",was,0.2
"The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.",an,0.16


In [20]:
# # Set retrieve mode.
# retrieve_mode = "top_k"

# # Get table with token similarities for each "out-of-train" sentence.
# retrieved_contents = inference.infer(
#     memory.address_size,
#     cleanup,
#     memory,
#     sentences,
#     retrieve_mode=retrieve_mode,
#     remove_stopwords=True,
#     k=7, 
# )


# sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 

# for s, addresses in zip(sentences, retrieved_contents):
#     display(s)
#     out_tables = []
#     for a in addresses:
#         address_sims_df = inference.get_similarities_to_atomic_set(
#             a, cleanup, k=11)
#         out = widgets.Output()
#         with out:
#             display(address_sims_df)
#         out_tables.append(out)
#     display(widgets.HBox(out_tables))

In [21]:
# retrieve_mode = "pooling"

# # Get table with token similarities for each "out-of-train" sentence.
# retrieved_contents = inference.infer(
#     memory.address_size,
#     cleanup,
#     memory,
#     sentences,
#     retrieve_mode=retrieve_mode,
#     remove_stopwords=True,
#     k=7, 
# )

# sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
      
# for s, c in zip(sentences, retrieved_contents):
#     sentence_sims_df = inference.get_similarities_to_atomic_set(
#         c, cleanup)
#     sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
#     sims_df = pd.concat([sims_df, sentence_sims_df])

# sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
#                  .set_index(['sentence', 'token'])

# display(sims_df)