In [1]:
import sys
import os

# Get the absolute path of the parent directory.
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

# Add the parent directory to the system path to be able to import modules from 'lib.'
sys.path.append(parent_dir)

In [2]:
import datasets

from datetime import datetime
import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from nltk.corpus import stopwords
import numpy as np
import random

import pandas as pd
import pathlib
import pickle

import string
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm

### Package options ###
torch.set_printoptions(threshold=10_000)

[nltk_data] Downloading package punkt to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
### Utils ###
def plot_heatmap(x: np.array, labels: np.array) -> None:
    plt.figure(figsize=(15, 15))
    sns.heatmap(
        x,
        linewidth=0.5,
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
        fmt='.2f',
    )
    plt.title(f'Self-attention matrix: layer {layer}, head {head}', fontsize=15)
    
    plt.show()
    return

def average_out_and_remove_rows(t: torch.tensor, averages_idx, remove_idx):
    for average_idx in averages_idx:  # The nested lists can have different dimensions.
        # Replace the attention scores of the first token with the average of the token attention scores.
        t[min(average_idx)] = torch.mean(t[average_idx], dim=0, keepdim=True)
    return t[~remove_idx]


def preprocess_attention_scores(attention_scores, averages_idx, remove_idx):
    attention_scores = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    attention_scores = attention_scores.transpose(0, 1)
    attention_scores = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    return attention_scores.transpose(0, 1)
        
    

def backward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    in_nodes = np.array([edge[0] for edge in list(G.in_edges(current_node))])
    in_nodes = in_nodes[(in_nodes > left_edge) & (in_nodes < current_node)]
    for node in in_nodes:
        sequence[node] = 1
        sequences.append(sequence)
        mean += G[node][current_node]['weight']
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, left_edge, node, sequence.copy(), mean)
        forward_pass(G, node, left_edge, current_node, sequence.copy(), mean)
        
    return
    
    
def forward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    out_nodes = np.array([edge[1] for edge in list(G.out_edges(current_node))])
    out_nodes = out_nodes[(out_nodes > current_node) & (out_nodes < right_edge)]
    for node in out_nodes:
        sequence[node] = 1
        mean += G[current_node][node]['weight']
        sequences.append(sequence)
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, current_node, node, sequence.copy(), mean)
        forward_pass(G, node, node, right_edge, sequence.copy(), mean)
            
    return
    

def construct_sequences(G: nx.DiGraph, n_tokens):
    for node in G.nodes():
        sequence = np.zeros(n_tokens)
        mean = 0
        sequence[node] = 1
        #sequences.append(sequence) # Do not allow for 1-token sequences.
        forward_pass(G, node, node, n_tokens, sequence.copy(), mean)

In [4]:
def save_memory(cleanup, memory):
    now = str(datetime.now()).replace(':', "-").replace('.', '-')
    
    if not os.path.exists('memories/method2'):
        os.makedirs('memories/method2')
    if not os.path.exists('cleanups/method2'):
        os.makedirs('cleanups/method2')
        
    with open(f'memories/method2/memory_{now}.pkl', 'wb') as outp:
        pickle.dump(memory, outp, pickle.HIGHEST_PROTOCOL)
    with open(f'cleanups/method2/cleanup_{now}.pkl', 'wb') as outp:
        pickle.dump(cleanup, outp, pickle.HIGHEST_PROTOCOL)

In [5]:
# Load Wikipedia dataset.
# TODO: Split between server and local.
#wiki_dataset = datasets.load_dataset("wikipedia", "20220301.en")['train']
wiki_dataset = datasets.load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir="/nfs/data/projects/daniela")['train']

Found cached dataset wikipedia (/nfs/data/projects/daniela/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Set device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seed.
utils.fix_seed(41)

Using seed: 41

In [7]:
# These parameters shouldn't change.
address_size = 1000
ema_time_period = 100000
learning_rate_update = 0

normalize = False 

as_threshold = 0.5


temperature = 0.05

prune_mode = None
max_size_address_space = 10

safeguard_bins=True
bin_score_threshold_type = 'static'
bin_score_threshold = 1e-8

safeguard_chunks=True
chunk_score_threshold = 0.8

In [8]:
cleanup = cleanup.Cleanup(address_size)

In [9]:
model_name = "bert-base-uncased"  # Has 12 layers
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

MAXIMUM_SEQUENCE_LENGTH = 512

In [10]:
memory = DSDM.DSDM(
    address_size=address_size,
    ema_time_period=ema_time_period,
    learning_rate_update=learning_rate_update,
    temperature=temperature,
    normalize=normalize,
    prune_mode=prune_mode,
    max_size_address_space=max_size_address_space,
    safeguard_bins=safeguard_bins,
    bin_score_threshold_type=bin_score_threshold_type,
    bin_score_threshold=bin_score_threshold,
    safeguard_chunks=safeguard_chunks,
    chunk_score_threshold=chunk_score_threshold,
)

In [11]:
# Initialize memory.
# memory = DSDM.DSDM(
#     address_size=address_size,
#     ema_time_period=ema_time_period,
#     learning_rate_update=learning_rate_update,
#     temperature=temperature,
#     normalize=normalize,
#     prune_mode=prune_mode,
#     max_size_address_space=max_size_address_space,
#     bin_score_threshold=bin_score_threshold,
#     chunk_score_threshold=chunk_score_threshold,
# ) 

In [12]:
# Construct train set (texts) and inference set (sentences; in and out of train set text).
# train_size = 10
# test_size = 10

# # Text indeces.
# train_idx = np.random.randint(0, len(wiki_dataset), size=train_size)
# #train_idx = np.append(np.append(np.append(train_idx[0], train_idx[0]), train_idx[0]), train_idx[0]) 

# # Caclulate chosen text statistics.
# # TODO

# # Text indeces from which we extract sentences.
# intest_idx = np.random.choice(train_idx, test_size)
# outtest_idx = np.random.choice(np.setdiff1d(np.arange(len(wiki_dataset)), train_idx), test_size)

In [13]:
# inference_sentences_in = []
# inference_sentences_out = []

# for idx_in, idx_out in zip(intest_idx, outtest_idx):
#     # Get sentences.
#     sentences_in = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_in)]['text'])
#     sentences_out = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_out)]['text'])
    
#     # Get sentence index.
#     sentence_idx_in = np.random.randint(0, len(sentences_in), size=1).item()
#     sentence_idx_out = np.random.randint(0, len(sentences_out), size=1).item()

#     # Append sentence to list.
#     inference_sentences_in.append(sentences_in[sentence_idx_in])
#     inference_sentences_out.append(sentences_out[sentence_idx_out])

In [14]:
train_size = 2

train_idx = np.random.randint(0, len(wiki_dataset) - 1000, size=1000000)
train_idx = train_idx[:train_size]
train_idx = np.append(np.array([6458629, 6458633, 6458645, 6458648, 6458659, 6458664, 6458665,
   6458667, 6458668, 6458573]), train_idx)

In [15]:
dups_found = 0

def remove_duplicates(memory):
    global dups_found
    global_keep_mask = torch.tensor([True] * len(memory.addresses)).to(device)
    
    for idx, address in enumerate(memory.addresses):
        if global_keep_mask[idx].item():
            cos = torch.nn.CosineSimilarity()
            keep_mask = cos(memory.addresses, address) < 0.95
            # Keep current address
            keep_mask[idx] = True
            global_keep_mask &= keep_mask

    if global_keep_mask.sum().item() > 0:
        dups_found += 1
        # Remove similar addresses
        memory.addresses = memory.addresses[global_keep_mask]
        # Remove bins
        memory.bins = memory.bins[global_keep_mask]
        # Remove chunk scores
        memory.chunk_scores = memory.chunk_scores[global_keep_mask]

In [16]:
# Training
for pos, i in enumerate(tqdm(train_idx)):
    memory.add_wiki_article(int(i))
    text = wiki_dataset[int(i)]['text']
    
    # Preprocess data. 
    sentences = preprocess.split_text_into_sentences(text)
    
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt")
        if inputs['input_ids'].shape[1] > MAXIMUM_SEQUENCE_LENGTH:
            break
        
        outputs = model(**inputs, output_attentions=True)
        attention_matrix = outputs.attentions
        
        encoding = tokenizer.encode(sentence)
        labels = tokenizer.convert_ids_to_tokens(encoding)

        i = 0
        averages_idx = []
        while i < len(labels) - 1:
            j = i + 1
            average_idx = []
            while labels[j].startswith('#'):
                average_idx.append(j)
                labels[i] += labels[j].replace('#', '')
                j += 1
            if average_idx != []:
                average_idx.append(i)
                averages_idx.append(average_idx)
            i = j
        
        hashtag_idx = np.array([label.startswith("#") for label in labels])
        stopwords_idx = np.array([label in stopwords.words('english') for label in labels])
        punctuation_idx = np.array([label in string.punctuation for label in labels])
        dash_idx = np.array([(len(label) == 1 and ord(label) == 8211) for label in labels])
        remove_idx = hashtag_idx | punctuation_idx | dash_idx | stopwords_idx
        labels = np.array(labels)[~remove_idx]
        labels = labels[1:(len(labels) - 1)]

        layer = 0
        for layer in range(12):
            for head in range(12):
                head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()

                head_scores_raw_tensor = preprocess_attention_scores(head_scores_raw_tensor, averages_idx, remove_idx)

                head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()

                head_scores = head_scores_raw[1:(len(head_scores_raw) - 1), 1:(len(head_scores_raw) - 1)].copy()

                head_scores[head_scores < as_threshold] = 0

                G = nx.from_numpy_array(head_scores, create_using=nx.DiGraph())

                sequences = []
                means = []
                n_tokens = len(labels)
                construct_sequences(G, n_tokens)

                df = pd.DataFrame(data=[sequences, means]).T.rename(columns={0: 'seq',  1: 'score'})
                    
                if len(df) > 0:
                    df['len'] = df['seq'].map(sum)
                    df['score'] = df['score'].astype('float64')
                    df = df.sort_values(by=['len', 'score'], ascending=[False, False]).reset_index(drop=True)
                    top3_df = df.head(1)

                    for i in range(len(top3_df)):
                        memory.save(
                            inference.generate_query(
                                address_size,
                                cleanup,
                                labels[top3_df['seq'][i].astype(bool)]
                            ),
                            top3_df['score'][i]
                        )
        memory.prune()
#     if (pos + 1) % 50 == 0:
#         remove_duplicates(memory)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:47<00:00,  3.95s/it]


In [17]:
#save_memory(cleanup, memory)

In [18]:
inference_sentences_in = [
    """Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.""",
    """In 1910, she was elected to the position of organizer and lecturer of the National WCTU.""",
    """Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.""",
    """With all other games played, a victory over Everton had put United top of the group on nine points.""",
    """The 2022 FA Women's League Cup Final was the 11th final of the FA Women's League Cup, England's secondary cup competition for women's football teams and its primary league cup tournament.""",
    """In 2020 Mico's single 'igare' awarded as the best song of the summer in Kiss Summer Awards.""",
    """She collected the speech and words of Dublin city and donated her collection to the Department of Irish Folklore at University College, Dublin.""",
    """Traditional palyanytsya was baked from yeast dough.""",
   """First, hops were boiled in a pot, which was then poured into a makitra, to which sifted wheat flour was added.""",
     """ Jonathan Holland of ScreenDaily deemed the film to be "superbly directed by Palomero, who seems to have a special gift for seeing the world through children's eyes." """   
]

In [19]:
retrieve_mode = "top_k"

# Get table with token similarities for each "out-of-train" sentence.
retrieved_contents = inference.infer(
    memory.address_size,
    cleanup,
    memory,
    inference_sentences_in,
    retrieve_mode=retrieve_mode,
    k=7, #TODO: What if index is out of range?
)

if retrieve_mode == "top_k":
    sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
    
    for s, addresses in zip(inference_sentences_in, retrieved_contents):
        display(s)
        out_tables = []
        for a in addresses:
            address_sims_df = inference.get_similarities_to_atomic_set(
                a, cleanup)
            out = widgets.Output()
            with out:
                display(address_sims_df)
            out_tables.append(out)
        display(widgets.HBox(out_tables))
elif retrieve_mode == "pooling":  
    sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
      
    for s, c in zip(inference_sentences_in, retrieved_contents):
        sentence_sims_df = inference.get_similarities_to_atomic_set(
            c, cleanup)
        sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
        sims_df = pd.concat([sims_df, sentence_sims_df])

    sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
                     .set_index(['sentence', 'token'])
    
    display(sims_df)
else:  # unrecognized
    pass

'Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

'In 1910, she was elected to the position of organizer and lecturer of the National WCTU.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

'Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

'With all other games played, a victory over Everton had put United top of the group on nine points.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

"The 2022 FA Women's League Cup Final was the 11th final of the FA Women's League Cup, England's secondary cup competition for women's football teams and its primary league cup tournament."

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

"In 2020 Mico's single 'igare' awarded as the best song of the summer in Kiss Summer Awards."

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

'She collected the speech and words of Dublin city and donated her collection to the Department of Irish Folklore at University College, Dublin.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

'Traditional palyanytsya was baked from yeast dough.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

'First, hops were boiled in a pot, which was then poured into a makitra, to which sifted wheat flour was added.'

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

' Jonathan Holland of ScreenDaily deemed the film to be "superbly directed by Palomero, who seems to have a special gift for seeing the world through children\'s eyes." '

HBox(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output()))

In [20]:
len(memory.addresses)

1331

In [21]:
memory.n_expansions

1331

In [22]:
memory.n_updates

1382

In [23]:
#memory.prune()

#addresses = [283, 478, 380, 359, 382, 354]
addresses = np.random.randint(0, len(memory.addresses), size=30)

for address in addresses:
    display(md(f"### Address {address}"))
    address_sims_df = inference.get_similarities_to_atomic_set(
            memory.addresses[address],
            cleanup,
    )
    display(address_sims_df)

### Address 114

Unnamed: 0,token,similarity
0,temperance,0.72
1,general,0.72
2,palomero,0.12
3,debut,0.1
4,life,0.09
5,2004,0.09
6,previously,0.08
7,common,0.08
8,cinematographer,0.08
9,manager,0.08


### Address 190

Unnamed: 0,token,similarity
0,pageant,0.7
1,author,0.7
2,her,0.1
3,broke,0.09
4,wins,0.09
5,competed,0.09
6,rewarded,0.08
7,which,0.08
8,february,0.08
9,front,0.08


### Address 1017

Unnamed: 0,token,similarity
0,time,0.72
1,asking,0.72
2,players,0.12
3,belief,0.1
4,marched,0.09
5,ranking,0.09
6,links,0.08
7,town,0.08
8,former,0.08
9,ecuadorian,0.08


### Address 1047

Unnamed: 0,token,similarity
0,finals,0.69
1,semi,0.69
2,representing,0.1
3,pregnant,0.09
4,bhradaigh,0.09
5,main,0.08
6,past,0.08
7,dublin,0.08
8,london,0.08
9,station,0.08


### Address 1301

Unnamed: 0,token,similarity
0,base,0.7
1,outer,0.7
2,elongated,0.1
3,gift,0.1
4,go,0.09
5,births,0.09
6,involving,0.08
7,production,0.08
8,makers,0.08
9,leaf,0.08


### Address 1321

Unnamed: 0,token,similarity
0,record,0.7
1,labels,0.7
2,early,0.1
3,membership,0.08
4,baker,0.08
5,dictionary,0.08
6,division,0.08
7,entities,0.08
8,northern,0.08
9,included,0.08


### Address 32

Unnamed: 0,token,similarity
0,village,0.7
1,church,0.7
2,crust,0.1
3,language,0.09
4,national,0.09
5,far,0.09
6,february,0.08
7,land,0.08
8,bottom,0.08
9,frequent,0.08


### Address 1017

Unnamed: 0,token,similarity
0,time,0.72
1,asking,0.72
2,players,0.12
3,belief,0.1
4,marched,0.09
5,ranking,0.09
6,links,0.08
7,town,0.08
8,former,0.08
9,ecuadorian,0.08


### Address 1279

Unnamed: 0,token,similarity
0,reddish,0.68
1,brown,0.68
2,teams,0.12
3,tennessee,0.09
4,bedford,0.08
5,tasmania,0.08
6,net,0.08
7,history,0.07
8,hour,0.07
9,saw,0.07


### Address 528

Unnamed: 0,token,similarity
0,ц,0.7
1,replaces,0.7
2,double,0.1
3,music,0.1
4,connemara,0.1
5,spirit,0.09
6,intention,0.09
7,god,0.09
8,reacting,0.08
9,area,0.08


### Address 127

Unnamed: 0,token,similarity
0,elected,0.7
1,position,0.7
2,brothers,0.1
3,coffee,0.1
4,usl,0.09
5,wins,0.08
6,alcoholism,0.08
7,weir,0.08
8,province,0.08
9,like,0.08


### Address 1124

Unnamed: 0,token,similarity
0,postponed,0.58
1,december,0.55
2,following,0.55
3,burn,0.11
4,tinged,0.11
5,beach,0.09
6,body,0.09
7,70th,0.09
8,without,0.09
9,9,0.09


### Address 70

Unnamed: 0,token,similarity
0,york,0.71
1,1860,0.71
2,kerr,0.1
3,julian,0.1
4,bay,0.09
5,200,0.08
6,legion,0.08
7,costume,0.08
8,gift,0.08
9,coffee,0.08


### Address 1117

Unnamed: 0,token,similarity
0,ham,0.7
1,west,0.7
2,new,0.12
3,sam,0.09
4,teenage,0.08
5,share,0.08
6,postponed,0.08
7,were,0.08
8,woman,0.08
9,baking,0.08


### Address 1039

Unnamed: 0,token,similarity
0,dominance,0.71
1,rewarded,0.71
2,committee,0.11
3,fifth,0.1
4,label,0.1
5,net,0.1
6,also,0.09
7,palomero,0.09
8,sales,0.08
9,traffic,0.08


### Address 975

Unnamed: 0,token,similarity
0,debuts,0.7
1,five,0.7
2,longitudinal,0.1
3,take,0.1
4,railway,0.1
5,open,0.09
6,republic,0.09
7,minutes,0.09
8,boys,0.08
9,academy,0.08


### Address 480

Unnamed: 0,token,similarity
0,cooled,0.7
1,dough,0.7
2,baker,0.1
3,soccer,0.1
4,bay,0.09
5,right,0.09
6,blew,0.08
7,manager,0.08
8,well,0.08
9,parade,0.08


### Address 698

Unnamed: 0,token,similarity
0,final,0.59
1,league,0.57
2,cup,0.57
3,winter,0.11
4,dominance,0.09
5,saw,0.09
6,held,0.09
7,hundreds,0.08
8,79,0.08
9,constitution,0.08


### Address 963

Unnamed: 0,token,similarity
0,leicester,0.6
1,promoted,0.6
2,newly,0.56
3,of,0.11
4,personal,0.1
5,who,0.09
6,2,0.09
7,header,0.09
8,foreign,0.08
9,speech,0.08


### Address 891

Unnamed: 0,token,similarity
0,ball,0.69
1,feed,0.69
2,union,0.1
3,late,0.09
4,fails,0.08
5,organizer,0.08
6,third,0.08
7,member,0.08
8,rise,0.08
9,fleming,0.08


### Address 654

Unnamed: 0,token,similarity
0,musician,0.71
1,professionally,0.71
2,refer,0.12
3,tend,0.09
4,popular,0.08
5,wctu,0.08
6,open,0.08
7,through,0.08
8,line,0.07
9,liverpool,0.07


### Address 1237

Unnamed: 0,token,similarity
0,cards,0.71
1,yellow,0.71
2,later,0.13
3,shot,0.09
4,3,0.09
5,additional,0.08
6,paaltjasker,0.08
7,previously,0.08
8,russo,0.08
9,1913,0.08


### Address 837

Unnamed: 0,token,similarity
0,championship,0.51
1,much,0.5
2,surrendered,0.49
3,side,0.49
4,1913,0.11
5,stages,0.1
6,visit,0.09
7,30,0.09
8,louis,0.09
9,ridge,0.09


### Address 605

Unnamed: 0,token,similarity
0,red,0.68
1,wolves,0.68
2,палити,0.09
3,way,0.09
4,two,0.08
5,leicester,0.08
6,releases,0.08
7,snails,0.08
8,miss,0.08
9,late,0.07


### Address 412

Unnamed: 0,token,similarity
0,shooting,0.52
1,los,0.5
2,locations,0.49
3,included,0.48
4,calmly,0.1
5,11,0.09
6,author,0.09
7,screenplay,0.08
8,club,0.08
9,soccer,0.08


### Address 898

Unnamed: 0,token,similarity
0,ball,0.7
1,shot,0.7
2,successful,0.11
3,cards,0.09
4,due,0.09
5,losing,0.08
6,other,0.08
7,showing,0.08
8,surrendered,0.08
9,charlton,0.08


### Address 1106

Unnamed: 0,token,similarity
0,reach,0.7
1,final,0.7
2,word,0.1
3,1910,0.09
4,pivotal,0.09
5,invasion,0.08
6,previous,0.08
7,seeing,0.08
8,greenwood,0.08
9,winning,0.08


### Address 262

Unnamed: 0,token,similarity
0,20th,0.71
1,century,0.71
2,fernandez,0.09
3,scoring,0.09
4,rhomboidal,0.09
5,places,0.09
6,recitations,0.08
7,rolling,0.08
8,civil,0.08
9,representing,0.08


### Address 219

Unnamed: 0,token,similarity
0,professional,0.7
1,assistance,0.7
2,outer,0.1
3,load,0.09
4,russian,0.09
5,ukraine,0.08
6,society,0.08
7,used,0.08
8,mills,0.08
9,for,0.08


### Address 314

Unnamed: 0,token,similarity
0,school,0.48
1,high,0.47
2,st,0.45
3,attended,0.44
4,louis,0.43
5,fernandez,0.1
6,makitra,0.09
7,1998,0.09
8,marlborough,0.09
9,8,0.09


In [24]:
# import gensim.downloader as api
# from sklearn.manifold import TSNE

In [25]:
#Load pre-trained word embeddings (Word2Vec in this example)
# word_vectors = api.load("word2vec-google-news-300")

In [26]:
# %%capture
# address_embeddings = []
# address_concepts = []
# addresses = []
# bins = []
# chunk_scores = []

# for idx, address in enumerate(memory.addresses):
#     tokens = inference.get_most_similar_HVs(inference.get_similarities_to_atomic_set(address, cleanup))
#     embeddings = [word_vectors[word] for word in tokens if word in word_vectors]
#     if embeddings:
#         addresses.append(idx)
#         bins.append(memory.scores[idx, 1].item())
#         chunk_scores.append(memory.scores[idx, 0].item())
#         address_concepts.append(" ".join(tokens))
#         address_embeddings.append(sum(embeddings) / len(embeddings))

In [27]:
# reduced_embeddings = TSNE(n_components=2, random_state=42, perplexity=2).fit_transform(np.array(address_embeddings))

# df = pd.DataFrame(reduced_embeddings, columns=["Dimension 1", "Dimension 2"])
# df["Address"] = addresses
# df["Chunk"] = address_concepts
# df['Bin'] = bins
# df['Chunk-score'] = chunk_scores

In [28]:
# import plotly.express as px

# fig = px.scatter(
#     df, x="Dimension 1", y="Dimension 2",
#     text="Chunk", hover_data=["Address", "Bin", "Chunk-score"],
#     title="Memory concepts"
# )
# fig.show()

In [29]:
#inference_sentences_in = ["The Society convenes an annual conference, in locations across the United States and in Canada, usually in June, to convey the James Alice award."]
#inference_sentences_in = ["Deputy director flys to the United States."]
#inference_sentences_in = []

## Appendix

In [30]:
inference_sentences_out[21]

NameError: name 'inference_sentences_out' is not defined

In [None]:
#text = """In a letter to Tennessee military governor Andrew Johnson encouraging him to lead the way in raising black troops, Lincoln wrote, "The bare sight of fifty thousand armed, and drilled black soldiers on the banks of the Mississippi would end the rebellion at once". """
text = inference_sentences_out[21]
inputs = tokenizer(text, return_tensors="pt")

In [None]:
outputs = model(**inputs, output_attentions=True)
attention_matrix = outputs.attentions

In [None]:
encoding = tokenizer.encode(text)
labels = tokenizer.convert_ids_to_tokens(encoding)

In [None]:
i = 0
averages_idx = []
while i < len(labels) - 1:
    j = i + 1
    average_idx = []
    while labels[j].startswith('#'):
        average_idx.append(j)
        labels[i] += labels[j].replace('#', '')
        j += 1
    if average_idx != []:
        average_idx.append(i)
        averages_idx.append(average_idx)
    i = j

hashtag_idx = np.array([label.startswith("#") for label in labels])
stopwords_idx = np.array([label in stopwords.words('english') for label in labels])
punctuation_idx = np.array([label in string.punctuation for label in labels])
remove_idx = hashtag_idx | punctuation_idx | stopwords_idx
labels = np.array(labels)[~remove_idx]
labels = labels[1:(len(labels) - 1)]
print(labels)

In [None]:
layer = 0

for head in range(12):
    head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()
    
    head_scores_raw_tensor = preprocess_attention_scores(
        head_scores_raw_tensor, averages_idx, remove_idx
    )
        
    head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()
    
    head_scores = head_scores_raw[1:(len(head_scores_raw) - 1), 1:(len(head_scores_raw) - 1)].copy()

    as_threshold = 0.4
    head_scores[head_scores < as_threshold] = 0
    plot_heatmap(head_scores, labels)
    
    G = nx.from_numpy_array(head_scores, create_using = nx.DiGraph())
    G.edges.data()

    sequences = []
    means= []
    n_tokens = len(labels)
    construct_sequences(G, n_tokens)
        
    df = pd.DataFrame(data=[sequences, means]).T.rename(columns={0: 'seq',  1: 'score'})
    if len(df) > 0:
        df['len'] = df['seq'].map(sum)
        df['score'] = df['score'].astype('float64')
        df = df.sort_values(by=['score', 'len'], ascending=[False, False]).reset_index(drop=True)
        top3_df = df.head(3)
        display(df)
    
        for i in range(len(top3_df)):
            print(labels[top3_df['seq'][i].astype(bool)], top3_df['score'][i])
    
    #if sequences != []:
        #layer_sequences.append(sequences)
#     if sequences != []:
#         print(head)
#         for seq in sequences:
#             print(labels[seq.astype(bool)])

In [None]:
# text = "Firenze firenze"
# encoding = tokenizer.encode(text)
# labels = tokenizer.convert_ids_to_tokens(encoding)

In [None]:
# i = 0
# averages_idx = []
# while i < len(labels) - 1:
#     j = i + 1
#     average_idx = []
#     while labels[j].startswith('#'):
#         average_idx.append(j)
#         labels[i] += labels[j].replace('#', '')
#         j += 1
#     if average_idx != []:
#         average_idx.append(i)
#         averages_idx.append(average_idx)
#     i = j

# hashtag_idx = np.array([label.startswith("#") for label in labels])
# labels = np.array(labels)[~hashtag_idx]

In [None]:
# Torch implementation.

# t = torch.tensor(head_scores_raw)
# i = torch.tensor(averages_idx)

# t[i] = torch.mean(t[i], dim=1, keepdim=True)
# t = torch.unique_consecutive(t, dim=0)
# t = torch.transpose(t, 0, 1)
# t[i] = torch.mean(t[i], dim=1, keepdim=True)
# t = torch.unique_consecutive(t, dim=0)

# t = torch.transpose(t, 0, 1)