In [1]:
import sys
import os

# Absolute path of the parent of the current working directory. Notebooks do not
# define `__file__`, so the path is anchored at the working directory explicitly
# (the previous `os.path.dirname("__file__")` took the dirname of a string
# literal, which is "" and therefore resolved to the same location).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Add the parent directory to the system path to be able to import modules from
# 'lib'. Guarded so that re-running this cell does not append duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import datasets

import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from nltk.corpus import stopwords
import numpy as np
import random

import pandas as pd
import pathlib

import string
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm

### Package options ###
# Raise the element-count threshold above which torch abbreviates printed tensors.
torch.set_printoptions(threshold=10_000)

[nltk_data] Downloading package punkt to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
### Utils ###
def plot_heatmap(x: np.ndarray, labels: np.ndarray, layer=None, head=None) -> None:
    """Draw an annotated seaborn heatmap of an attention-score matrix.

    Args:
        x: Array of scores handed to `sns.heatmap`.
        labels: Tick labels, used for both the x and the y axis.
        layer: Layer number shown in the title. When omitted, the notebook-level
            variable `layer` is used if it exists, otherwise "?".
        head: Head number shown in the title; same fallback rule as `layer`.
    """
    # Existing call sites pass only (x, labels) and rely on notebook globals for
    # the title, so fall back to those instead of failing with a NameError.
    notebook_vars = globals()
    if layer is None:
        layer = notebook_vars.get('layer', '?')
    if head is None:
        head = notebook_vars.get('head', '?')

    plt.figure(figsize=(15, 15))
    sns.heatmap(
        x,
        linewidth=0.5,
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
        fmt='.2f',
    )
    plt.title(f'Self-attention matrix: layer {layer}, head {head}', fontsize=15)

    plt.show()

def average_out_and_remove_rows(t: torch.tensor, averages_idx, remove_idx):
    """Collapse groups of rows into their mean, then drop the flagged rows.

    For every index group in `averages_idx`, the row at the group's smallest
    index is overwritten (in place, on `t`) with the mean of all rows in the
    group. Afterwards the rows flagged in the boolean mask `remove_idx` are
    filtered out and the remaining rows are returned.

    Args:
        t: Tensor whose first dimension is indexed by the groups and the mask.
        averages_idx: Iterable of index lists; the lists may differ in length.
        remove_idx: Boolean mask over the rows of `t`; True marks a row to drop.

    Returns:
        The rows of `t` whose mask entry is False.
    """
    keep_mask = ~remove_idx
    for group in averages_idx:
        first_row = min(group)
        group_mean = torch.mean(t[group], dim=0, keepdim=True)
        # The group's first row now carries the averaged scores of the whole group.
        t[first_row] = group_mean
    return t[keep_mask]


def preprocess_attention_scores(attention_scores, averages_idx, remove_idx):
    """Apply the average-and-drop step along both axes of a score matrix.

    `average_out_and_remove_rows` is run on the rows first; the result is then
    transposed so the same routine handles the columns, and transposed back
    before being returned.

    Args:
        attention_scores: 2-D tensor of scores (transposed with dims 0 and 1).
        averages_idx: Index groups forwarded to `average_out_and_remove_rows`.
        remove_idx: Boolean mask forwarded to `average_out_and_remove_rows`.

    Returns:
        The tensor after both passes, in the original row/column orientation.
    """
    rows_done = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    cols_done = average_out_and_remove_rows(rows_done.transpose(0, 1), averages_idx, remove_idx)
    return cols_done.transpose(0, 1)
        
    

def backward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    """Extend a token sequence with nodes that point into `current_node`.

    Considers every node that has an edge into `current_node` and whose index
    lies strictly between `left_edge` and `current_node`. Each such node is
    switched on in `sequence`, the weight of its edge into `current_node` is
    added to `mean`, and a (sequence, rounded average weight) pair is recorded
    in the notebook-level lists `sequences` and `means`. The search then
    recurses backward and forward from the newly added node.

    Args:
        G: Graph exposing `in_edges(node)` and `G[u][v]['weight']`.
        current_node: Node whose incoming edges are explored.
        left_edge: Exclusive lower bound for candidate node indices.
        right_edge: Not read by this function.
        sequence: 0/1 NumPy vector over token positions; modified in place.
        mean: Running sum of the edge weights collected so far.
    """
    in_nodes = np.array([edge[0] for edge in list(G.in_edges(current_node))])
    in_nodes = in_nodes[(in_nodes > left_edge) & (in_nodes < current_node)]
    # NOTE(review): `sequence` and `mean` carry over from one candidate to the
    # next inside this loop, so later candidates are recorded together with the
    # earlier ones — confirm this accumulation is intended.
    for node in in_nodes:
        sequence[node] = 1
        mean += G[node][current_node]['weight']
        # Record a snapshot: `sequence` is modified again by later iterations,
        # so appending the array itself would rewrite already stored entries
        # and leave them inconsistent with the recorded means.
        sequences.append(sequence.copy())
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, left_edge, node, sequence.copy(), mean)
        forward_pass(G, node, left_edge, current_node, sequence.copy(), mean)

    return
    
    
def forward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    """Extend a token sequence with nodes that `current_node` points to.

    Considers every node reachable through an outgoing edge of `current_node`
    whose index lies strictly between `current_node` and `right_edge`. Each
    such node is switched on in `sequence`, the weight of the connecting edge
    is added to `mean`, and a (sequence, rounded average weight) pair is
    recorded in the notebook-level lists `sequences` and `means`. The search
    then recurses backward and forward from the newly added node.

    Args:
        G: Graph exposing `out_edges(node)` and `G[u][v]['weight']`.
        current_node: Node whose outgoing edges are explored.
        left_edge: Not read by this function.
        right_edge: Exclusive upper bound for candidate node indices.
        sequence: 0/1 NumPy vector over token positions; modified in place.
        mean: Running sum of the edge weights collected so far.
    """
    out_nodes = np.array([edge[1] for edge in list(G.out_edges(current_node))])
    out_nodes = out_nodes[(out_nodes > current_node) & (out_nodes < right_edge)]
    # NOTE(review): `sequence` and `mean` carry over from one candidate to the
    # next inside this loop, so later candidates are recorded together with the
    # earlier ones — confirm this accumulation is intended.
    for node in out_nodes:
        sequence[node] = 1
        mean += G[current_node][node]['weight']
        # Record a snapshot: `sequence` is modified again by later iterations,
        # so appending the array itself would rewrite already stored entries
        # and leave them inconsistent with the recorded means.
        sequences.append(sequence.copy())
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, current_node, node, sequence.copy(), mean)
        forward_pass(G, node, node, right_edge, sequence.copy(), mean)

    return
    

def construct_sequences(G: nx.DiGraph, n_tokens):
    """Launch a forward search from every node of `G`.

    Each search starts with a vector of length `n_tokens` in which only the
    start node is set, a running weight sum of 0, the start node as left bound
    and `n_tokens` as right bound. Results are collected by `forward_pass`.

    Args:
        G: Graph whose nodes are used as start positions.
        n_tokens: Length of the 0/1 sequence vectors.
    """
    for start in G.nodes():
        one_hot = np.zeros(n_tokens)
        one_hot[start] = 1
        # The single-token vector itself is not recorded: 1-token sequences are
        # not allowed, only the extensions found by the forward search.
        forward_pass(G, start, start, n_tokens, one_hot.copy(), 0)

In [4]:
# Load Wikipedia dataset.
# The cache location can be overridden through the WIKI_CACHE_DIR environment
# variable (e.g. for a local run); the server path stays the default.
wiki_cache_dir = os.environ.get("WIKI_CACHE_DIR", "/nfs/data/projects/daniela")
wiki_dataset = datasets.load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir=wiki_cache_dir,
)['train']

Found cached dataset wikipedia (/nfs/data/projects/daniela/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Choose the torch device: CUDA when it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fix the seed through the project helper.
utils.fix_seed(41)

Using seed: 41

In [6]:
# DSDM hyperparameters (passed to the DSDM constructor further down).

# Sizes.
address_size = 1000
max_size_address_space = 3000

# Update and retrieval parameters.
learning_rate_update = 0.5
ema_time_period = 7000 # 500
temperature = 0.05
normalize = False

# Pruning.
prune_mode = "fixed-size"
chunk_score_threshold = 0.9

#chunk_sizes = [5]

In [7]:
# Import the module under a private alias so this cell can be re-run: the name
# `cleanup` is rebound to the instance below, so looking up `.Cleanup` on it a
# second time would no longer reach the module.
from lib.utils import cleanup as _cleanup_module

cleanup = _cleanup_module.Cleanup(address_size)

In [8]:
MAXIMUM_SEQUENCE_LENGTH = 512

# Pretrained checkpoint (has 12 layers); model and tokenizer are loaded from the same name.
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# Create the DSDM memory from the hyperparameters defined above.
memory = DSDM.DSDM(
    # Sizes.
    address_size=address_size,
    max_size_address_space=max_size_address_space,
    # Update and retrieval parameters.
    learning_rate_update=learning_rate_update,
    ema_time_period=ema_time_period,
    temperature=temperature,
    normalize=normalize,
    # Pruning.
    prune_mode=prune_mode,
    chunk_score_threshold=chunk_score_threshold,
)

In [10]:
# Build the index sets: the texts used for training, and the texts from which
# inference sentences are drawn (inside and outside of the training texts).
train_size, test_size = 10, 10

# Text indices of the training set.
train_idx = np.random.randint(low=0, high=len(wiki_dataset), size=train_size)
#train_idx = np.append(np.append(np.append(train_idx[0], train_idx[0]), train_idx[0]), train_idx[0]) 

# Calculate chosen text statistics.
# TODO

# Text indices from which we extract sentences: first drawn from the training
# texts, then from every index that is not a training text.
intest_idx = np.random.choice(train_idx, size=test_size)
outtest_idx = np.random.choice(
    np.setdiff1d(np.arange(len(wiki_dataset)), train_idx),
    size=test_size,
)

In [11]:
def _pick_random_sentence(sentences):
    """Return one randomly chosen element of the non-empty list `sentences`."""
    # Called without `size=`, randint returns a scalar, so the array-to-int
    # conversion (deprecated in recent NumPy releases) is no longer needed.
    return sentences[np.random.randint(0, len(sentences))]


inference_sentences_in = []
inference_sentences_out = []

for idx_in, idx_out in zip(intest_idx, outtest_idx):
    # Get sentences.
    sentences_in = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_in)]['text'])
    sentences_out = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_out)]['text'])

    # Draw one sentence per text (in-train first, then out-of-train) and store it.
    inference_sentences_in.append(_pick_random_sentence(sentences_in))
    inference_sentences_out.append(_pick_random_sentence(sentences_out))

In [12]:
dups_found = 0


def remove_duplicates(memory):
    """Remove near-duplicate addresses from `memory` in place.

    Addresses are visited in order. For every address that is still kept, all
    other addresses whose cosine similarity to it is 0.95 or higher are marked
    for removal. If anything was marked, `memory.addresses`, `memory.bins` and
    `memory.chunk_scores` are filtered with the same mask and the
    notebook-level counter `dups_found` is incremented by one.

    Args:
        memory: Object with tensor attributes `addresses`, `bins` and
            `chunk_scores` that share their first dimension.
    """
    global dups_found

    addresses = memory.addresses
    if len(addresses) == 0:
        return

    # Build the mask on the addresses' own device so the in-place `&=` below
    # never mixes devices.
    global_keep_mask = torch.ones(len(addresses), dtype=torch.bool, device=addresses.device)
    cos = torch.nn.CosineSimilarity()

    for idx, address in enumerate(addresses):
        if global_keep_mask[idx].item():
            keep_mask = cos(addresses, address) < 0.95
            # Keep current address
            keep_mask[idx] = True
            global_keep_mask &= keep_mask

    # Count (and filter) only when at least one address was actually dropped.
    if not global_keep_mask.all().item():
        dups_found += 1
        # Remove similar addresses
        memory.addresses = addresses[global_keep_mask]
        # Remove bins
        memory.bins = memory.bins[global_keep_mask]
        # Remove chunk scores
        memory.chunk_scores = memory.chunk_scores[global_keep_mask]

In [None]:
# Training: for every sampled text, extract high-attention token sequences from
# the first-layer attention heads and save the best ones to the memory.

# Build the stopword set once; the lookup is needed for every token.
english_stopwords = set(stopwords.words('english'))

for pos, text_idx in enumerate(tqdm(train_idx)):
    text = wiki_dataset[int(text_idx)]['text']

    # Preprocess data.
    sentences = preprocess.split_text_into_sentences(text)

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt")
        if inputs['input_ids'].shape[1] > MAXIMUM_SEQUENCE_LENGTH:
            # NOTE(review): this abandons the remaining sentences of the text,
            # not only the over-long one — confirm `continue` is not intended.
            break

        # Inference only: no autograd graph is needed for the attention maps.
        with torch.no_grad():
            outputs = model(**inputs, output_attentions=True)
        attention_matrix = outputs.attentions

        # Reuse the ids that were just fed to the model instead of tokenizing again.
        labels = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())

        # Merge WordPiece continuation tokens ("##...") into the token that
        # starts the word and remember each merged group of positions.
        token_pos = 0
        averages_idx = []
        while token_pos < len(labels) - 1:
            next_pos = token_pos + 1
            average_idx = []
            while next_pos < len(labels) and labels[next_pos].startswith('##'):
                average_idx.append(next_pos)
                labels[token_pos] += labels[next_pos][2:]
                next_pos += 1
            if average_idx:
                average_idx.append(token_pos)
                averages_idx.append(average_idx)
            token_pos = next_pos

        hashtag_idx = np.array([label.startswith("##") for label in labels])
        stopwords_idx = np.array([label in english_stopwords for label in labels])
        punctuation_idx = np.array([label in string.punctuation for label in labels])
        remove_idx = hashtag_idx | punctuation_idx | stopwords_idx
        labels = np.array(labels)[~remove_idx]
        # Drop the first and the last remaining token (the special tokens at the
        # sentence boundaries); the score matrix is trimmed the same way below.
        labels = labels[1:(len(labels) - 1)]
        n_tokens = len(labels)

        layer = 0
        as_threshold = 0.5
        n_heads = attention_matrix[layer].shape[1]
        for head in range(n_heads):
            head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()

            head_scores_raw_tensor = preprocess_attention_scores(head_scores_raw_tensor, averages_idx, remove_idx)

            head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()

            head_scores = head_scores_raw[1:(len(head_scores_raw) - 1), 1:(len(head_scores_raw) - 1)].copy()

            # Keep only attention scores at or above the threshold as graph edges.
            head_scores[head_scores < as_threshold] = 0

            G = nx.from_numpy_array(head_scores, create_using=nx.DiGraph())

            # `construct_sequences` fills these two notebook-level lists.
            sequences = []
            means = []
            construct_sequences(G, n_tokens)

            df = pd.DataFrame(data=[sequences, means]).T.rename(columns={0: 'seq',  1: 'score'})

            if len(df) > 0:
                df['len'] = df['seq'].map(sum)
                df['score'] = df['score'].astype('float64')
                df = df.sort_values(by=['score', 'len'], ascending=[False, False]).reset_index(drop=True)
                top3_df = df.head(3)

                # Save the (up to) three best-scoring sequences of this head.
                for row in range(len(top3_df)):
                    memory.save(
                        inference.generate_query(
                            address_size,
                            cleanup,
                            labels[top3_df['seq'][row].astype(bool)]
                        ),
                        top3_df['score'][row]
                    )
        memory.prune()
#     if (pos + 1) % 50 == 0:
#         remove_duplicates(memory)

 40%|████████████████████████████████████████████████████████▊                                                                                     | 4/10 [01:15<01:46, 17.79s/it]

In [None]:
# Replace the sampled in-train sentences with a single hand-written probe.
# Alternative probe:
#inference_sentences_in = ["The Society convenes an annual conference, in locations across the United States and in Canada, usually in June, to convey the James Alice award."]
inference_sentences_in = [
    "Deputy director flys to the United States.",
]

In [None]:
# Number of addresses currently held by the memory.
len(memory.addresses)

In [None]:
retrieve_mode = "top_k"

# Never request more addresses than the memory currently holds.
# NOTE(review): assumes `k` is the number of addresses to retrieve — confirm
# against `inference.infer`.
top_k = min(7, len(memory.addresses))

# Retrieve memory contents for each sentence in `inference_sentences_in`.
retrieved_contents = inference.infer(
    memory.address_size,
    cleanup,
    memory,
    inference_sentences_in,
    retrieve_mode=retrieve_mode,
    k=top_k,
)

sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity'])

if retrieve_mode == "top_k":
    # One row of side-by-side similarity tables per sentence, one table per
    # retrieved address.
    for s, addresses in zip(inference_sentences_in, retrieved_contents):
        display(s)
        out_tables = []
        for a in addresses:
            address_sims_df = inference.get_similarities_to_atomic_set(
                a, cleanup)
            out = widgets.Output()
            with out:
                display(address_sims_df)
            out_tables.append(out)
        display(widgets.HBox(out_tables))
elif retrieve_mode == "pooling":
    # Collect the per-sentence tables first and concatenate once, instead of
    # growing the frame inside the loop.
    sentence_frames = []
    for s, c in zip(inference_sentences_in, retrieved_contents):
        sentence_sims_df = inference.get_similarities_to_atomic_set(
            c, cleanup)
        sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
        sentence_frames.append(sentence_sims_df)

    if sentence_frames:
        sims_df = pd.concat(sentence_frames)

    sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
                     .set_index(['sentence', 'token'])

    display(sims_df)
else:
    # Fail loudly instead of silently showing nothing.
    raise ValueError(f"Unrecognized retrieve_mode: {retrieve_mode!r}")

In [None]:
# Number of addresses currently held by the memory (re-checked after the retrieval cell).
len(memory.addresses)

In [None]:
memory.prune()

# Addresses to inspect: all of them. Alternative selections are kept below.
# (The previous random draw was never used by the loop and raised on an empty memory.)
#addresses = np.random.randint(0, len(memory.addresses), size=70)
#addresses = np.argwhere((memory.chunk_scores > 0.97).cpu().detach().numpy().flatten()).flatten()
#addresses = np.argwhere((memory.bins > 50).cpu().detach().numpy().flatten()).flatten()
addresses = np.arange(0, len(memory.addresses))

# Show, for every selected address, its similarity table against the atomic set.
for address in addresses:
    display(md(f"### Address {address}"))
    address_sims_df = inference.get_similarities_to_atomic_set(
            memory.addresses[address],
            cleanup,
    )
    display(address_sims_df)

In [None]:
# Entries of `memory.bins` at the positions where `memory.chunk_scores` exceeds 0.97.
memory.bins[np.argwhere((memory.chunk_scores > 0.97).cpu().detach().numpy().flatten()).flatten()]

In [None]:
# Flat indices of the `memory.bins` entries that are greater than 120.
np.argwhere((memory.bins > 120 ).cpu().detach().numpy().flatten()).flatten()

## Appendix

In [None]:
# Probe text for the appendix: a contraction followed by a full stop.
text = "couldn't."
# Tokenize the probe; return_tensors="pt" requests PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# Forward pass that also returns the attention maps. Gradients are not needed
# for this inspection, so autograd tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
attention_matrix = outputs.attentions

In [None]:
# Encode the probe text to token ids, then map the ids back to token strings
# that serve as labels in the cells below.
encoding = tokenizer.encode(text)
labels = tokenizer.convert_ids_to_tokens(encoding)

In [None]:
# Merge WordPiece continuation tokens ("##...") into the token that starts the
# word and remember each merged group of positions.
# NOTE: `labels` is overwritten at the end of this cell, so re-create it with
# the previous cell before running this one again.
i = 0
averages_idx = []
while i < len(labels) - 1:
    j = i + 1
    average_idx = []
    # Test for the full "##" prefix so a literal "#" token is not mistaken for a
    # continuation piece; the bound check keeps `j` inside the list.
    while j < len(labels) and labels[j].startswith('##'):
        average_idx.append(j)
        labels[i] += labels[j][2:]
        j += 1
    if average_idx:
        average_idx.append(i)
        averages_idx.append(average_idx)
    i = j

# Flag continuation pieces and punctuation for removal, then drop them together
# with the first and the last remaining token.
hashtag_idx = np.array([label.startswith("##") for label in labels])
punctuation_idx = np.array([label in string.punctuation for label in labels])
remove_idx = hashtag_idx | punctuation_idx
labels = np.array(labels)[~remove_idx]
labels = labels[1:(len(labels) - 1)]
print(labels)

In [None]:
layer = 0
as_threshold = 0.4
n_tokens = len(labels)

# For every head of the chosen layer: plot the thresholded attention matrix and
# print the token sequences extracted from it.
for head in range(12):
    head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()

    head_scores_raw_tensor = preprocess_attention_scores(head_scores_raw_tensor, averages_idx, remove_idx)

    head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()

    # Drop the first and last row/column, mirroring the trimming applied to `labels`.
    head_scores = head_scores_raw[1:(len(head_scores_raw) - 1), 1:(len(head_scores_raw) - 1)].copy()

    # Keep only attention scores at or above the threshold.
    head_scores[head_scores < as_threshold] = 0
    plot_heatmap(head_scores, labels)

    G = nx.from_numpy_array(head_scores, create_using=nx.DiGraph())

    # `construct_sequences` appends to the notebook-level lists `sequences` and
    # `means`; both are reset for every head so the cell also runs on a fresh kernel.
    sequences = []
    means = []
    construct_sequences(G, n_tokens)

    if sequences != []:
        print(head)
        for seq in sequences:
            print(labels[seq.astype(bool)])

In [None]:
# text = "Firenze firenze"
# encoding = tokenizer.encode(text)
# labels = tokenizer.convert_ids_to_tokens(encoding)

In [None]:
# i = 0
# averages_idx = []
# while i < len(labels) - 1:
#     j = i + 1
#     average_idx = []
#     while labels[j].startswith('#'):
#         average_idx.append(j)
#         labels[i] += labels[j].replace('#', '')
#         j += 1
#     if average_idx != []:
#         average_idx.append(i)
#         averages_idx.append(average_idx)
#     i = j

# hashtag_idx = np.array([label.startswith("#") for label in labels])
# labels = np.array(labels)[~hashtag_idx]

In [None]:
# Torch implementation.

# t = torch.tensor(head_scores_raw)
# i = torch.tensor(averages_idx)

# t[i] = torch.mean(t[i], dim=1, keepdim=True)
# t = torch.unique_consecutive(t, dim=0)
# t = torch.transpose(t, 0, 1)
# t[i] = torch.mean(t[i], dim=1, keepdim=True)
# t = torch.unique_consecutive(t, dim=0)

# t = torch.transpose(t, 0, 1)