In [1]:
import sys
import os

# Resolve the parent of the current working directory. `__file__` is not
# defined inside a notebook, so the path is derived from the cwd; this assumes
# the notebook is launched from its own folder.
parent_dir = os.path.abspath(os.pardir)

# Make the project's 'lib' package importable. Guarded so that re-running the
# cell does not keep appending duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
%%capture
import datasets

import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from nltk.corpus import stopwords
import numpy as np
import random

import pandas as pd
import pathlib
import pickle

import string
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm

### Package options ###
# Raise the element-count threshold above which torch summarizes printed tensors.
torch.set_printoptions(threshold=10_000)

In [3]:
### Utils ###
def average_out_and_remove_rows(t: torch.Tensor, averages_idx, remove_idx):
    """Merge groups of rows by averaging, then drop the flagged rows.

    For every group in `averages_idx`, the row at the group's smallest index is
    replaced by the mean (along dim 0) of all rows in the group. Afterwards the
    rows flagged in `remove_idx` are removed.

    Args:
        t: Tensor whose first dimension is indexed by the groups and the mask.
            It is NOT modified; the work is done on a copy.
        averages_idx: Iterable of index groups (the groups may have different
            lengths). Empty groups are skipped.
        remove_idx: Boolean mask over the first dimension of `t` (torch tensor,
            numpy array or list); True marks a row to remove.

    Returns:
        A new tensor containing only the rows whose mask entry is False.
    """
    # Work on a copy so the caller's tensor is left untouched.
    merged = t.clone()
    for group in averages_idx:  # The nested lists can have different dimensions.
        if len(group) == 0:
            continue
        # The first row of the group receives the average of the group's rows.
        merged[min(group)] = merged[group].mean(dim=0)

    # Normalise the mask so tensors, numpy arrays and plain lists all work.
    drop_mask = torch.as_tensor(remove_idx, dtype=torch.bool, device=merged.device)
    return merged[~drop_mask]


def preprocess_attention_scores(attention_scores, averages_idx, remove_idx):
    """Apply the row merge/removal step along both axes of the score matrix.

    `average_out_and_remove_rows` is applied to the rows, the result is
    transposed, the same step is applied again (now acting on the original
    columns), and the result is transposed back.

    NOTE(review): assumes `attention_scores` is a square 2-D tensor so the same
    indices are valid for both axes - confirm against callers.
    """
    # Each round handles the current first axis and then swaps the two axes,
    # so two rounds cover rows and columns and restore the orientation.
    for _ in range(2):
        attention_scores = average_out_and_remove_rows(
            attention_scores, averages_idx, remove_idx
        ).transpose(0, 1)
    return attention_scores
        
    

def backward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    """Grow `sequence` leftwards along the incoming edges of `current_node`.

    Every predecessor `p` with `left_edge < p < current_node` is marked in
    `sequence`, the weight of edge `p -> current_node` is added to `mean`, and
    the result is recorded in the module-level accumulators: a snapshot of the
    sequence goes to `sequences` and the accumulated weight divided by
    `sum(sequence) - 1` (rounded to 2 decimals) goes to `means`. The search
    then recurses in both directions from `p`.

    Args:
        G: Graph exposing `in_edges(node)`, `out_edges(node)` and
            `G[u][v]['weight']`.
        current_node: Node whose predecessors are explored.
        left_edge: Exclusive lower bound for the predecessors considered.
        right_edge: Not used by this function; kept for a signature parallel
            to `forward_pass`.
        sequence: 0/1 array of selected nodes. It is modified in place, and it
            keeps accumulating across the predecessors visited by the loop.
        mean: Sum of edge weights accumulated so far.

    The lists `sequences` and `means` must exist at module level before the
    call; nothing is returned.
    """
    predecessors = np.array([edge[0] for edge in G.in_edges(current_node)])
    predecessors = predecessors[(predecessors > left_edge) & (predecessors < current_node)]
    for node in predecessors:
        sequence[node] = 1
        mean += G[node][current_node]['weight']
        # Store a snapshot: `sequence` is mutated again on the next iteration,
        # and an aliased entry would no longer match the score recorded for it.
        sequences.append(sequence.copy())
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, left_edge, node, sequence.copy(), mean)
        forward_pass(G, node, left_edge, current_node, sequence.copy(), mean)


def forward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    """Grow `sequence` rightwards along the outgoing edges of `current_node`.

    Every successor `s` with `current_node < s < right_edge` is marked in
    `sequence`, the weight of edge `current_node -> s` is added to `mean`, and
    the result is recorded in the module-level accumulators: a snapshot of the
    sequence goes to `sequences` and the accumulated weight divided by
    `sum(sequence) - 1` (rounded to 2 decimals) goes to `means`. The search
    then recurses in both directions from `s`.

    Args:
        G: Graph exposing `in_edges(node)`, `out_edges(node)` and
            `G[u][v]['weight']`.
        current_node: Node whose successors are explored.
        left_edge: Not used by this function; kept for a signature parallel
            to `backward_pass`.
        right_edge: Exclusive upper bound for the successors considered.
        sequence: 0/1 array of selected nodes. It is modified in place, and it
            keeps accumulating across the successors visited by the loop.
        mean: Sum of edge weights accumulated so far.

    The lists `sequences` and `means` must exist at module level before the
    call; nothing is returned.
    """
    successors = np.array([edge[1] for edge in G.out_edges(current_node)])
    successors = successors[(successors > current_node) & (successors < right_edge)]
    for node in successors:
        sequence[node] = 1
        mean += G[current_node][node]['weight']
        # Store a snapshot: `sequence` is mutated again on the next iteration,
        # and an aliased entry would no longer match the score recorded for it.
        sequences.append(sequence.copy())
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, current_node, node, sequence.copy(), mean)
        forward_pass(G, node, node, right_edge, sequence.copy(), mean)
    

def construct_sequences(G: nx.DiGraph, n_tokens):
    """Start a forward search from every node of `G`.

    For each node a fresh 0/1 array of length `n_tokens` with only that node
    set is handed to `forward_pass`, with the node as left bound, `n_tokens`
    as right bound and an accumulated weight of 0. Results are collected by
    `forward_pass` in the module-level lists; this function returns nothing.
    """
    for start_node in G.nodes():
        seed = np.zeros(n_tokens)
        seed[start_node] = 1
        # The single-token seed itself is intentionally not recorded:
        # 1-token sequences are not allowed.
        # `seed` is freshly created and never reused, so it is passed directly.
        forward_pass(G, start_node, start_node, n_tokens, seed, 0)

In [4]:
%%capture
# Load the 'train' split of the "20220301.en" Wikipedia configuration.
# The cache location defaults to the shared server directory and can be
# overridden per machine through the WIKI_CACHE_DIR environment variable;
# setting it to an empty string falls back to the `datasets` default cache.
wiki_cache_dir = os.environ.get("WIKI_CACHE_DIR", "/nfs/data/projects/daniela") or None
wiki_dataset = datasets.load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir=wiki_cache_dir)['train']

In [5]:
# Upper bound on the tokenized sentence length accepted by the processing loop.
MAXIMUM_SEQUENCE_LENGTH = 512

# Pretrained checkpoint shared by the encoder and its tokenizer.
model_name = "bert-base-uncased"  # Has 12 layers
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# Attention scores below this value are zeroed before the graph is built.
as_threshold = 0.5
# Number of top-ranked sequences kept per head; takes precedence when not None.
n_sequences = 3
# Minimum score for a sequence to be kept; only consulted when n_sequences is None.
chunk_score_threshold = None
# When True, tokens found in the NLTK English stop-word list are dropped.
remove_stopwords = True

In [7]:
# Sentence set processed by the extraction loop (selected via `sentences = in_sentences`).
# NOTE(review): "in" presumably means sentences contained in the loaded Wikipedia dump - confirm.
in_sentences = [
    """Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations.""",
    """In 1910, she was elected to the position of organizer and lecturer of the National WCTU.""",
    """Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade.""",
    """With all other games played, a victory over Everton had put United top of the group on nine points.""",
    """The 2022 FA Women's League Cup Final was the 11th final of the FA Women's League Cup, England's secondary cup competition for women's football teams and its primary league cup tournament.""",
    """In 2020 Mico's single 'igare' awarded as the best song of the summer in Kiss Summer Awards.""",
    """She collected the speech and words of Dublin city and donated her collection to the Department of Irish Folklore at University College, Dublin.""",
    """Traditional palyanytsya was baked from yeast dough.""",
    """First, hops were boiled in a pot, which was then poured into a makitra, to which sifted wheat flour was added.""",
    """Jonathan Holland of ScreenDaily deemed the film to be "superbly directed by Palomero, who seems to have a special gift for seeing the world through children's eyes." """   
]

In [8]:
# Alternative sentence set; it is not the one assigned to `sentences` in the visible cells.
# NOTE(review): "out" presumably means sentences not contained in the loaded Wikipedia dump - confirm.
out_sentences = [
    """As the population of all of the towns grew, the need for better transportation between them also grew.""",
    """The construction of the line was the subject of a legal challenge.""",
    """The extremes, which can be fatal to healthy people within six hours, could affect hundreds of millions of people unused to such conditions.""",
    """Dangerous heat stress events will spread rapidly across the world as global heating continues.""",
    """Whether or not history will determine that we are living in an ever more divided culture, it certainly feels that way."""
]

In [9]:
# Section heading reflecting whether stop words are removed from the sequences.
display(md(f"# Constructed subsequences {'w/o' if remove_stopwords else 'w/'} stop words"))

# Constructed subsequences w/o stop words

In [10]:
# Choose the sentence set that the extraction loop iterates over.
sentences = in_sentences

In [11]:
# Per sentence: tokenize, run the encoder with attention outputs, merge
# word-piece tokens into whole words, drop punctuation (and optionally stop
# words), then - for every attention head of the selected layer - threshold the
# attention scores, build a directed graph from them and rank the token
# subsequences found by `construct_sequences`.

# Loop-invariant setup, hoisted out of the per-sentence loop.
english_stopwords = set(stopwords.words('english'))  # Built once; set lookups are O(1).
layer = 0  # Index of the attention layer that is analysed.

for sentence in sentences:
    display(md(f"""Sentence: _{sentence}_"""))
    inputs = tokenizer(sentence, return_tensors="pt")
    if inputs['input_ids'].shape[1] > MAXIMUM_SEQUENCE_LENGTH:
        # Skip only this over-long sentence; the remaining ones are still processed.
        continue

    # Inference only: no autograd graph is needed to read the attention maps.
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    attention_matrix = outputs.attentions

    # Reuse the ids computed above instead of tokenizing the sentence a second time.
    encoding = inputs['input_ids'][0].tolist()
    labels = tokenizer.convert_ids_to_tokens(encoding)

    # Merge word-piece continuations ('##xyz') into the token that starts the
    # word, remembering which positions belong together so their attention
    # rows/columns can be averaged later.
    i = 0
    averages_idx = []
    while i < len(labels) - 1:
        j = i + 1
        average_idx = []
        # Only the '##' prefix marks a continuation; a literal '#' token is
        # ordinary punctuation and must not be glued to the previous word.
        while j < len(labels) and labels[j].startswith('##'):
            average_idx.append(j)
            labels[i] += labels[j][2:]
            j += 1
        if average_idx:
            average_idx.append(i)
            averages_idx.append(average_idx)
        i = j

    # Boolean masks over the token positions that are to be removed.
    hashtag_idx = np.array([label.startswith("##") for label in labels])
    stopwords_idx = np.array([label in english_stopwords for label in labels])
    punctuation_idx = np.array([label in string.punctuation for label in labels])
    dash_idx = np.array([label == '\u2013' for label in labels])  # En dash (code point 8211).
    remove_idx = hashtag_idx | punctuation_idx | dash_idx
    if remove_stopwords:
        remove_idx |= stopwords_idx

    # Keep the surviving tokens and strip the first and last one
    # (the special tokens added by the tokenizer).
    labels = np.array(labels)[~remove_idx][1:-1]

    # Take the head count from the attention tensor instead of hard-coding it.
    n_heads = attention_matrix[layer].shape[1]
    for head in range(n_heads):
        head_scores_raw_tensor = attention_matrix[layer][0][head].detach().clone()

        head_scores_raw_tensor = preprocess_attention_scores(head_scores_raw_tensor, averages_idx, remove_idx)

        head_scores_raw = head_scores_raw_tensor.cpu().detach().numpy()

        # Drop the first/last row and column so the matrix lines up with `labels`.
        head_scores = head_scores_raw[1:-1, 1:-1].copy()

        # Weak attention links do not become graph edges.
        head_scores[head_scores < as_threshold] = 0

        G = nx.from_numpy_array(head_scores, create_using=nx.DiGraph())

        # Module-level accumulators filled by forward_pass / backward_pass.
        sequences = []
        means = []
        n_tokens = len(labels)
        construct_sequences(G, n_tokens)

        df = pd.DataFrame(data=[sequences, means]).T.rename(columns={0: 'seq',  1: 'score'})

        if len(df) > 0:
            df['len'] = df['seq'].map(sum)
            df['score'] = df['score'].astype('float64')
            df = df.sort_values(by=['score', 'len'], ascending=[False, False]).reset_index(drop=True)

            # Translate each 0/1 mask back into the tokens it selects.
            df['sequence'] = df['seq'].apply(lambda x: labels[x.astype(bool)])
            df['chunk_score'] = df['score']
            display(md(f"**Head:** {head}"))
            display(df[['chunk_score', 'len', 'sequence']])

            # Select the sequences to be saved to memory.
            if n_sequences is not None:
                filtered_df = df.head(n_sequences)
            elif chunk_score_threshold is not None:
                filtered_df = df[df['score'] >= chunk_score_threshold]
            else:
                filtered_df = df.head(3)

Sentence: _Blaine was reared in a Prohibition home, and while still a young girl, she became a very active participant at temperance meetings, where she won great favor for her songs and recitations._

**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.96,2.0,"[young, girl]"
1,0.81,2.0,"[active, participant]"
2,0.77,2.0,"[temperance, meetings]"
3,0.52,2.0,"[great, favor]"


Sentence: _In 1910, she was elected to the position of organizer and lecturer of the National WCTU._

**Head:** 11

Unnamed: 0,chunk_score,len,sequence
0,0.52,2.0,"[elected, position]"


Sentence: _Another feature of her work was the organization of temperance mass-meetings of Sunday-school children, usually preceded by a formal parade._

**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.84,2.0,"[school, children]"
1,0.74,3.0,"[sunday, school, children]"
2,0.7,2.0,"[another, feature]"
3,0.69,2.0,"[usually, preceded]"
4,0.65,2.0,"[sunday, school]"
5,0.52,2.0,"[formal, parade]"


**Head:** 11

Unnamed: 0,chunk_score,len,sequence
0,0.76,2.0,"[school, children]"


Sentence: _With all other games played, a victory over Everton had put United top of the group on nine points._

**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.72,2.0,"[nine, points]"
1,0.52,2.0,"[games, played]"


Sentence: _The 2022 FA Women's League Cup Final was the 11th final of the FA Women's League Cup, England's secondary cup competition for women's football teams and its primary league cup tournament._

**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.84,2.0,"[league, cup]"
1,0.83,2.0,"[primary, league]"
2,0.78,3.0,"[primary, league, cup]"
3,0.73,2.0,"[league, cup]"
4,0.73,2.0,"[league, cup]"
5,0.7,2.0,"[fa, women]"
6,0.67,2.0,"[fa, women]"
7,0.66,2.0,"[cup, competition]"
8,0.66,2.0,"[football, teams]"
9,0.61,2.0,"[11th, final]"


Sentence: _In 2020 Mico's single 'igare' awarded as the best song of the summer in Kiss Summer Awards._

**Head:** 1

Unnamed: 0,chunk_score,len,sequence
0,0.64,2.0,"[single, song]"


**Head:** 8

Unnamed: 0,chunk_score,len,sequence
0,0.52,2.0,"[summer, awards]"


**Head:** 9

Unnamed: 0,chunk_score,len,sequence
0,0.58,2.0,"[awarded, awards]"


**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.93,2.0,"[summer, awards]"
1,0.88,3.0,"[kiss, summer, awards]"
2,0.84,2.0,"[kiss, summer]"
3,0.76,2.0,"[best, song]"


Sentence: _She collected the speech and words of Dublin city and donated her collection to the Department of Irish Folklore at University College, Dublin._

**Head:** 8

Unnamed: 0,chunk_score,len,sequence
0,0.58,2.0,"[college, dublin]"


**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.87,2.0,"[university, college]"
1,0.65,2.0,"[irish, folklore]"
2,0.64,2.0,"[dublin, city]"


**Head:** 11

Unnamed: 0,chunk_score,len,sequence
0,0.58,2.0,"[collection, folklore]"


Sentence: _Traditional palyanytsya was baked from yeast dough._

Sentence: _First, hops were boiled in a pot, which was then poured into a makitra, to which sifted wheat flour was added._

**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.93,2.0,"[wheat, flour]"


**Head:** 11

Unnamed: 0,chunk_score,len,sequence
0,0.52,2.0,"[wheat, flour]"


Sentence: _Jonathan Holland of ScreenDaily deemed the film to be "superbly directed by Palomero, who seems to have a special gift for seeing the world through children's eyes." _

**Head:** 10

Unnamed: 0,chunk_score,len,sequence
0,0.62,2.0,"[special, gift]"
1,0.56,2.0,"[jonathan, holland]"
