In [1]:
import sys
import os

# Get the absolute path of the parent directory.
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

# Add the parent directory to the system path to be able to import modules from 'lib.'
sys.path.append(parent_dir)

In [2]:
import datasets

import ipywidgets as widgets
from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from nltk.corpus import stopwords
import numpy as np
import random

import pandas as pd
import pathlib

import string
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm

### Package options ###
torch.set_printoptions(threshold=10_000)

[nltk_data] Downloading package punkt to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /nfs/home/dfichiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
### Utils ###
def plot_heatmap(x: np.array, labels: np.array) -> None:
    plt.figure(figsize=(15, 15))
    sns.heatmap(
        x,
        linewidth=0.5,
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
        fmt='.2f',
    )
    plt.title(f'Self-attention matrix: layer {layer}, head {head}', fontsize=15)
    
    plt.show()
    return

def average_out_and_remove_rows(t: torch.tensor, averages_idx, remove_idx):
    for average_idx in averages_idx:  # The nested lists can have different dimensions.
        # Replace the attention scores of the first token with the average of the token attention scores.
        t[min(average_idx)] = torch.mean(t[average_idx], dim=0, keepdim=True)
    return t[~remove_idx]


def preprocess_attention_scores(attention_scores, averages_idx, remove_idx):
    attention_scores = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    attention_scores = attention_scores.transpose(0, 1)
    attention_scores = average_out_and_remove_rows(attention_scores, averages_idx, remove_idx)
    return attention_scores.transpose(0, 1)
        
    

def backward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    in_nodes = np.array([edge[0] for edge in list(G.in_edges(current_node))])
    in_nodes = in_nodes[(in_nodes > left_edge) & (in_nodes < current_node)]
    for node in in_nodes:
        sequence[node] = 1
        sequences.append(sequence)
        mean += G[node][current_node]['weight']
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, left_edge, node, sequence.copy(), mean)
        forward_pass(G, node, left_edge, current_node, sequence.copy(), mean)
        
    return
    
    
def forward_pass(G, current_node, left_edge, right_edge, sequence, mean):
    out_nodes = np.array([edge[1] for edge in list(G.out_edges(current_node))])
    out_nodes = out_nodes[(out_nodes > current_node) & (out_nodes < right_edge)]
    for node in out_nodes:
        sequence[node] = 1
        mean += G[current_node][node]['weight']
        sequences.append(sequence)
        means.append(round(mean / (sum(sequence) - 1), 2))
        backward_pass(G, node, current_node, node, sequence.copy(), mean)
        forward_pass(G, node, node, right_edge, sequence.copy(), mean)
            
    return
    

def construct_sequences(G: nx.DiGraph, n_tokens):
    for node in G.nodes():
        sequence = np.zeros(n_tokens)
        mean = 0
        sequence[node] = 1
        #sequences.append(sequence) # Do not allow for 1-token sequences.
        forward_pass(G, node, node, n_tokens, sequence.copy(), mean)

In [4]:
# Load Wikipedia dataset.
# TODO: Split between server and local.
#wiki_dataset = datasets.load_dataset("wikipedia", "20220301.en")['train']
wiki_dataset = datasets.load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir="/nfs/data/projects/daniela")['train']

Found cached dataset wikipedia (/nfs/data/projects/daniela/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Construct train set (texts) and inference set (sentences; in and out of train set text).
train_size = 5000
test_size = 20

# Text indeces.
train_idx = np.random.randint(0, len(wiki_dataset), size=train_size)
#train_idx = np.append(np.append(np.append(train_idx[0], train_idx[0]), train_idx[0]), train_idx[0]) 


# Text indeces from which we extract sentences.
intest_idx = np.random.choice(train_idx, test_size)
outtest_idx = np.random.choice(np.setdiff1d(np.arange(len(wiki_dataset)), train_idx), test_size)

In [6]:
inference_sentences_in = []
inference_sentences_out = []

for idx_in, idx_out in zip(intest_idx, outtest_idx):
    # Get sentences.
    sentences_in = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_in)]['text'])
    sentences_out = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_out)]['text'])
    
    # Get sentence index.
    sentence_idx_in = np.random.randint(0, len(sentences_in), size=1).item()
    sentence_idx_out = np.random.randint(0, len(sentences_out), size=1).item()

    # Append sentence to list.
    inference_sentences_in.append(sentences_in[sentence_idx_in])
    inference_sentences_out.append(sentences_out[sentence_idx_out])

Concepts: 
- university
- center
- culture
- exam
- day and days
- floor
- stage
- loan
- record

In [7]:
for idx, sentence in enumerate(inference_sentences_in):
    print(f"Sentence {idx + 1}: ", sentence)
    print(" ")

Sentence 1:  Brave was later named as one of the School Library Journal Top 10 Graphic Novels of 2017.
 
Sentence 2:  By the late 19th century, the building was in an extremely dilapidated state and under threat of demolition, before being extensively restored by the socialite, Lady Meux and her husband, in 1889.
 
Sentence 3:  Black was a right-handed batsman who bowled right-arm medium.
 
Sentence 4:  The Wilhelm Schmid Museum was the residence of the artist (1892–1971) and today contains a collection of his works.
 
Sentence 5:  Between 1908 and 1910 he was at the university surgical clinic in Greifswald under Erwin Payr (1871–1947), then went to Königsberg to work with Payr and Paul Leopold Friedrich (1864–1916).
 
Sentence 6:  Daniel Finch may refer to:

Daniel Finch, 2nd Earl of Nottingham and 7th Earl of Winchilsea (1647–1730)
Daniel Finch, 8th Earl of Winchilsea (1689–1769), British politician
 
Sentence 7:  Vinogradi is a village in the municipality of Sandanski, in Blagoevgra

In [8]:
for idx, sentence in enumerate(inference_sentences_out):
    print(f"Sentence {idx}: ", sentence)
    print(" ")

Sentence 0:  He played college football at Alabama and UCF, and was drafted by the Giants in the third round of the 2021 NFL Draft.
 
Sentence 1:  Billings was born at Redcliffe manor on Beech Island, South Carolina, a plantation built by his great-grandfather the senator (famed for the saying "Cotton is king").
 
Sentence 2:  References

External links 
 

1978 births
Living people
Polish footballers
Lech Poznań players
Warta Poznań players
Sportspeople from Poznań
Association football midfielders
 
Sentence 3:  Salchow  may refer to:

 Ulrich Salchow (1877–1949), Swedish figure skater
 Salchow jump, a figure skating jump named after him
 
Sentence 4:  The name is in part derived from the town where they were first observed, Gabela.
 
Sentence 5:  Jacob Gaukel Stroh (25September 184823May 1935) was a local historian of Waterloo County, Ontario.
 
Sentence 6:  References

Further reading

External links

 

Oecophorinae
Moths described in 1907
 
Sentence 7:  Some bebop tunes use a domi