In [1]:
import sys
import os

# Absolute path of the parent of the current working directory.
# `os.path.dirname("__file__")` (a string literal, not the variable) always
# evaluates to "", so the original expression already resolved ".." against the
# working directory; `os.pardir` states that directly.
parent_dir = os.path.abspath(os.pardir)

# Add the parent directory to the system path to be able to import modules from 'lib'.
# Guarded so that re-running this cell does not append duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import datasets #import load_dataset

from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import random

import pandas as pd
import pathlib

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm
# Type checking
from typing import List 

In [3]:
# Load the English Wikipedia dataset (config "20220301.en") and keep its 'train' split.
wiki_splits = datasets.load_dataset("wikipedia", "20220301.en")
wiki_dataset = wiki_splits['train']

Found cached dataset wikipedia (/Users/danielastelea/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Choose the torch device: CUDA when it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fix the random seed via the project helper.
utils.fix_seed(41)

Using seed: 41

In [5]:
# Cleanup memory: holds the atomic hypervectors (HVs), keyed by token.
cleanup = dict()

In [6]:
# DSDM hyperparameters. All of them except `chunk_sizes` are forwarded to the
# DSDM constructor; `chunk_sizes` is passed to the chunk-learning step.

# Address space.
address_size = 1_000
prune_mode = "fixed-size"
max_size_address_space = 4_000

# Update behaviour.
ema_time_period = 5_000
learning_rate_update = 0.5

# Retrieval.
temperature = 0.05
normalize = False

# Chunking of sentences during learning.
chunk_sizes = [5]

In [7]:
# Build the DSDM memory from the hyperparameters set in the previous cell.
dsdm_kwargs = {
    "address_size": address_size,
    "ema_time_period": ema_time_period,
    "learning_rate_update": learning_rate_update,
    "temperature": temperature,
    "normalize": normalize,
    "prune_mode": prune_mode,
    "max_size_address_space": max_size_address_space,
}
memory = DSDM.DSDM(**dsdm_kwargs)

In [8]:
# Build the train set (whole texts) and the inference set (single sentences,
# taken from texts inside and outside the train set).
train_size = 1
test_size = 1

n_texts = len(wiki_dataset)

# Indices of the texts used for training (drawn with replacement).
train_idx = np.random.randint(0, n_texts, size=train_size)

# TODO: Calculate statistics of the chosen texts.

# Indices of the texts from which inference sentences are extracted:
# "in" texts are drawn from the train indices, "out" texts from all other indices.
intest_idx = np.random.choice(train_idx, test_size)
non_train_idx = np.setdiff1d(np.arange(n_texts), train_idx)
outtest_idx = np.random.choice(non_train_idx, test_size)

In [9]:
def _pick_random_sentence(text_idx):
    """Return one randomly chosen sentence of the text at `text_idx` in `wiki_dataset`.

    Args:
        text_idx: Index of the text in `wiki_dataset`; cast to a built-in int
            before indexing.

    Returns:
        One element of the sentence list produced by
        `utils.preprocess.split_text_into_sentences` for that text.

    Raises:
        ValueError: Raised by `np.random.randint` when the text yields no
            sentences (low >= high).
    """
    sentences = utils.preprocess.split_text_into_sentences(wiki_dataset[int(text_idx)]['text'])
    # Without `size`, randint returns a scalar, so int() never has to convert a
    # size-1 array (that conversion is deprecated since NumPy 1.25).
    sentence_idx = int(np.random.randint(0, len(sentences)))
    return sentences[sentence_idx]


inference_sentences_in = []
inference_sentences_out = []

# One sentence per text: first from the "in-train" text, then from the
# "out-of-train" text, so the random draws happen in the same in/out order as before.
for idx_in, idx_out in zip(intest_idx, outtest_idx):
    inference_sentences_in.append(_pick_random_sentence(idx_in))
    inference_sentences_out.append(_pick_random_sentence(idx_out))

In [10]:
# Training: feed every selected text into the memory, one sentence at a time.
for text_idx in tqdm(train_idx):
    article_text = wiki_dataset[int(text_idx)]['text']

    # Preprocess the text and walk over its result
    # (presumably one token collection per sentence -- confirm in lib.utils.preprocess).
    for tokens in preprocess.preprocess_text(article_text):
        # Make sure every token of the sentence has an atomic HV in the cleanup memory.
        learning.generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(
            memory.address_size, cleanup, tokens
        )

        # Learning: build the chunk representations of the sentence and store them in memory.
        learning.generate_chunk_representations_and_save_them_to_memory(
            memory.address_size, cleanup, memory, tokens, chunk_sizes=chunk_sizes
        )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.37it/s]


In [11]:
# Token-similarity table for the "in-train" sentences, retrieved in "top_k" mode with k=5.
in_retrieval_options = {"retrieve_mode": "top_k", "k": 5}
in_sims_df = inference.infer(
    memory.address_size, cleanup, memory, inference_sentences_in, **in_retrieval_options
)
display(in_sims_df)

TypeError: only integer tensors of a single element can be converted to an index

In [None]:
# Token-similarity table for the "out-of-train" sentences, retrieved in "top_k" mode with k=5.
out_retrieval_options = {"retrieve_mode": "top_k", "k": 5}
out_sims_df = inference.infer(
    memory.address_size, cleanup, memory, inference_sentences_out, **out_retrieval_options
)
display(out_sims_df)

In [None]:
# Token-similarity table for a single hand-written sentence, using infer's default retrieval options.
probe_sentence = 'The index was created in the early 1960s to limit the sale of such works to minors due to their chauvinism and glorification of violence.'
sims_df = inference.infer(memory.address_size, cleanup, memory, [probe_sentence])
display(sims_df)

In [None]:
# Inspect 10 randomly drawn memory addresses (with replacement): for each one,
# rank every token in the cleanup memory by cosine similarity to the stored content.
addresses = np.random.randint(0, len(memory.addresses), size=10)

for address in addresses:
    display(md(f"### Address {address}"))
    retrieved_content = memory.addresses[address]

    # Collect all rows first and build the frame once. Concatenating a one-row
    # frame per token inside the loop is quadratic in the number of tokens and
    # starts from an empty frame.
    similarity_records = [
        {
            'token': token,
            'similarity': thd.cosine_similarity(atomic_HC, retrieved_content).item(),
        }
        for token, atomic_HC in cleanup.items()
    ]

    # `columns` keeps both columns present even when the cleanup memory is empty.
    memory_sims_df = (
        pd.DataFrame(similarity_records, columns=['token', 'similarity'])
        .sort_values('similarity', ascending=False)
        .reset_index(drop=True)
    )
    display(memory_sims_df.head(10))

In [None]:
# Fraction of n_updates in the combined count of n_updates and n_expansions.
n_memory_operations = memory.n_updates + memory.n_expansions
memory.n_updates / n_memory_operations

In [None]:
# Show the memory's `n_updates` counter
# (presumably the number of update operations so far -- confirm in lib.memory.DSDM).
memory.n_updates

In [None]:
# Show the memory's `n_expansions` counter
# (presumably the number of times the address space was expanded -- confirm in lib.memory.DSDM).
memory.n_expansions

In [None]:
# Show how many addresses the memory currently holds.
len(memory.addresses)

In [None]:
# Scratch check of boolean-mask indexing on a tensor: keep every element whose
# mask entry is True.
demo_values = torch.tensor([1, 2, 3, 4])
demo_mask = torch.tensor([False, True, True, True])
demo_values[demo_mask]