# Sliding window n-gram method

In [24]:
import sys
import os

# Resolve the parent of the current working directory. The quoted "__file__" is
# deliberate: os.path.dirname() of that literal string is "", so the joined path
# reduces to ".." (notebooks do not define a real __file__ variable).
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

# Add the parent directory to the system path to be able to import modules from 'lib.'
# The membership check keeps sys.path free of duplicate entries when this cell
# is executed more than once in the same kernel.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [25]:
import datasets

from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import cleanup, configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import random

import pandas as pd
import pathlib

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm
# Type checking
import typing

In [3]:
# Load the Wikipedia dataset ("20220301.en" configuration) and keep its 'train' split.
# The cache directory defaults to the shared server location; set the
# WIKI_CACHE_DIR environment variable to point at a different directory when
# running locally.
# To rely on the library's default cache location instead:
#wiki_dataset = datasets.load_dataset("wikipedia", "20220301.en")['train']
wiki_cache_dir = os.environ.get("WIKI_CACHE_DIR", "/nfs/data/projects/daniela")
wiki_dataset = datasets.load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir=wiki_cache_dir)['train']

Found cached dataset wikipedia (/nfs/data/projects/daniela/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fix the seed through the project helper (the saved output reports "Using seed: 41").
utils.fix_seed(41)

Using seed: 41

In [5]:
# Set DSDM hyperparameters.
# All values below are passed to the DSDM constructor in the memory
# initialization cell, except `chunk_sizes` and `remove_dups`, which are read by
# the training loop.

# Passed to both the Cleanup and the DSDM constructors.
# NOTE(review): presumably the hypervector dimensionality — confirm in lib.
address_size = 1000

# NOTE(review): exact semantics of these two are defined inside DSDM — confirm there.
ema_time_period = 5000
learning_rate_update = 0.8

temperature = 0.5

normalize = False

# Handed to learning.generate_chunk_representations_and_save_them_to_memory as
# `chunk_sizes`; presumably the n-gram window length(s) — TODO confirm.
chunk_sizes = [3]

# Pruning
# With prune_mode = None the saved run ends with 24899 addresses and 0 deletions,
# so max_size_address_space is apparently not enforced in this configuration.
prune_mode = None
max_size_address_space = 4000


# Pruning: Remove dups
# When True, the training loop calls remove_duplicates(memory) after every article.
remove_dups = False

In [6]:
# The name `cleanup` starts out as the imported module and is rebound here to a
# Cleanup instance, which is what the later cells use. Keep a handle on the
# module so that re-running this cell does not look up `Cleanup` on the
# instance created by a previous run.
if isinstance(cleanup, type(sys)):  # type(sys) is the module type
    cleanup_module = cleanup
cleanup = cleanup_module.Cleanup(address_size)

In [7]:
# Gather the constructor arguments from the hyperparameter cell, then build the
# DSDM memory from them in one call.
dsdm_kwargs = dict(
    address_size=address_size,
    ema_time_period=ema_time_period,
    learning_rate_update=learning_rate_update,
    temperature=temperature,
    normalize=normalize,
    prune_mode=prune_mode,
    max_size_address_space=max_size_address_space,
)
memory = DSDM.DSDM(**dsdm_kwargs)

In [8]:
# Construct train set (texts) and inference set (sentences; in and out of train set text).
# NOTE: the order of the np.random calls below determines which articles are
# drawn for a given seed; reordering them changes the experiment.
train_size = 250
# Only referenced by the commented-out sampling code at the end of this cell.
test_size = 10

# Text indices
# Draw `train_size` article indices from everything except the last 100 articles.
# np.random.randint samples independently, so the same article can appear twice.
train_idx = np.random.randint(0, len(wiki_dataset) - 100, size=train_size)
# Generate and append train articles present in all experiments.
# These come from the last 100 articles: 20 draws, de-duplicated through `set`,
# of which the first half is kept as in-train test articles (the second half was
# meant for the out-of-train set, see the commented-out line).
intest_idx = np.random.randint(len(wiki_dataset) - 100, len(wiki_dataset), size=20)
_set = list(set(intest_idx))
intest_idx = np.array(_set)[: len(_set) // 2]
#outtest_idx = np.array(_set)[len(_set) // 2 :]
# The saved training run iterates over 259 articles (250 + 9 kept in-test indices).
train_idx = np.append(train_idx, intest_idx)

# Text indices from which we extract sentences.
#intest_idx = np.random.choice(train_idx, test_size)
#outtest_idx = np.random.choice(np.setdiff1d(np.arange(len(wiki_dataset)), train_idx), test_size)

In [9]:
# inference_sentences_in = []
# inference_sentences_out = []

# for idx_in, idx_out in zip(intest_idx, outtest_idx):
#     # Get sentences.
#     sentences_in = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_in)]['text'])
#     sentences_out = utils.preprocess.split_text_into_sentences(wiki_dataset[int(idx_out)]['text'])
    
#     # Get sentence index.
#     sentence_idx_in = int(
#         np.random.randint(
#             0,
#             len(sentences_in),
#             size=1
#         ).item()
#     )
#     sentence_idx_out = int(
#         np.random.randint(
#             0,
#             len(sentences_out),
#             size=1
#         ).item()
#     )

#     # Append sentence to list.
#     inference_sentences_in.append(sentences_in[sentence_idx_in])
#     inference_sentences_out.append(sentences_out[sentence_idx_out])

In [10]:
### Remove duplicates ###

# Running total of addresses dropped by remove_duplicates() across all calls.
# Re-running this cell resets the counter to zero.
dups_found = 0

def remove_duplicates(memory, threshold=0.8):
    """Drop memory addresses that are near-duplicates of an earlier address.

    Addresses are visited in order. Every address that is still marked as kept
    removes all *other* addresses whose cosine similarity to it is greater than
    or equal to `threshold`. The rows of `memory.scores` are filtered with the
    same mask so both tensors stay aligned.

    Args:
        memory: Object exposing `addresses` (2-D tensor, one address per row)
            and `scores` (tensor indexed along its first dimension by address).
            Both attributes are reassigned when duplicates are found.
        threshold: Cosine similarity at or above which two addresses count as
            duplicates. Defaults to 0.8.

    Returns:
        None. The number of removed addresses is added to the module-level
        counter `dups_found`.
    """
    global dups_found

    n_addresses = len(memory.addresses)
    if n_addresses == 0:
        # Nothing to compare.
        return

    # Build the mask on the same device as the addresses so that the in-place
    # AND and the boolean indexing below never mix devices.
    global_keep_mask = torch.ones(
        n_addresses, dtype=torch.bool, device=memory.addresses.device
    )
    # The similarity module is stateless; create it once instead of per address.
    cos = torch.nn.CosineSimilarity(dim=1)

    for idx, address in enumerate(memory.addresses):
        # Addresses already marked as duplicates do not remove anything themselves.
        if not global_keep_mask[idx].item():
            continue
        # unsqueeze(0) makes the row-wise broadcast against all addresses explicit.
        keep_mask = cos(memory.addresses, address.unsqueeze(0)) < threshold
        # Keep current address
        keep_mask[idx] = True
        global_keep_mask &= keep_mask

    n_removed = n_addresses - int(global_keep_mask.sum().item())
    if n_removed > 0:
        dups_found += n_removed
        # Remove similar addresses
        memory.addresses = memory.addresses[global_keep_mask]
        # Remove the matching scores
        memory.scores = memory.scores[global_keep_mask]

In [11]:
# Training
for i in tqdm(train_idx):
    text = wiki_dataset[int(i)]['text']
    
    # Preprocess data. 
    sentences_tokens = preprocess.preprocess_text(text)
    for sentence_tokens in sentences_tokens:
        # Generate atomic HVs for unknown tokens.
        learning.generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(
            memory.address_size,
            cleanup,
            sentence_tokens
        )
        
        # Learning: Construct the chunks of each sentence and save them to memory.
        learning.generate_chunk_representations_and_save_them_to_memory(
            memory.address_size,
            cleanup,
            memory,
            sentence_tokens,
            chunk_sizes=chunk_sizes
        )
    if remove_dups:
        remove_duplicates(memory)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [01:56<00:00,  2.23it/s]


In [12]:
# inference_sentences_in = ['Dagored', 'is an Italian', 'record labels', 'based in Firenze', 'formed', 'in 1998.'] 250, 0.05 temperature
# 'record labels' also caught by transformer attention.

In [13]:
# def score_partition(input_partition, output_partition):
#     # Note: What if a sentence contains the same word multiple times? This is why using 'set' is a bad idea!
#     set_query = set(preprocess.remove_stopwords(tokens)[0]) 
#     set_content = inference.get_most_similar_HVs(sentence_sims_df, delta_threshold=0.1)

#     set_input = set(input_partition)
#     set_output = set(output_partition)
    
#     score = len(set_input.intersection(set_output)) / len(set_input)

#     return score




# def divide_and_conquer(token_partitions: typing.List[typing.List[str]]):
#     retrieve_mode = "pooling"
    
#     for tp in token_partitions:
#         retrieved_content = inference.infer(
#             memory.address_size,
#             cleanup,
#             memory,
#             [tp],
#             retrieve_mode=retrieve_mode
#         )
#         output_tokens = inference.get_most_similar_HVs(
#             inference.get_similarities_to_atomic_set(
#                 retrieved_contents[0],
#                 cleanup,
#             ),
#             delta_threshold=0.1
#         )
#         score = score_partition(tp, output_tokens)
    

    

#     display(score)
#     if score == 1:
#         return tokens
#     else:
#         return max(score, divide_and_conquer())
    

In [14]:
# inference_sentences_in = ["Senex is a Latin word literally meaning a man of old age."]

In [15]:
# retrieve_mode = "top_k"

# # Get table with token similarities for each "out-of-train" sentence.
# retrieved_contents = inference.infer(
#     memory.address_size,
#     cleanup,
#     memory,
#     inference_sentences_in,
#     retrieve_mode=retrieve_mode,
#     k=3, #TODO: What if index is out of range?
# )

# if retrieve_mode == "top_k":
#     sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
    
#     for s, addresses in zip(inference_sentences_in, retrieved_contents):
#         display(s)
#         for a in addresses:
#             address_sims_df = inference.get_similarities_to_atomic_set(
#                 a, cleanup)
#             display(address_sims_df)
# elif retrieve_mode == "pooling":  
#     sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity']) 
      
#     for s, c in zip(inference_sentences_in, retrieved_contents):
#         sentence_sims_df = inference.get_similarities_to_atomic_set(
#             c, cleanup)
#         sentence_sims_df['sentence'] = [s] * len(sentence_sims_df)
#         sims_df = pd.concat([sims_df, sentence_sims_df])

#     sims_df = sims_df.sort_values(['sentence', 'similarity'], ascending=False) \
#                      .set_index(['sentence', 'token'])
    
#     display(sims_df)
# else:  # unrecognized
#     pass

## Address space statistics

In [23]:
# Show, for a few hand-picked addresses, how similar each one is to the atomic
# token set held by the cleanup. Alternative selections kept for reference:
#addresses = np.random.randint(0, len(memory.addresses), size=30)
#addresses = [56, 55, 54, 53, 52, 51]
# NOTE(review): in the saved outputs, addresses 244-247 show nearly identical
# top tokens, which suggests near-duplicate addresses (remove_dups was False).
addresses = [244, 245, 246, 247]
for address in addresses:
    section_title = md(f"### Address {address}")
    display(section_title)

    stored_address = memory.addresses[address]
    address_sims_df = inference.get_similarities_to_atomic_set(stored_address, cleanup)
    display(address_sims_df)


### Address 244

Unnamed: 0,token,similarity
0,pyrenula,0.373351
1,creek,0.289624
2,–,0.288052
3,mills,0.189147
4,born,0.17694
5,new,0.173557
6,nescopeck,0.170269
7,american,0.143547
8,two,0.142125
9,university,0.130298


### Address 245

Unnamed: 0,token,similarity
0,pyrenula,0.368364
1,creek,0.295643
2,–,0.291094
3,mills,0.188769
4,born,0.180576
5,nescopeck,0.173325
6,new,0.172754
7,american,0.142314
8,two,0.137888
9,watershed,0.135716


### Address 246

Unnamed: 0,token,similarity
0,pyrenula,0.371117
1,–,0.292351
2,creek,0.291638
3,mills,0.190354
4,born,0.179786
5,new,0.17642
6,nescopeck,0.173126
7,american,0.141062
8,two,0.138602
9,university,0.13164


### Address 247

Unnamed: 0,token,similarity
0,pyrenula,0.370287
1,–,0.299116
2,creek,0.295333
3,mills,0.185233
4,new,0.173719
5,nescopeck,0.172965
6,born,0.171152
7,american,0.142391
8,two,0.138714
9,watershed,0.134785


In [17]:
# Share of updates among all recorded updates and expansions.
# NOTE(review): presumably "update" means an existing address was modified and
# "expansion" means a new address was added — confirm against DSDM.
memory.n_updates / (memory.n_updates + memory.n_expansions)

0.6107950104730047

In [18]:
# Update counter kept by the memory after training.
memory.n_updates

39075

In [19]:
# Expansion counter kept by the memory after training. In the saved run it
# equals len(memory.addresses), with zero deletions recorded.
memory.n_expansions

24899

In [20]:
# Current number of addresses stored in the memory.
len(memory.addresses)

24899

In [21]:
# Deletion counter kept by the memory (prune_mode is None in this run).
memory.n_deletions

0

In [22]:
# Total addresses removed by remove_duplicates(); it is only called from the
# training loop when remove_dups is True.
dups_found

0