In [1]:
import sys
import os

# Resolve the parent of the current working directory.
# A notebook has no `__file__`; the previous code passed the *string* "__file__"
# to os.path.dirname, which always yields "" and therefore resolved relative to
# the working directory anyway. Spell that dependency out explicitly.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Add the parent directory to the system path to be able to import modules from 'lib'.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from datasets import load_dataset

from IPython.display import HTML, Markdown as md
import itertools

from lib.memory import DSDM
from lib.utils import configs, inference, learning, preprocess, utils 

import math
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import random

import pandas as pd
import pathlib

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 

from tqdm import tqdm
# Type checking
from typing import List 

In [3]:
# Load the 'train' split of the "20220301.en" Wikipedia configuration.
# Requesting the split directly returns the Dataset itself instead of building
# a DatasetDict of all splits and indexing into it afterwards.
wiki_dataset = load_dataset("wikipedia", "20220301.en", split="train")

Found cached dataset wikipedia (/Users/danielastelea/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Pick the compute device: CUDA when it is available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fix the random seed through the project helper (it reports the seed it uses).
utils.fix_seed()

Using seed: 42

In [5]:
# Cleanup memory: starts empty and is used to save the atomic HVs.
cleanup = dict()

In [6]:
# --- Settings forwarded to the DSDM constructor ---
address_size = 1_000
ema_time_period = 5_000
learning_rate_update = 0.5
temperature = 0.05
normalize = False
prune_mode = "fixed-size"
max_size_address_space = 4_000

# --- Setting used by the training loop (passed as `chunk_sizes`) ---
chunk_sizes = [5]

In [7]:
# Gather the constructor settings in one place, then build the memory from them.
dsdm_kwargs = {
    "address_size": address_size,
    "ema_time_period": ema_time_period,
    "learning_rate_update": learning_rate_update,
    "temperature": temperature,
    "normalize": normalize,
    "prune_mode": prune_mode,
    "max_size_address_space": max_size_address_space,
}

# Initialize memory.
memory = DSDM.DSDM(**dsdm_kwargs)

In [46]:
# Training: learn from the dataset entries with indices 5 through 9.
for i in range(5, 10):
    text = wiki_dataset[i]['text']

    # Run the project preprocessing on the raw article text; the result is
    # iterated one sentence (a collection of tokens) at a time below.
    sentences_tokens = preprocess.preprocess_text(text)

    for sentence_tokens in sentences_tokens:
        # Generate atomic HVs for tokens not yet known and add them to `cleanup`.
        learning.generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(
            memory.address_size, cleanup, sentence_tokens
        )

        # Learning: build the chunk representations of this sentence and save
        # them to memory.
        learning.generate_chunk_representations_and_save_them_to_memory(
            memory.address_size, cleanup, memory, sentence_tokens, chunk_sizes=chunk_sizes
        )

In [55]:
# Inference: the sentences used to query the memory.
inference_sentences = [
    'Best playing design award',
    'goes to',
    'autistic children.',
]

# Get a table with the token similarities for each sentence.
sims_df = inference.infer(memory.address_size, cleanup, memory, inference_sentences)

In [56]:
# Display the similarity table returned by inference.infer.
sims_df

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sentence,token,Unnamed: 2_level_1
goes to,anarchist,0.250885
goes to,goes,0.233868
goes to,anarchists,0.232188
goes to,without,0.223585
goes to,state,0.215361
goes to,social,0.202417
goes to,anarchism,0.184874
goes to,autism,0.18013
goes to,may,0.179609
goes to,would,0.176988


In [57]:
# Pull one stored entry out of the memory's address space; it is compared
# against every atomic HV in `cleanup` in the next cell.
# NOTE(review): 1000 is a hard-coded key/index into memory.addresses — confirm
# it is the intended entry and that it exists after pruning.
retrieved_content = memory.addresses[1000]

In [42]:
# Cosine similarity between the retrieved memory content and every atomic HV
# stored in the cleanup memory, one record per token.
# The records are collected first and the DataFrame is built once: growing a
# frame with pd.concat inside the loop copies all rows on every iteration and
# relies on concatenating with an empty frame, which pandas has deprecated.
similarity_records = [
    {
        'token': token,
        'similarity': thd.cosine_similarity(atomic_HC, retrieved_content).item(),
    }
    for token, atomic_HC in cleanup.items()
]

# `columns=` keeps the expected schema even when `cleanup` is empty.
# Most similar tokens first, with a fresh 0..n-1 index.
memory_sims_df = (
    pd.DataFrame(similarity_records, columns=['token', 'similarity'])
    .sort_values('similarity', ascending=False)
    .reset_index(drop=True)
)

In [43]:
# Show the first 10 rows of memory_sims_df (sorted by descending similarity when it was built).
memory_sims_df.head(10)

Unnamed: 0,token,similarity
0,uniform,0.470373
1,local,0.446057
2,principles,0.437959
3,individual,0.429222
4,autonomy,0.425295
5,orleans,0.117654
6,overwhelmingly,0.107291
7,meaningful,0.099311
8,newsrecord,0.096583
9,isolation,0.094099


In [44]:
# Share of the n_updates counter in the combined n_updates + n_expansions total.
# NOTE(review): raises ZeroDivisionError if both counters are still 0 (e.g. before training).
memory.n_updates / (memory.n_updates + memory.n_expansions)

0.16086827237585488

In [37]:
# Current value of the memory's update counter.
memory.n_updates

2164

In [38]:
# Current value of the memory's expansion counter.
memory.n_expansions

11288

In [39]:
# Number of entries currently held in memory.addresses
# (the memory was configured with max_size_address_space = 4000).
len(memory.addresses)

4000

In [58]:
# Display `text`, which the training loop leaves bound to the last article it read.
# NOTE(review): this prints the whole article; consider slicing it to keep the output short.
text

"The Academy Award for Best Production Design recognizes achievement for art direction in film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy of Motion Picture Arts and Sciences (AMPAS) being renamed the Designer's branch. Since 1947, the award is shared with the set decorator(s). It is awarded to the best interior design in a film.\n\nThe films below are listed with their production year (for example, the 2000 Academy Award for Best Art Direction is given to a film from 1999). In the lists below, the winner of the award for each year is shown first, followed by the other nominees in alphabetical order.\n\nSuperlatives\n\nWinners and nominees\n\n1920s\n\n1930s\n\n1940s\n\n1950s\n\n1960s\n\n1970s\n\n1980s\n\n1990s\n\n2000s\n\n2010s\n\n2020s\n\nSee also\n BAFTA Award for Best Production Design\n Critics' Choice Movie Award for Best Production 