# DSDM 



In [1]:
from IPython.display import display, Markdown as md
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import random

import pandas as pd
import pathlib
from preprocess import preprocess_text

from sklearn.neighbors import LocalOutlierFactor

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 
from tqdm import tqdm
from typing import List

In [2]:
# Set device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vector dimension
dim = 2000 

cleanup = {}

In [3]:
def fix_seed():
    seed = 42
    print("[ Using Seed : ", seed, " ]")

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    numpy.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def load_data(path, bs=0, shuffle=False):
    """Load data from file path. """
    text = pathlib.Path(path).read_text(encoding='utf-8')
    return text.splitlines()


def compute_distances_gpu(X, Y):
    """Compute Euclidean distance."""
    return torch.sqrt(-2 * torch.mm(X,Y.T) +
                    torch.sum(torch.pow(Y, 2),dim=1) +
                    torch.sum(torch.pow(X, 2),dim=1).view(-1,1))

In [4]:
class SONN(nn.Module):
    def __init__(self, address_size, learning_rate_ema, learning_rate_update, temperature, normalize=False):
        super(SONN, self).__init__()
        self.address_size = address_size
        self.addresses = torch.zeros(1, address_size).to(device)

        self.normalize = normalize

        self.ema = 0
        self.learning_rate_ema = learning_rate_ema
        self.learning_rate_update = learning_rate_update

        self.temperature = temperature
        
    def reset(self):
        self.ema = 0
        self.addresses = torch.zeros(1, self.address_size).to(device)
        
    def retrieve(self, query_address):
        with torch.no_grad():
            retrieved_content = torch.tensor([]).to(device)

            cos = torch.nn.CosineSimilarity()
            # Calculate the cosine similarities.
            if self.normalize: 
                similarities = cos(self.addresses.sgn(), query_address.sgn())
            else:
                similarities = cos(self.addresses, query_address)
            # Cosine distance tensor
            distances = 1 - similarities

            # Calculate the softmin weights.
            softmin_weights = F.softmin(distances/self.temperature, dim=-1)

            # Weight the memory addresses with the softmin weights.
            weighted_addresses = torch.matmul(softmin_weights, self.addresses.to(device)).view(-1)

            # Pool the weighted memory addresses to create the output.
            retrieved_content = torch.sum(weighted_addresses.view(1, -1), 0)

        return retrieved_content   

    
    def save(self, query_address):
        cos = torch.nn.CosineSimilarity()
        # Calculate the cosine similarities.
        if self.normalize: 
            similarities = cos(self.addresses.sgn(), query_address.sgn())
        else:
            similarities = cos(self.addresses, query_address)

        # Calculate the cosine distances.
        distances = 1 - similarities
        # Get the minimum distance and the corresponding address index.  
        min_distance = torch.min(distances, dim=0)[0].item()
        
        # Check if the minimum distance is bigger than the adaptive threshold.
        if min_distance > self.ema: # If the minimum distance is bigger, create a new address.
            # Add a new entry to the address matrix/tensor equal to the target address.
            self.addresses = torch.cat((self.addresses, query_address.view(1, -1)))
        else: # If the minimum distance is smaller or equal, update the memory addresses.
            # Apply the softmin function to the distance tensor the get the softmin weights.
            softmin_weights = F.softmin(distances/self.temperature, dim=-1)
            # Update the memory address space.
            self.addresses += self.learning_rate_update * torch.mul(softmin_weights.view(-1, 1), query_address - self.addresses)
        
        # Update the adaptive threshold.
        self.ema = self.learning_rate_ema * self.ema + (1 - self.learning_rate_ema) * min_distance

        return

In [5]:
def generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(tokens: List[str]) -> None:
    global cleanup, dim

    for token in tokens:
        # Check if the token has been encountered before by querying the cleanup memory.
        entry = cleanup.get(token)
        # If it hasn't, 
        if entry == None:
            # Generate a random HV representation for the token.
            atomic_HV = thd.MAPTensor.random(1, dim)[0]
            # Add the HV to the cleanup memory.
            cleanup[token] = atomic_HV
    
    return


def generate_chunk_representations_and_save_them_to_memory(memory, tokens, chunk_lengths=[], output=False):
    # "n" represents the no. of tokens in the sentence, which is also the max. no. of tokens 
    # that can be grouped to form a chunk.
    n = len(tokens)
    chunk_lengths = np.array(chunk_lengths, dtype=int)

    # Generate all possible chunks.
    if len(chunk_lengths) == 0:
        chunk_lengths = np.arange(1, n +  1)
    else:
        # Remove lengths which are bigger than the maximum chunk length.
        chunk_lengths = chunk_lengths[chunk_lengths <= n]
   
    for no_tokens in chunk_lengths:
        if output:
            print("no. of tokens: ", no_tokens)
        for i in range(n):
            if output:
                print("start index: ", i)
            # If there are not enough tokens left to construct a chunk comprised of "no_tokens", break. 
            if i + no_tokens > len(tokens):
                if output:
                    print("Not enough tokens left.")
                break 
            HC_representation = thd.MAPTensor.empty(1, dim)[0]

            # Construct HC representation.
            for j in range(no_tokens):
                if output:
                    print(tokens[i + j])
                HC_representation += cleanup[tokens[i + j]]

            # Save the chunk HC representation to memory.
            memory.save(HC_representation)

    return

def generate_query(tokens: list):
  n = len(tokens)
  HC_representation = thd.MAPTensor.empty(1, dim)

  for i in range(n):
    # The token hasn't been encountered before.
    if cleanup.get(tokens[i]) == None:
        # Generate an atomic HC for the unencountered token.
        atomic_HC = thd.MAPTensor.random(1, dim)[0]
        # Add the atomic HC to the cleanup memory.
        cleanup[tokens[i]] = atomic_HC
        # Add the atomic (i.e., superpose) HC to the chunk HC representation.
        HC_representation += atomic_HC
    # The token has been encountered before.
    else:
        HC_representation += cleanup[tokens[i]]

    return HC_representation

## Run experiment

In [6]:
# Load data.
lines_raw = load_data('../data/data.txt')

# Preprocess input. 
lines_tokens = []
for line_raw in lines_raw:
    if line_raw.rstrip():
        lines_tokens.append(preprocess_text(line_raw))


address_size = dim
learning_rate_ema = 0.13
learning_rate_update = 0.004
temperature = 2.3

# Create DSDM instance.
memory_unnormalized = SONN(address_size=address_size, learning_rate_ema=learning_rate_ema, learning_rate_update=learning_rate_update, temperature=temperature)
memory_normalized = SONN(address_size=address_size, learning_rate_ema=learning_rate_ema, learning_rate_update=learning_rate_update, temperature=temperature, normalize=True)

memories = {"normalized": memory_normalized, "unnormalized": memory_unnormalized}

# Flush cleanup memory.
cleanup = {}

# Train memory.
for sentence_tokens in lines_tokens:
    generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(sentence_tokens)
    for _, memory in memories.items():
        generate_chunk_representations_and_save_them_to_memory(memory, sentence_tokens, chunk_lengths=[2, 3])


In [13]:
def get_concepts(memory, sentence):
  display(md(f"**Sentence**: {sentence}"))
  retrieved_content = memory.retrieve(generate_query(preprocess_text(sentence)))

  sims_df = pd.DataFrame(columns=['token', 'sim'])

  for key, item in cleanup.items():
    sims_df = pd.concat([sims_df, pd.DataFrame([{'token': key, 'sim': thd.cosine_similarity(item,  retrieved_content).item()}])])

  display(sims_df.sort_values('sim', ascending=False).reset_index(drop=True))
  return

for memory_type, memory in memories.items():
    display(md(f"### <ins>{memory_type.capitalize()}</ins>"))
    get_concepts(memory, "The red house.")
    get_concepts(memory, "The house.")
    get_concepts(memory, "House.")


### <ins>Normalized</ins>

**Sentence**: The red house.

Unnamed: 0,chunk,sim
0,the,0.643602
1,red,0.574333
2,house,0.360556
3,blue,0.305549
4,yellow,0.088265
5,green,0.064607


**Sentence**: The house.

Unnamed: 0,chunk,sim
0,the,0.643602
1,red,0.574333
2,house,0.360556
3,blue,0.305549
4,yellow,0.088265
5,green,0.064607


**Sentence**: House.

Unnamed: 0,chunk,sim
0,the,0.619265
1,red,0.581967
2,house,0.390354
3,blue,0.307864
4,yellow,0.090937
5,green,0.060994


### <ins>Unnormalized</ins>

**Sentence**: The red house.

Unnamed: 0,chunk,sim
0,the,0.677186
1,house,0.503781
2,red,0.360327
3,blue,0.285576
4,yellow,0.202818
5,green,0.173281


**Sentence**: The house.

Unnamed: 0,chunk,sim
0,the,0.677186
1,house,0.503781
2,red,0.360327
3,blue,0.285576
4,yellow,0.202818
5,green,0.173281


**Sentence**: House.

Unnamed: 0,chunk,sim
0,house,0.629846
1,the,0.563269
2,red,0.352134
3,blue,0.297484
4,yellow,0.216424
5,green,0.175963


In [14]:
def print_memory_addresses(memory):
  print("Number of constructed addresses/abstract concepts: ", len(memory.addresses))


  for address in memory.addresses:
    sims_df = pd.DataFrame(columns=['token', 'sim'])
    for key, item in cleanup.items():
      sims_df = pd.concat([sims_df, pd.DataFrame([{'token': key, 'sim': thd.cosine_similarity(item,  address).item()}])])

    display(sims_df.sort_values('sim', ascending=False))

  return

for memory_type, memory in memories.items():
    display(md(f"### <ins>{memory_type.capitalize()}</ins>"))
    print_memory_addresses(memory)

### <ins>Normalized</ins>

Number of constructed addresses/abstract concepts:  18


Unnamed: 0,token,sim
0,house,0.782559
0,the,0.427431
0,red,0.370483
0,blue,0.183932
0,green,0.146675
0,yellow,0.118449


Unnamed: 0,token,sim
0,the,0.692903
0,red,0.692662
0,house,0.015816
0,green,0.013204
0,yellow,0.008608
0,blue,-0.00209


Unnamed: 0,token,sim
0,the,0.704527
0,green,0.702559
0,yellow,0.005492
0,blue,-0.003546
0,house,-0.006052
0,red,-0.008179


Unnamed: 0,token,sim
0,the,0.706042
0,blue,0.70532
0,house,0.020538
0,yellow,0.009622
0,green,-0.009937
0,red,-0.027966


Unnamed: 0,token,sim
0,the,0.693032
0,red,0.692598
0,green,0.011498
0,house,0.009419
0,yellow,0.00828
0,blue,-0.003616


Unnamed: 0,token,sim
0,the,0.69891
0,yellow,0.698225
0,house,0.026686
0,blue,0.023539
0,green,0.014271
0,red,-0.003595


Unnamed: 0,token,sim
0,the,0.706014
0,blue,0.705366
0,house,0.016972
0,yellow,0.008552
0,green,-0.009928
0,red,-0.029058


Unnamed: 0,token,sim
0,house,0.715187
0,blue,0.714496
0,yellow,0.048952
0,red,0.004839
0,the,-0.001454
0,green,-0.015389


Unnamed: 0,token,sim
0,the,0.706035
0,blue,0.705346
0,house,0.016603
0,yellow,0.00854
0,green,-0.009921
0,red,-0.029028


Unnamed: 0,token,sim
0,house,0.715204
0,blue,0.71448
0,yellow,0.048962
0,red,0.004888
0,the,-0.001818
0,green,-0.015384


Unnamed: 0,token,sim
0,red,0.692823
0,the,0.692817
0,green,0.011532
0,yellow,0.007246
0,house,0.006628
0,blue,-0.004311


Unnamed: 0,token,sim
0,house,0.709946
0,red,0.709913
0,yellow,0.047873
0,blue,0.014083
0,green,0.005626
0,the,-0.027337


Unnamed: 0,token,sim
0,red,0.692823
0,the,0.692818
0,green,0.011537
0,yellow,0.007237
0,house,0.006354
0,blue,-0.004317


Unnamed: 0,token,sim
0,house,0.709941
0,red,0.709918
0,yellow,0.047879
0,blue,0.014084
0,green,0.005629
0,the,-0.027605


Unnamed: 0,token,sim
0,red,0.692822
0,the,0.692819
0,green,0.011541
0,yellow,0.007229
0,house,0.006123
0,blue,-0.004322


Unnamed: 0,token,sim
0,house,0.709936
0,red,0.709923
0,yellow,0.047884
0,blue,0.014085
0,green,0.005631
0,the,-0.02783


Unnamed: 0,token,sim
0,red,0.692821
0,the,0.69282
0,green,0.011544
0,yellow,0.007222
0,house,0.005927
0,blue,-0.004327


Unnamed: 0,token,sim
0,house,0.709933
0,red,0.709927
0,yellow,0.047889
0,blue,0.014085
0,green,0.005633
0,the,-0.028021


### <ins>Unnormalized</ins>

Number of constructed addresses/abstract concepts:  12


Unnamed: 0,token,sim
0,house,0.635512
0,red,0.528618
0,the,0.517176
0,blue,0.17797
0,yellow,0.061964
0,green,0.053531


Unnamed: 0,token,sim
0,red,0.693047
0,the,0.692552
0,house,0.013339
0,green,0.012065
0,yellow,0.00787
0,blue,-0.002322


Unnamed: 0,token,sim
0,the,0.705472
0,green,0.701625
0,yellow,0.004768
0,blue,-0.003785
0,red,-0.006847
0,house,-0.008545


Unnamed: 0,token,sim
0,house,0.702627
0,green,0.698773
0,yellow,0.04604
0,red,0.027241
0,blue,0.014698
0,the,-0.002874


Unnamed: 0,token,sim
0,the,0.706587
0,blue,0.704778
0,house,0.019237
0,yellow,0.008971
0,green,-0.0099
0,red,-0.02658


Unnamed: 0,token,sim
0,house,0.715722
0,blue,0.713945
0,yellow,0.04927
0,red,0.007174
0,the,0.000872
0,green,-0.015348


Unnamed: 0,token,sim
0,house,0.71018
0,red,0.709671
0,yellow,0.048164
0,blue,0.015446
0,green,0.005599
0,the,-0.025085


Unnamed: 0,token,sim
0,the,0.699636
0,yellow,0.697492
0,house,0.027024
0,blue,0.024304
0,green,0.014273
0,red,-0.001987


Unnamed: 0,token,sim
0,house,0.72026
0,yellow,0.717783
0,blue,0.041653
0,red,0.031436
0,green,0.008314
0,the,-0.014338


Unnamed: 0,token,sim
0,house,0.587016
0,blue,0.58356
0,the,0.571832
0,yellow,0.026454
0,red,-0.017457
0,green,-0.01832


Unnamed: 0,token,sim
0,house,0.588192
0,red,0.565347
0,the,0.559937
0,yellow,0.025697
0,blue,0.009336
0,green,-0.001152


Unnamed: 0,token,sim
0,red,0.692948
0,the,0.692693
0,green,0.011542
0,yellow,0.007241
0,house,0.006275
0,blue,-0.004319
