# DSDM 



In [1]:
from IPython.display import display, Markdown as md
import ipywidgets as widgets
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import random

import pandas as pd
import pathlib
from preprocess import preprocess_text

from sklearn.neighbors import LocalOutlierFactor

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 
from tqdm import tqdm
from typing import List

In [2]:
# Pick the compute device: CUDA when it is available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dimensionality shared by all hypervectors.
dim = 2000

# Cleanup memory: maps each token to its atomic hypervector.
cleanup = dict()

In [3]:
def fix_seed():
    """Seed every random number generator used here with the fixed value 42.

    Covers Python's ``random`` module, NumPy and PyTorch (CPU and CUDA), and
    puts cuDNN into deterministic, non-benchmarking mode.
    """
    seed = 42
    print("[ Using Seed : ", seed, " ]")

    # Python and NumPy generators.
    random.seed(seed)
    numpy.random.seed(seed)

    # PyTorch generators (CPU, current CUDA device, all CUDA devices).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Make cuDNN pick deterministic algorithms and skip auto-tuning.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    
def load_data(path, bs=0, shuffle=False):
    """Read a UTF-8 text file and return its lines without line terminators.

    ``bs`` and ``shuffle`` are accepted but not used by this function.
    """
    with pathlib.Path(path).open(encoding='utf-8') as handle:
        contents = handle.read()
    lines = contents.splitlines()
    return lines


def compute_distances_gpu(X, Y):
    """Compute pairwise Euclidean distances between the rows of X and Y.

    Uses the expansion ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2.

    Args:
        X: 2-D tensor of shape (n, d).
        Y: 2-D tensor of shape (m, d).

    Returns:
        Tensor of shape (n, m) whose entry (i, j) is the distance between
        X[i] and Y[j].
    """
    squared = (-2 * torch.mm(X, Y.T) +
               torch.sum(torch.pow(Y, 2), dim=1) +
               torch.sum(torch.pow(X, 2), dim=1).view(-1, 1))
    # Round-off can push the squared distance of (near-)identical rows
    # slightly below zero, which would make sqrt return NaN.
    return torch.sqrt(torch.clamp(squared, min=0))

In [4]:
class SONN(nn.Module):
    """Content-addressable memory made of a growing matrix of address vectors.

    ``save`` either appends the query as a new address or nudges the existing
    addresses towards it, depending on how the smallest cosine distance
    compares to an exponential moving average (EMA) of past minimum
    distances. ``retrieve`` returns a softmin-weighted mix of all addresses.

    Args:
        address_size: Dimensionality of every address vector.
        ema_time_period: Period N of the EMA; the smoothing factor is 2 / (N + 1).
        learning_rate_update: Step size used when existing addresses are updated.
        temperature: Divisor applied to the distances before the softmin.
        normalize: If True, distances are computed on the element-wise sign of
            the addresses and of the query instead of on the raw vectors.
    """

    def __init__(self, address_size, ema_time_period, learning_rate_update, temperature, normalize=False):
        super(SONN, self).__init__()
        self.address_size = address_size
        # The memory starts out with a single all-zero address row.
        self.addresses = torch.zeros(1, address_size).to(device)

        self.normalize = normalize

        # Adaptive threshold: EMA of the minimum distances seen by save().
        self.ema = 0
        self.ema_time_period = ema_time_period
        self.ema_temperature = 2 / (self.ema_time_period + 1)

        self.learning_rate_update = learning_rate_update

        self.temperature = temperature

    def reset(self):
        """Reset the EMA threshold and shrink the memory to one zero address."""
        self.ema = 0
        self.addresses = torch.zeros(1, self.address_size).to(device)

    def _cosine_distances(self, query_address):
        """Return the cosine distance from the query to every stored address."""
        cos = torch.nn.CosineSimilarity()
        if self.normalize:
            similarities = cos(self.addresses.sgn(), query_address.sgn())
        else:
            similarities = cos(self.addresses, query_address)
        return 1 - similarities

    def retrieve(self, query_address):
        """Return the softmin-weighted combination of all addresses for a query.

        Args:
            query_address: Query vector with ``address_size`` elements.

        Returns:
            1-D tensor with ``address_size`` elements.
        """
        with torch.no_grad():
            # Keep the query on the same device as the stored addresses.
            query_address = query_address.to(self.addresses.device)

            distances = self._cosine_distances(query_address)

            # Closer addresses receive larger weights.
            softmin_weights = F.softmin(distances / self.temperature, dim=-1)

            # Weight the memory addresses and pool them into one vector.
            retrieved_content = torch.matmul(softmin_weights, self.addresses).view(-1)

        return retrieved_content

    def save(self, query_address):
        """Store a query: append it as a new address or update existing ones.

        The EMA threshold is updated with the current minimum distance first.
        If the minimum distance is still larger than the updated threshold the
        query becomes a new address; otherwise every address is moved towards
        the query in proportion to its softmin weight.

        Args:
            query_address: Vector with ``address_size`` elements.
        """
        # Keep the query on the same device as the stored addresses.
        query_address = query_address.to(self.addresses.device)

        distances = self._cosine_distances(query_address)
        min_distance = torch.min(distances, dim=0)[0].item()

        # Update the EMA (adaptive threshold) with the current chunk.
        self.ema += self.ema_temperature * (min_distance - self.ema)

        if min_distance > self.ema:
            # The query is far from everything stored: add it as a new address.
            self.addresses = torch.cat((self.addresses, query_address.view(1, -1)))
        else:
            # The query is close enough: pull the addresses towards it.
            softmin_weights = F.softmin(distances / self.temperature, dim=-1)
            self.addresses += self.learning_rate_update * torch.mul(softmin_weights.view(-1, 1), query_address - self.addresses)

        return

In [5]:
def generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(tokens: List[str]) -> None:
    """Give every not-yet-seen token a random atomic hypervector.

    Tokens that already have an entry in the module-level ``cleanup`` memory
    are left untouched, so a token keeps the same hypervector once assigned.

    Args:
        tokens: Tokens to register in the cleanup memory.
    """
    for token in tokens:
        # A membership test avoids comparing a stored tensor against None.
        if token not in cleanup:
            # Generate a random HV for the token and add it to the cleanup memory.
            cleanup[token] = thd.MAPTensor.random(1, dim)[0]

    return


def generate_chunk_representations_and_save_them_to_memory(memory, tokens, chunk_lengths=[], output=False):
    """Bundle every contiguous chunk of tokens into one HV and save it to memory.

    Args:
        memory: Object exposing ``save(hypervector)``.
        tokens: Tokens of one sentence; each must have an entry in ``cleanup``.
        chunk_lengths: Chunk sizes to generate. Empty means every size from 1
            up to the sentence length; sizes larger than the sentence are dropped.
        output: If True, print the chunk sizes, start indices and tokens visited.
    """
    # The sentence length is also the largest possible chunk size.
    total_tokens = len(tokens)
    requested = np.array(chunk_lengths, dtype=int)

    if len(requested) == 0:
        # No explicit sizes: generate all possible chunks.
        sizes = np.arange(1, total_tokens + 1)
    else:
        # Discard sizes that do not fit into the sentence.
        sizes = requested[requested <= total_tokens]

    for size in sizes:
        if output:
            print("no. of tokens: ", size)
        for start in range(total_tokens):
            if output:
                print("start index: ", start)
            # Stop sliding once the window would run past the end of the sentence.
            if start + size > total_tokens:
                if output:
                    print("Not enough tokens left.")
                break

            # Superpose the atomic HVs of the tokens inside the window.
            chunk_HV = thd.MAPTensor.empty(1, dim)[0]
            for token in tokens[start:start + size]:
                if output:
                    print(token)
                chunk_HV += cleanup[token]

            # Save the chunk representation to memory.
            memory.save(chunk_HV)

    return

def generate_query(tokens: list):
    """Build a query hypervector by superposing the atomic HVs of all tokens.

    Tokens without an entry in the module-level ``cleanup`` memory get a new
    random atomic hypervector, which is also stored in ``cleanup``.

    Args:
        tokens: Tokens of the query sentence.

    Returns:
        Hypervector of shape (1, dim). For an empty token list this is the
        unmodified ``thd.MAPTensor.empty(1, dim)`` accumulator.
    """
    HC_representation = thd.MAPTensor.empty(1, dim)

    for token in tokens:
        atomic_HC = cleanup.get(token)
        # The token hasn't been encountered before.
        if atomic_HC is None:
            # Generate an atomic HC for it and add it to the cleanup memory.
            atomic_HC = thd.MAPTensor.random(1, dim)[0]
            cleanup[token] = atomic_HC
        # Superpose the token onto the query representation.
        HC_representation += atomic_HC

    # Return only after every token has been added; returning from inside the
    # loop would encode just the first token.
    return HC_representation

In [6]:
# Pruning code copied from the original DSDM implementation.
def prune(self):
    """Prune addresses and their contents once the memory exceeds ``self.N_prune``.

    Reads ``self.Address``, ``self.M``, ``self.N_prune``, ``self.n_neighbors``,
    ``self.contamination`` and ``self.prune_mode``; replaces ``self.Address``
    and ``self.M`` with the pruned tensors. Does nothing while the number of
    addresses is at most ``self.N_prune``.

    Candidates are ranked by the Local Outlier Factor score
    (``negative_outlier_factor_``), highest first.

    * ``"naive"``: remove the top-ranked candidates until ``N_prune`` remain.
    * ``"balance"``: walk the ranking and remove a candidate only while its
      class (argmax over its row of ``self.M``) still holds more than
      ``N_prune // n_class`` nodes. If the ranking is exhausted first, fewer
      nodes than requested are removed.
    """
    N_pruning = self.N_prune  # Maximum no. of (address) nodes the memory can have.
    if len(self.Address) <= N_pruning:
        return

    n_class = self.M.size(1)
    A = self.Address
    M = self.M

    # The maximum number of nodes has been exceeded: score every address with LOF.
    clf = LocalOutlierFactor(n_neighbors=min(len(A), self.n_neighbors), contamination=self.contamination)
    clf.fit_predict(A.cpu())
    x_scor = torch.tensor(clf.negative_outlier_factor_)

    prun_N_addr = len(A) - N_pruning  # No. of addresses that must be pruned out.
    keep = [True] * len(A)            # keep[i] is False for rows that get removed.

    if self.prune_mode == "naive":
        _, ind = torch.topk(x_scor, prun_N_addr)
        for position in ind.tolist():
            keep[position] = False
    elif self.prune_mode == "balance":
        # Prune per class instead of only the highest-ranked nodes overall.
        per_class_quota = N_pruning // n_class  # Max. number of allowed nodes per class.
        _, ind = torch.sort(x_scor, descending=True)
        arg_m = torch.argmax(M, dim=1)       # Class of each node.
        N_remaining = torch.bincount(arg_m)  # Current no. of nodes per class.

        count = prun_N_addr
        # Start at the top-ranked candidate and stop at the end of the ranking
        # rather than indexing past it.
        for position in ind.tolist():
            if count == 0:
                break
            node_class = int(arg_m[position])
            if N_remaining[node_class] > per_class_quota:
                N_remaining[node_class] -= 1
                keep[position] = False
                count -= 1
    else:
        # Unknown mode: leave the memory untouched.
        return

    keep_mask = torch.tensor(keep, dtype=torch.bool)
    self.M = self.M[keep_mask]              # Delete content from address.
    self.Address = self.Address[keep_mask]  # Delete address.
    return

## Run experiment

In [7]:
# Load data.
lines_raw = load_data('../data/data.txt')

# Preprocess input, skipping empty (whitespace-only) lines.
lines_tokens = [preprocess_text(line_raw) for line_raw in lines_raw if line_raw.rstrip()]


address_size = dim
ema_time_period = 500  # EMA period; the smoothing factor is 2 / (ema_time_period + 1).
learning_rate_update = 0.004
temperature = 2.3

# Create DSDM instances: one compares raw vectors, the other their element-wise signs.
memory_unnormalized = SONN(address_size=address_size, ema_time_period=ema_time_period, learning_rate_update=learning_rate_update, temperature=temperature)
memory_normalized = SONN(address_size=address_size, ema_time_period=ema_time_period, learning_rate_update=learning_rate_update, temperature=temperature, normalize=True)

memories = {"normalized": memory_normalized, "unnormalized": memory_unnormalized}

# Seed all RNGs so the randomly generated atomic hypervectors (and therefore
# the learned addresses) are the same on every run.
fix_seed()

# Flush cleanup memory.
cleanup = {}

# Train memory.
for sentence_tokens in lines_tokens:
    generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(sentence_tokens)
    for memory in memories.values():
        generate_chunk_representations_and_save_them_to_memory(memory, sentence_tokens, chunk_lengths=[2, 3])

In [8]:
def get_similarities_to_atomic_HVs(memory, sentence):
    """Query the memory with a sentence and record similarities to all tokens.

    The sentence is preprocessed, turned into a query hypervector and passed
    to ``memory.retrieve``. One row per token in ``cleanup`` (columns
    ``sentence``, ``token``, ``similarity``) is appended to the global
    ``sims_df``; ``similarity`` is the cosine similarity between the token's
    atomic HV and the retrieved content. The row index of ``sims_df`` carries
    no meaning.

    Args:
        memory: Object exposing ``retrieve(query)``.
        sentence: Raw query sentence.
    """
    global sims_df
    retrieved_content = memory.retrieve(generate_query(preprocess_text(sentence)))

    # Collect all rows first and concatenate once; growing the frame row by
    # row copies it on every iteration.
    rows = [{'sentence': sentence,
             'token': token,
             'similarity': thd.cosine_similarity(atomic_HC, retrieved_content).item()}
            for token, atomic_HC in cleanup.items()]
    if rows:
        sims_df = pd.concat([sims_df, pd.DataFrame(rows)])

    return

# One output widget per memory variant so the tables render side by side.
out1 = widgets.Output()
out2 = widgets.Output()

example_sentences = ("The red house.", "The house.", "House.")

for out, memory_type in zip((out1, out2), memories):
    memory = memories[memory_type]
    # get_similarities_to_atomic_HVs appends to the global sims_df, so start
    # from an empty frame for every memory.
    sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity'])

    with out:
        display(md(f"### <ins>{memory_type.capitalize()}</ins>"))
        for sentence in example_sentences:
            get_similarities_to_atomic_HVs(memory, sentence)

        ranked = sims_df.sort_values(['sentence', 'similarity'], ascending=False)
        display(ranked.set_index(['sentence', 'token']))

widgets.HBox([out1, out2])  # Display output.

HBox(children=(Output(), Output()))

In [41]:
def get_most_similar_HVs(sims_df, delta_threshold=0.15):
    """Print the tokens that rank above the first large drop in similarity.

    Rows are sorted by ``similarity`` in descending order. The first row whose
    similarity is more than ``delta_threshold`` below the previous row marks
    the cut; the tokens of all rows before it are printed, space-separated, on
    one line. If no drop exceeds the threshold every token is printed; an
    empty frame prints an empty line.

    Args:
        sims_df: DataFrame with ``token`` and ``similarity`` columns.
        delta_threshold: Minimum drop between neighbouring similarities that
            ends the group of most similar tokens.
    """
    # Sorting is required: the delta only makes sense between neighbouring ranks.
    ranked = sims_df.sort_values('similarity', ascending=False).reset_index(drop=True)
    similarity = ranked['similarity'].astype(float)

    # Drop relative to the previous (more similar) token; the first token has
    # no predecessor, so its delta is 0.
    delta = (similarity.shift(1) - similarity).fillna(0)

    # Position of the first drop larger than the threshold. Without such a
    # drop keep every token instead of failing on an empty selection.
    exceeds = (delta > delta_threshold).to_numpy()
    idx_cut_in = int(exceeds.argmax()) if exceeds.any() else len(ranked)

    # Get concept as a string.
    print(" ".join(ranked['token'].head(idx_cut_in)))
    return
    

def print_memory_addresses(memory):
    """Print the number of addresses and the closest tokens of each address.

    For every row of ``memory.addresses`` the cosine similarity to each atomic
    HV in ``cleanup`` is computed and handed to ``get_most_similar_HVs``,
    which prints the most similar tokens.

    Args:
        memory: Object exposing an iterable ``addresses`` tensor.
    """
    print("Number of constructed addresses/abstract concepts: ", len(memory.addresses))

    for address in memory.addresses:
        # Build the frame in one go instead of concatenating row by row.
        records = [{'token': key, 'similarity': thd.cosine_similarity(item, address).item()}
                   for key, item in cleanup.items()]
        sims_df = pd.DataFrame(records, columns=['token', 'similarity'])

        get_most_similar_HVs(sims_df)
    return


# Report the learned addresses of every memory variant under its own heading.
for memory_type in memories:
    memory = memories[memory_type]
    heading = f"### <ins>{memory_type.capitalize()}</ins>"
    display(md(heading))
    print_memory_addresses(memory)

### <ins>Normalized</ins>

Number of constructed addresses/abstract concepts:  26
the house red
the red
house red
the red house
the green
house green
green house the
the blue
house blue
blue house the
house red
the yellow
house yellow
the yellow house
the blue
house blue
the blue
house blue
the red
red house
the red
red house
the red
red house
the red
red house


### <ins>Unnormalized</ins>

Number of constructed addresses/abstract concepts:  13
red the house
red the
red house
the red house
the green
house green
green house the
the blue
house blue
blue house the
the yellow
house yellow
the yellow house
