# DSDM 



In [1]:
from IPython.display import display, Markdown as md
import ipywidgets as widgets
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import random

import pandas as pd
import pathlib
from preprocess import preprocess_text

from sklearn.metrics import pairwise_distances
from sklearn.neighbors import LocalOutlierFactor

import torch
import torchhd as thd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F 
from tqdm import tqdm
from typing import List

In [3]:
def fix_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), then switches cuDNN to deterministic mode
    with auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42, the
            value that used to be hard-coded.
    """
    print("[ Using Seed : ", seed, " ]")

    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all CUDA devices, which makes a separate call for the current
    # device redundant. Safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def load_data(path, bs=0, shuffle=False):
    """Read a UTF-8 text file and return its lines without line terminators.

    Args:
        path: Location of the text file.
        bs: Accepted but not used by this function.
        shuffle: Accepted but not used by this function.

    Returns:
        A list with one string per line of the file.
    """
    with pathlib.Path(path).open(mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()


def compute_distances_gpu(X, Y):
    """Compute the pairwise Euclidean distances between rows of X and rows of Y.

    Uses the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x.y so the whole
    computation is a single matrix product plus two row-wise reductions.

    Args:
        X: 2-D tensor of shape (n, d).
        Y: 2-D tensor of shape (m, d).

    Returns:
        Tensor of shape (n, m) whose entry (i, j) is the distance between
        X[i] and Y[j].
    """
    squared_norms_x = torch.sum(torch.pow(X, 2), dim=1).view(-1, 1)
    squared_norms_y = torch.sum(torch.pow(Y, 2), dim=1)
    squared_distances = -2 * torch.mm(X, Y.T) + squared_norms_y + squared_norms_x
    # Floating-point cancellation can leave tiny negative values (typically
    # for identical or nearly identical rows); sqrt would turn them into NaN.
    return torch.sqrt(torch.clamp(squared_distances, min=0))

In [2]:
# Seed all random number generators before any random hypervector is drawn.
fix_seed()

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Dimensionality of every hypervector (HV).
dim = 2000

# Cleanup memory: maps each token to its atomic HV.
cleanup = dict()

In [10]:
# DSDM class
class SONN(nn.Module):
    """Associative memory whose rows ("addresses") are built from saved vectors.

    ``save`` either appends the incoming vector as a new address or pulls the
    existing addresses towards it, depending on how the smallest cosine
    distance compares with an exponential moving average (EMA) of previous
    minimum distances. ``retrieve`` returns a softmin-weighted sum of all
    addresses.

    Args:
        address_size: Stored on the instance; not otherwise used by the class.
        ema_time_period: Period N of the EMA; its smoothing factor is
            2 / (N + 1).
        learning_rate_update: Step size applied when existing addresses are
            moved towards a saved vector.
        temperature: Divisor applied to the cosine distances before softmin.
        normalize: When True, cosine similarities are computed on the
            element-wise sign of the addresses and of the query.
        device: Optional torch device holding the address matrix. When
            omitted, the module-level ``device`` variable is used if it
            exists; otherwise CUDA is chosen when available, else the CPU.
    """

    def __init__(self, address_size, ema_time_period, learning_rate_update, temperature, normalize=False, device=None):
        super(SONN, self).__init__()
        if device is None:
            # Keep honouring the notebook-level ``device`` (the parameter
            # shadows it here, hence the explicit globals() lookup), but do
            # not crash when the class is used without that global.
            device = globals().get("device")
            if device is None:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.address_size = address_size
        # Starts empty; the first save() instantiates the memory.
        self.addresses = torch.tensor([]).to(self.device)

        self.normalize = normalize

        self.ema = 0
        self.ema_time_period = ema_time_period
        self.ema_temperature = 2 / (self.ema_time_period + 1)

        self.learning_rate_update = learning_rate_update

        self.temperature = temperature

    def _as_query_row(self, query_address):
        """Return the query as a (1, d) row on the same device as the addresses."""
        return query_address.to(self.addresses.device).reshape(1, -1)

    def _cosine_distances(self, query_row):
        """Return the cosine distance from ``query_row`` to every address."""
        cos = torch.nn.CosineSimilarity()
        if self.normalize:
            similarities = cos(self.addresses.sgn(), query_row.sgn())
        else:
            similarities = cos(self.addresses, query_row)
        return 1 - similarities

    def retrieve(self, query_address):
        """Return the softmin-weighted sum of all addresses for a query.

        Args:
            query_address: Query vector with d elements, shaped (d,) or (1, d).

        Returns:
            1-D tensor with d elements.

        Raises:
            RuntimeError: If nothing has been saved to the memory yet.
        """
        if self.addresses.shape[0] == 0:
            raise RuntimeError("Cannot retrieve from an empty memory; call save() first.")

        with torch.no_grad():
            query_row = self._as_query_row(query_address)
            distances = self._cosine_distances(query_row)

            # Closer addresses get larger weights.
            softmin_weights = F.softmin(distances / self.temperature, dim=-1)

            # Weighted sum of the addresses.
            retrieved_content = torch.matmul(softmin_weights, self.addresses).view(-1)

        return retrieved_content

    def save(self, query_address):
        """Store a vector, either as a new address or by updating existing ones.

        Args:
            query_address: Vector with d elements, shaped (d,) or (1, d).
        """
        query_row = self._as_query_row(query_address)

        # The memory is instantiated with the first observation.
        if self.addresses.shape[0] == 0:
            self.addresses = torch.cat((self.addresses, query_row))
            return

        distances = self._cosine_distances(query_row)
        min_distance = distances.min().item()

        # Fold the current minimum distance into the adaptive threshold.
        self.ema += self.ema_temperature * (min_distance - self.ema)

        if min_distance > self.ema:
            # Farther than the adaptive threshold: append as a new address.
            self.addresses = torch.cat((self.addresses, query_row))
        else:
            # Close enough: move every address towards the saved vector,
            # weighted by its softmin proximity.
            softmin_weights = F.softmin(distances / self.temperature, dim=-1)
            self.addresses += self.learning_rate_update * torch.mul(softmin_weights.view(-1, 1), query_row - self.addresses)

        return

In [5]:
def generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(tokens: List[str]) -> None:
    """Ensure every token has an atomic hypervector in the cleanup memory.

    For each token without an entry in the module-level ``cleanup`` dict, a
    random hypervector of dimension ``dim`` is drawn and stored under that
    token. Existing entries are left untouched.

    Args:
        tokens: Tokens to register.
    """
    for token in tokens:
        # Identity check: the stored values are tensors, for which ``==``
        # is not a plain None test.
        if cleanup.get(token) is None:
            # Generate a random HV for the token and remember it.
            cleanup[token] = thd.MAPTensor.random(1, dim)[0]

    return


def generate_chunk_representations_and_save_them_to_memory(memory, tokens, chunk_lengths=[], output=False):
    """Build a hypervector for every contiguous token chunk and save it.

    For each requested chunk length, a window of that many consecutive tokens
    slides over ``tokens``. The atomic hypervectors of the tokens in the
    window (looked up in the module-level ``cleanup`` dict) are summed and the
    result is passed to ``memory.save``.

    Args:
        memory: Object exposing ``save(hypervector)``.
        tokens: Tokens of one sentence; every token must be in ``cleanup``.
        chunk_lengths: Chunk lengths to generate. When empty, every length
            from 1 to ``len(tokens)`` is used. Lengths above ``len(tokens)``
            or below 1 are ignored.
        output: When True, print progress information.
    """
    # "n" is the no. of tokens in the sentence, which is also the largest
    # possible chunk length.
    n = len(tokens)
    lengths = np.array(chunk_lengths, dtype=int)

    if len(lengths) == 0:
        # Generate all possible chunks.
        lengths = np.arange(1, n + 1)
    else:
        # Drop lengths that do not fit the sentence. Lengths below 1 are
        # dropped as well: they would store all-zero vectors in the memory.
        lengths = lengths[(lengths >= 1) & (lengths <= n)]

    for no_tokens in lengths:
        if output:
            print("no. of tokens: ", no_tokens)
        for i in range(n):
            if output:
                print("start index: ", i)
            # Stop once fewer than "no_tokens" tokens remain.
            if i + no_tokens > n:
                if output:
                    print("Not enough tokens left.")
                break

            # Superpose the atomic HVs of the tokens in the window.
            HC_representation = thd.MAPTensor.empty(1, dim)[0]
            for position in range(i, i + no_tokens):
                if output:
                    print(tokens[position])
                HC_representation += cleanup[tokens[position]]

            # Save the chunk HC representation to memory.
            memory.save(HC_representation)

    return


def generate_query(tokens: list):
    """Build a query hypervector by superposing the tokens' atomic hypervectors.

    Tokens that are missing from the module-level ``cleanup`` dict get a
    freshly drawn random hypervector, which is also stored in ``cleanup``.

    Args:
        tokens: Tokens of the query sentence.

    Returns:
        Hypervector of shape (1, dim) holding the sum of the atomic
        hypervectors of all tokens.
    """
    HC_representation = thd.MAPTensor.empty(1, dim)

    for token in tokens:
        atomic_HC = cleanup.get(token)
        # Identity check: stored values are tensors, so ``==`` is not a
        # plain None test.
        if atomic_HC is None:
            # The token hasn't been encountered before: generate an atomic
            # HC for it and add it to the cleanup memory.
            atomic_HC = thd.MAPTensor.random(1, dim)[0]
            cleanup[token] = atomic_HC
        # Add (i.e., superpose) the atomic HC to the query representation.
        HC_representation += atomic_HC

    return HC_representation

In [6]:
# Note: the pruning code below is copied from the original DSDM implementation.
def prune(self):
    """Remove addresses (and their content rows) once the memory is too large.

    Nothing happens unless ``len(self.Address)`` exceeds ``self.N_prune``.
    Otherwise a Local Outlier Factor model is fitted on the addresses and
    its ``negative_outlier_factor_`` scores rank the pruning candidates,
    highest score first.

    * ``self.prune_mode == "naive"``: the highest-scoring addresses are
      removed until exactly ``self.N_prune`` remain.
    * ``self.prune_mode == "balance"``: candidates are visited in the same
      order, but one is only removed while its class (the argmax of its row
      in ``self.M``) still holds more than ``N_prune // n_class`` addresses.
      If too few candidates qualify, fewer addresses are removed.

    Any other mode leaves the memory unchanged.

    Reads ``self.N_prune``, ``self.n_neighbors``, ``self.contamination`` and
    ``self.prune_mode``; rebinds ``self.M`` and ``self.Address``.
    """
    N_pruning = self.N_prune  # Maximum no. of (address) nodes the memory can have.
    n_class = self.M.size(1)

    # Only prune once the maximum number of nodes has been exceeded.
    if len(self.Address) <= N_pruning:
        return

    clf = LocalOutlierFactor(n_neighbors=min(len(self.Address), self.n_neighbors), contamination=self.contamination)
    A = self.Address
    M = self.M
    # Fitting populates negative_outlier_factor_; the labels are not needed.
    clf.fit_predict(A.cpu())
    x_scor = torch.tensor(clf.negative_outlier_factor_)

    prun_N_addr = len(A) - N_pruning  # No. of addresses that must be pruned out.
    keep = [True] * len(A)  # keep[i] is False for addresses to delete.

    if self.prune_mode == "naive":
        # "Naive" pruning mode: drop the highest-scoring addresses.
        _, ind = torch.topk(x_scor, prun_N_addr)
        for i in ind.tolist():
            keep[i] = False

    elif self.prune_mode == "balance":
        # "Balance" pruning mode.
        # Idea: prune from each class instead of only the highest-scoring nodes.
        mean_addr = N_pruning // n_class  # Max. number of allowed nodes per class.
        _, ind = torch.sort(x_scor, descending=True)

        arg_m = torch.argmax(M, axis=1)  # Class of each address.
        N_remaining = torch.bincount(arg_m)  # No. of addresses per class.

        count = prun_N_addr
        # Start at the best-ranked candidate and never run past the last one.
        for i in ind.tolist():
            if count == 0:
                break
            address_class = int(arg_m[i])
            if N_remaining[address_class] > mean_addr:
                N_remaining[address_class] -= 1
                keep[i] = False
                count -= 1

    else:
        return

    self.M = self.M[keep]  # Delete content from address.
    self.Address = self.Address[keep]  # Delete address.
    return

## Run experiment

In [None]:
def get_similarities_to_atomic_HVs(memory, sentence):
    """Append the similarity of every known token to a sentence's retrieval.

    The sentence is preprocessed, turned into a query hypervector and used
    to retrieve content from ``memory``. One row per entry of the
    module-level ``cleanup`` dict, with columns ``sentence``, ``token`` and
    ``similarity``, is appended to the module-level ``sims_df`` DataFrame.

    Args:
        memory: Object exposing ``retrieve(query)``.
        sentence: Raw sentence to query the memory with.
    """
    global sims_df
    retrieved_content = memory.retrieve(generate_query(preprocess_text(sentence)))

    # Collect all rows first and concatenate once; growing the frame one row
    # at a time copies it on every iteration.
    rows = [
        {'sentence': sentence,
         'token': token,
         'similarity': thd.cosine_similarity(atomic_HC, retrieved_content).item()}
        for token, atomic_HC in cleanup.items()
    ]
    if rows:
        sims_df = pd.concat([sims_df, pd.DataFrame(rows)])

    return

def get_most_similar_HVs(sims_df, delta_threshold=0.15):
    """Return the tokens that rank above the first large similarity gap.

    Rows are ranked by descending ``similarity``. The ranking is cut just
    before the first row whose similarity is more than ``delta_threshold``
    below that of the row ranked directly above it. When no such gap exists,
    all tokens are treated as equally represented and all are returned.

    Args:
        sims_df: DataFrame with ``token`` and ``similarity`` columns.
        delta_threshold: Minimum drop between consecutive ranked
            similarities that ends the group of most similar tokens.

    Returns:
        NumPy array with the selected tokens in ascending sorted order.
    """
    # The deltas only make sense on a frame sorted by similarity.
    ranked = sims_df.sort_values('similarity', ascending=False).reset_index(drop=True)

    # Drop relative to the previous (more similar) token; the first token
    # has no predecessor, so its drop is 0.
    delta = (ranked['similarity'].shift(1) - ranked['similarity']).fillna(0)

    # Position of the first token separated from its predecessor by a gap.
    gap_positions = delta[delta > delta_threshold].index
    if len(gap_positions) > 0:
        idx_cut_in = gap_positions[0]
    else:
        # No gap: all tokens are equally represented, so keep them all.
        idx_cut_in = len(ranked)

    # Copy before sorting so the ranked frame is not modified.
    concept = ranked['token'].head(idx_cut_in).values.copy()
    concept.sort()
    return concept
    

def display_and_get_memory_addresses(memory):
    """Display each address's token similarities and collect its concept.

    For every row of ``memory.addresses`` the cosine similarity to each
    atomic hypervector in the module-level ``cleanup`` dict is computed and
    displayed as a table sorted by descending similarity. The concept of the
    address is obtained from ``get_most_similar_HVs`` on that table.

    Args:
        memory: Object exposing an ``addresses`` tensor.

    Returns:
        DataFrame with one row per address and the columns
        ``memory_address`` and ``memory_concept``.
    """
    print("Number of constructed addresses/abstract concepts: ", len(memory.addresses))

    # Rows are gathered in lists and turned into frames in one go; growing a
    # frame with pd.concat inside the loops copies it on every iteration.
    concept_rows = []

    for address in memory.addresses:
        sims_df = pd.DataFrame(
            [{'token': key, 'similarity': thd.cosine_similarity(item, address).item()}
             for key, item in cleanup.items()],
            columns=['token', 'similarity'],
        )

        display(sims_df.sort_values('similarity', ascending=False).reset_index(drop=True))
        concept = get_most_similar_HVs(sims_df)
        concept_rows.append({'memory_address': address, 'memory_concept': concept})

    concepts_df = pd.DataFrame(concept_rows, columns=['memory_address', 'memory_concept'])
    return concepts_df

In [None]:
# Hyperparameters shared by both DSDM instances.
address_size = dim
ema_time_period = 500  # Period N of the EMA; SONN derives its smoothing factor as 2 / (N + 1).
learning_rate_update = 0.004
temperature = 2.3

shared_hyperparameters = dict(
    address_size=address_size,
    ema_time_period=ema_time_period,
    learning_rate_update=learning_rate_update,
    temperature=temperature,
)

# One memory compares raw vectors, the other compares their element-wise signs.
memory_unnormalized = SONN(**shared_hyperparameters)
memory_normalized = SONN(normalize=True, **shared_hyperparameters)

memories = {"normalized": memory_normalized, "unnormalized": memory_unnormalized}

In [7]:
# Read the initial training sentences, one per line.
lines_raw = load_data('../data/initial_training_data.txt')

# Tokenise every line that is not blank.
lines_tokens = [preprocess_text(line_raw) for line_raw in lines_raw if line_raw.rstrip()]

# Start from an empty cleanup memory.
cleanup = {}

# Train both memories (normalized & unnormalized) with the initial training data.
for sentence_tokens in lines_tokens:
    generate_atomic_HVs_from_tokens_and_add_them_to_cleanup(sentence_tokens)
    # Build the chunks of the sentence and save them to each memory.
    for memory in memories.values():
        generate_chunk_representations_and_save_them_to_memory(memory, sentence_tokens, chunk_lengths=[1, 2, 3])

### Inference

In [8]:
# One output widget per memory so the results can be shown side by side.
out1, out2 = widgets.Output(), widgets.Output()

# Sentences used to query each memory, in display order.
query_sentences = ("The red house.", "The house.", "House.", "The purple house.")

for out, memory_type in zip([out1, out2], memories):
    memory = memories[memory_type]
    # get_similarities_to_atomic_HVs appends to this module-level frame.
    sims_df = pd.DataFrame(columns=['sentence', 'token', 'similarity'])

    with out:
        display(md(f"### <ins>{memory_type.capitalize()}</ins>"))
        for query_sentence in query_sentences:
            get_similarities_to_atomic_HVs(memory, query_sentence)

        display(sims_df.sort_values(['sentence', 'similarity'], ascending=False).set_index(['sentence', 'token']))

widgets.HBox([out1, out2])  # Display the two outputs next to each other.

HBox(children=(Output(), Output()))

### Memory state

In [9]:
for memory_type, memory in memories.items():
    display(md(f"### <ins>{memory_type.capitalize()}</ins>"))
    concepts_df = display_and_get_memory_addresses(memory)

    # Represent each concept as a single space-separated string.
    concepts_df['memory_concept_str'] = concepts_df['memory_concept'].map(" ".join)
    print(concepts_df['memory_concept_str'].values)

    # Collect the memory addresses that map to the same concept.
    addresses_by_concept = concepts_df.groupby('memory_concept_str')['memory_address'].apply(list)
    for address_list in addresses_by_concept:
        # Pairwise similarities only make sense for two or more addresses.
        if len(address_list) <= 1:
            continue

        stacked_tensor = torch.stack(address_list, dim=0)
        pairwise_similarities = torch.nn.functional.cosine_similarity(stacked_tensor.unsqueeze(1), stacked_tensor.unsqueeze(0), dim=2)
        print(pairwise_similarities)
        # Open question: why are they all identical?

### <ins>Normalized</ins>

Number of constructed addresses/abstract concepts:  18


Unnamed: 0,token,similarity
0,the,0.999998
1,green,0.014021
2,house,-0.014298
3,blue,-0.019034
4,red,-0.055088


Unnamed: 0,token,similarity
0,red,0.999997
1,green,0.010036
2,house,-0.004311
3,blue,-0.037036
4,the,-0.054271


Unnamed: 0,token,similarity
0,house,0.999998
1,green,0.006035
2,blue,0.000931
3,red,-0.005153
4,the,-0.014263


Unnamed: 0,token,similarity
0,the,0.687587
1,red,0.686456
2,green,0.017477
3,house,-0.014734
4,blue,-0.040745


Unnamed: 0,token,similarity
0,house,0.705361
1,red,0.704602
2,green,0.011366
3,blue,-0.025547
4,the,-0.049681


Unnamed: 0,token,similarity
0,house,0.580109
1,red,0.555793
2,the,0.550514
3,green,0.017789
4,blue,-0.032603


Unnamed: 0,token,similarity
0,green,0.999998
1,blue,0.044946
2,the,0.015078
3,red,0.010789
4,house,0.007621


Unnamed: 0,token,similarity
0,the,0.712554
1,green,0.711523
2,blue,0.018199
3,house,-0.006149
4,red,-0.031676


Unnamed: 0,token,similarity
0,house,0.709742
1,green,0.708706
2,blue,0.032362
3,red,0.003481
4,the,-0.000545


Unnamed: 0,token,similarity
0,green,0.587594
1,the,0.575675
2,house,0.571088
3,blue,0.015519
4,red,-0.029463


Unnamed: 0,token,similarity
0,blue,0.999999
1,green,0.045028
2,house,0.002117
3,the,-0.018267
4,red,-0.036209


Unnamed: 0,token,similarity
0,the,0.700746
1,blue,0.699967
2,green,0.042123
3,house,-0.010025
4,red,-0.065692


Unnamed: 0,token,similarity
0,house,0.707861
1,blue,0.707058
2,green,0.036047
3,the,-0.024097
4,red,-0.029704


Unnamed: 0,token,similarity
0,house,0.57546
1,blue,0.5731
2,the,0.563748
3,green,0.037953
4,red,-0.057275


Unnamed: 0,token,similarity
0,red,0.687035
1,the,0.68701
2,green,0.01747
3,house,-0.015442
4,blue,-0.040756


Unnamed: 0,token,similarity
0,house,0.704988
1,red,0.704977
2,green,0.011356
3,blue,-0.025544
4,the,-0.050518


Unnamed: 0,token,similarity
0,red,0.687023
1,the,0.687022
2,green,0.017468
3,house,-0.015848
4,blue,-0.040756


Unnamed: 0,token,similarity
0,house,0.704985
1,red,0.704979
2,green,0.01135
3,blue,-0.025536
4,the,-0.050906


['the' 'red' 'house' 'red the' 'house red' 'house red the' 'green'
 'green the' 'green house' 'green house the' 'blue' 'blue the'
 'blue house' 'blue house the' 'red the' 'house red' 'red the' 'house red']
MAPTensor([[1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000]])
MAPTensor([[1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000]])


### <ins>Unnormalized</ins>

Number of constructed addresses/abstract concepts:  14


Unnamed: 0,token,similarity
0,the,0.999995
1,green,0.014038
2,house,-0.01369
3,blue,-0.019082
4,red,-0.05382


Unnamed: 0,token,similarity
0,red,0.999994
1,green,0.010053
2,house,-0.003526
3,blue,-0.037052
4,the,-0.053465


Unnamed: 0,token,similarity
0,house,0.999995
1,green,0.006057
2,blue,0.000871
3,red,-0.003904
4,the,-0.013684


Unnamed: 0,token,similarity
0,the,0.687236
1,red,0.686807
2,green,0.017479
3,house,-0.014184
4,blue,-0.040751


Unnamed: 0,token,similarity
0,house,0.705179
1,red,0.704783
2,green,0.011374
3,blue,-0.025564
4,the,-0.049222


Unnamed: 0,token,similarity
0,house,0.579967
1,red,0.556058
2,the,0.550391
3,green,0.017789
4,blue,-0.03261


Unnamed: 0,token,similarity
0,green,0.999994
1,blue,0.044893
2,the,0.015633
3,red,0.011837
4,house,0.008195


Unnamed: 0,token,similarity
0,the,0.712759
1,green,0.711316
2,blue,0.018154
3,house,-0.005776
4,red,-0.030871


Unnamed: 0,token,similarity
0,house,0.709974
1,green,0.708473
2,blue,0.032314
3,red,0.004248
4,the,-0.00021


Unnamed: 0,token,similarity
0,green,0.587312
1,the,0.575806
2,house,0.571239
3,blue,0.015474
4,red,-0.028717


Unnamed: 0,token,similarity
0,blue,0.999996
1,green,0.045053
2,house,0.00269
3,the,-0.01773
4,red,-0.035183


Unnamed: 0,token,similarity
0,the,0.700981
1,blue,0.69973
2,green,0.042128
3,house,-0.009668
4,red,-0.064924


Unnamed: 0,token,similarity
0,house,0.708116
1,blue,0.706802
2,green,0.036052
3,the,-0.023764
4,red,-0.028937


Unnamed: 0,token,similarity
0,house,0.575625
1,blue,0.572788
2,the,0.563893
3,green,0.037952
4,red,-0.056512


['the' 'red' 'house' 'red the' 'house red' 'house red the' 'green'
 'green the' 'green house' 'green house the' 'blue' 'blue the'
 'blue house' 'blue house the']
