# Hyperwave: knowledge base embedding

Let's import all the libraries needed for our knowledge base embedding.

In [1]:
import os
import numpy as np
import xarray as xr
import warnings
import torchhd
import torch

from urllib3.exceptions import NotOpenSSLWarning




Now, let's configure the runtime environment settings.

In [2]:
# I/O configuration
input_folder = '../data/ncdf'
output_folder = '../data/ncdf'

# Runtime configuration
os.makedirs(output_folder, exist_ok=True)
# Optional print formatting (left disabled):
# np.set_printoptions(suppress=True, precision=8)
# torch.set_printoptions(precision=8, sci_mode=False)

# Reproducibility: the notebook draws random tensors with torch.randn, so seed
# torch's global generator once, before anything random is created.
# NOTE(review): torchhd embedding layers presumably draw from the same global
# generator — confirm if bit-exact reruns matter.
random_seed = 0
torch.manual_seed(random_seed)

# Ignore annoying urllib3 warnings
warnings.filterwarnings("ignore", category=NotOpenSSLWarning)

# Load dataset
dataset = xr.load_dataarray(os.path.join(input_folder, "nrm.orx.dataset.nc"))

Besides the simulated signals, which are themselves hypervectors of 20'880 dimensions, we need two additional kinds of hypervectors: a random set to label central frequencies, and another one to label radius sizes.

In [None]:
# Number of labels needed along the radius and frequency coordinates
no_radii, no_frequencies = dataset.radius.size, dataset.frequency.size

# Size of the produced hypervectors, and width of the random inputs fed to
# the embedding layer (both 1024 here)
embedding_size = random_embedding_size = 1024

# VSA model name forwarded to torchhd
vsa_encoding = "BSC"

def embed(data, in_features, out_features, embed_type="random", vsa="BSC"):
    """Encode ``data`` with a freshly constructed torchhd embedding layer.

    Args:
        data: Input handed to the embedding layer when it is called.
        in_features: First positional argument of the embedding constructor.
        out_features: Second positional argument of the embedding constructor.
        embed_type: Which embedding to build: "random", "level" or "density".
        vsa: VSA model name, forwarded as the ``vsa`` keyword argument.

    Returns:
        The value returned by calling the embedding layer on ``data``.

    Raises:
        ValueError: If ``embed_type`` is not one of the supported names.

    Note:
        A new embedding layer is constructed on every call; any randomly
        initialised state is presumably not shared between calls.
    """
    embedding_classes = {
        "random": torchhd.embeddings.Random,
        "level": torchhd.embeddings.Level,
        "density": torchhd.embeddings.Density
    }
    embedding_class = embedding_classes.get(embed_type)
    if embedding_class is None:
        raise ValueError(f"Unknown embedding type: {embed_type}")
    return embedding_class(in_features, out_features, vsa=vsa)(data)

# Options shared by both sets of label hypervectors: density embedding of
# random Gaussian inputs, using the configured VSA model.
label_embedding_options = dict(
    in_features=random_embedding_size,
    out_features=embedding_size,
    embed_type="density",
    vsa=vsa_encoding,
)

# One label hypervector per central frequency
frequency_embeddings = embed(
    torch.randn(no_frequencies, random_embedding_size),
    **label_embedding_options)

# One label hypervector per radius
radius_embeddings = embed(
    torch.randn(no_radii, random_embedding_size),
    **label_embedding_options)

print(f"Frequencies embeddings: {frequency_embeddings.shape}")
print(f"Radii embeddings: {radius_embeddings.shape}")
print(frequency_embeddings)


Now we should embed our simulated signals into a suitable encoding. Since we have used a density embedding with the BSC encoding (`vsa_encoding = "BSC"`) for both the central frequency and the radius labels, we will do the same for our signals.

In [None]:
# Raw values of the dataset slice at radius=300, converted to a torch tensor.
# Despite its name, this tensor holds the inputs that are embedded later,
# not embeddings.
radius_300_slice = dataset.sel(radius=300)
testset_embeddings = torch.tensor(radius_300_slice.values)

In [None]:
# Positional arguments forwarded to embed():
#   data, in_features, out_features, embed_type, vsa
embed_arguments = (
    testset_embeddings,
    dataset.time.size,
    embedding_size,
    "density",
    vsa_encoding,
)

# NOTE(review): testset_embeddings is a plain torch tensor rather than an
# xarray object — confirm that apply_ufunc with vectorize=True behaves as
# intended for this input.
signal_embeddings = xr.apply_ufunc(embed, *embed_arguments, vectorize=True)

print(f"Signals embeddings: {np.shape(signal_embeddings)}")
print(signal_embeddings)

Now that we have all the embeddings for our dataset, we can incrementally try some experiments. For example, let's create a knowledge base hypervector binding central frequency labels with the corresponding simulated signal:

In [None]:
# Create the kbe (knowledge base embedding) from the frequency labels (keys)
# and the signal embeddings (values).
# Operations syntax:   bind := * | bundle := +
# NOTE(review): presumably hash_table pairs each key with the value at the
# same index, i.e.
#   kbe := [frequency_0] * [signal_0] + ... + [frequency_n] * [signal_n]
# — confirm against the torchhd documentation.
kbe = torchhd.hash_table(frequency_embeddings, signal_embeddings)

# Query the knowledge base with the frequency label at index 3
# (the narrative identifies this entry as 4MHz).
query_index = 3
query_embedding = torchhd.inverse(frequency_embeddings[query_index])
result = torchhd.bind(kbe, query_embedding)

# Memory cleanup: map the noisy result onto the stored signal embeddings;
# the expected match is the signal stored under the queried label.
nearest_signal = torchhd.cleanup(result, signal_embeddings)
print(nearest_signal)

torchhd.utils.plot_similarity(result, signal_embeddings)


We can see that the result is most similar to the element at position 3 (zero-based), i.e. to the signal whose central frequency is 4MHz.

In [None]:
# Estimate how often a query against the knowledge base retrieves the right
# signal embedding when the frequency labels are re-drawn at random.
embedding_size = 1024
random_embedding_size = 1024
vsa_encoding = "BSC"

# Kept small so the cell runs quickly; raise it (e.g. to 1000) for a
# finer-grained accuracy estimate.
no_simulations = 10
correct_validations = 0

# Every simulation queries with the first frequency label and expects the
# signal embedding stored at the same position.
query_index = 0
correct_result = signal_embeddings[query_index]

for simulation in range(no_simulations):
    # Fresh random frequency labels for this simulation
    frequency_embeddings = embed(
        torch.randn(no_frequencies, random_embedding_size),
        in_features=random_embedding_size,
        out_features=embedding_size,
        embed_type="density",
        vsa=vsa_encoding)
    kbe = torchhd.hash_table(frequency_embeddings, signal_embeddings)
    # Unbind the queried label from the knowledge base
    query_embedding = torchhd.inverse(frequency_embeddings[query_index])
    result = torchhd.bind(kbe, query_embedding)
    # Memory cleanup against the stored signal embeddings
    nearest_match = torchhd.cleanup(result, signal_embeddings)
    if (nearest_match == correct_result).all():
        correct_validations += 1
print(f"Query matching accuracy: {correct_validations / no_simulations * 100}%")

    

In [None]:
# Knowledge base built from purely random keys and values
no_keys = no_values = 20
embedding_size = random_embedding_size = 1024
vsa_encoding = "BSC"

# Options shared by the key and value embeddings
random_embedding_options = dict(
    in_features=random_embedding_size,
    out_features=embedding_size,
    embed_type="density",
    vsa=vsa_encoding,
)

random_keys_embeddings = embed(
    torch.randn(no_keys, random_embedding_size),
    **random_embedding_options)
random_values_embeddings = embed(
    torch.randn(no_values, random_embedding_size),
    **random_embedding_options)

kbe = torchhd.hash_table(random_keys_embeddings, random_values_embeddings)

# Query with the first key and compare the result against every stored value
first_key = random_keys_embeddings[0]
query_embedding = torchhd.inverse(first_key)
result = torchhd.bind(kbe, query_embedding)
torchhd.utils.plot_similarity(result, random_values_embeddings)

Let's create a small experiment where we embed a knowledge base using random components for both keys and values of our dictionary data structure. In this way, we can compare the baseline accuracy of a random model w.r.t. our signal density projection embedding.

In [None]:
# Baseline: retrieval accuracy when both keys and values are random
no_simulations = 10
correct_validations = 0
no_keys = no_values = 10
embedding_size = random_embedding_size = 1024
vsa_encoding = "BSC"

for simulation in range(no_simulations):
    # Draw a fresh random dictionary: keys first, then values
    random_keys_embeddings = embed(
        torch.randn(no_keys, random_embedding_size),
        in_features=random_embedding_size,
        out_features=embedding_size,
        embed_type="density",
        vsa=vsa_encoding)
    random_values_embeddings = embed(
        torch.randn(no_values, random_embedding_size),
        in_features=random_embedding_size,
        out_features=embedding_size,
        embed_type="density",
        vsa=vsa_encoding)

    # The query is always the first key; the expected answer is the value
    # stored at the same position.
    query_index = 0
    correct_result = random_values_embeddings[query_index]
    query_embedding = torchhd.inverse(random_keys_embeddings[query_index])

    kbe = torchhd.hash_table(random_keys_embeddings, random_values_embeddings)
    result = torchhd.bind(kbe, query_embedding)
    nearest_match = torchhd.cleanup(result, random_values_embeddings)

    is_exact_match = (nearest_match == correct_result).all()
    if is_exact_match:
        correct_validations += 1

print(f"Query matching accuracy: {correct_validations / no_simulations * 100}%")

The accuracy of value retrieval is very similar whether we use signal embeddings or random samples; basically, we just lose ~1% of accuracy when using signal embeddings, probably because the density encoding still leaves some residual time-dependent correlations between datapoints. Now let's try to embed the whole dataset into a single knowledge base.

In [None]:
# Label counts along the radius and frequency coordinates
no_radii, no_frequencies = dataset.radius.size, dataset.frequency.size

# Larger hypervectors for the whole-dataset knowledge base; the random
# inputs keep their previous width.
embedding_size = 32768
random_embedding_size = 1024

# VSA model name forwarded to torchhd
vsa_encoding = "BSC"

In [None]:
# Options shared by both sets of label hypervectors
label_embedding_options = dict(
    in_features=random_embedding_size,
    out_features=embedding_size,
    embed_type="density",
    vsa=vsa_encoding,
)

# One label hypervector per central frequency
frequency_embeddings = embed(
    torch.randn(no_frequencies, random_embedding_size),
    **label_embedding_options)

# One label hypervector per radius
radius_embeddings = embed(
    torch.randn(no_radii, random_embedding_size),
    **label_embedding_options)

In [None]:
# Build one knowledge base per central frequency, keyed by the radius labels.
# NOTE(review): embed() constructs a new embedding layer on every call, so
# each frequency presumably gets its own projection — confirm this is intended.
x = []
for frequency in dataset["frequency"].values:
    # Raw values of the dataset slice at this frequency
    signals_at_frequency = torch.tensor(dataset.sel(frequency=frequency).values)
    signal_embeddings_by_radius = xr.apply_ufunc(
        embed,
        signals_at_frequency,
        dataset.time.size,   # in_features
        embedding_size,      # out_features
        "density",           # embed_type
        vsa_encoding,        # vsa
        vectorize=True)
    kbe = torchhd.hash_table(radius_embeddings, signal_embeddings_by_radius)
    x.append(kbe)
print(x)

In [None]:
# Embed the values of the whole dataset in one call.
# Positional arguments forwarded to embed():
#   data, in_features, out_features, embed_type, vsa
embed_arguments = (
    torch.tensor(dataset.values),
    dataset.time.size,
    embedding_size,
    "density",
    vsa_encoding,
)

# NOTE(review): the data argument is a plain torch tensor rather than an
# xarray object — confirm that apply_ufunc with vectorize=True behaves as
# intended for this input.
signal_embeddings = xr.apply_ufunc(embed, *embed_arguments, vectorize=True)
print(f"Signals embeddings: {np.shape(signal_embeddings)}")