# Associative Memory Network Dataset Generator

This Python notebook generates a series of `.tsv` files that contain training data for the numerical embedding network.

- `full-dataset.tsv`: addition and multiplication tables for 0–9
- `train-dataset-1.tsv`: addition table for 1
- `train-dataset-2.tsv`: addition table for 2
- `train-dataset.tsv` and `test-dataset.tsv`: addition and multiplication tables for 0–9, randomly separated into an 80/20 training/test split

This notebook requires embeddings from the embedding network to be included as `embeddings.csv`.

In [None]:
# Load the embedding table from embeddings.csv. Each line is split on commas,
# the first column is dropped, and the remaining values are re-joined with
# tabs so an entry can be spliced straight into a TSV row. Entries are looked
# up by list position (embeddings[n]) when the dataset rows are built.
# NOTE: the text is split on "\n", so a trailing newline in the file yields a
# final empty entry.
with open("embeddings.csv", 'r') as file:
  embeddings = [
      "\t".join(line.split(",")[1:])
      for line in file.read().split("\n")
  ]

def number_to_binary_array(number, size):
    """Return the binary digits of ``number`` as a tab-separated string.

    The binary representation is left-padded with zeros up to ``size``
    digits. A number that needs more than ``size`` digits is not truncated,
    so the result can contain more than ``size`` fields.

    Args:
        number: Non-negative integer to encode.
        size: Minimum number of binary digits in the result.

    Returns:
        The digits ("0"/"1"), most significant first, joined by tabs.

    Raises:
        ValueError: If ``number`` is not an ``int`` or is negative.
    """
    if not (isinstance(number, int) and number >= 0):
        raise ValueError("Input must be a non-negative integer.")

    # The "b" format yields the binary digits without a '0b' prefix.
    digits = format(number, "b").zfill(size)

    # Joining a string iterates over its characters, one field per digit.
    return "\t".join(digits)


def remove_last_tab(input_string):
    """Return ``input_string`` with its last tab character removed.

    Only the right-most tab is dropped; every other character, including
    earlier tabs, is kept. A string without any tab is returned unchanged.
    """
    before, tab, after = input_string.rpartition('\t')

    # rpartition reports an empty separator when no tab was found.
    if not tab:
        return input_string

    return before + after

In [None]:
import random

def create_dataset(a_value=None, split=False, train_ratio=0.8):
    """Build the dataset text: one ``_H:`` header line followed by ``_D:`` rows.

    Each row holds the problem name (e.g. ``3+4``), the embeddings of both
    operands, a 30-digit marker field, and the embedding of the result, all
    separated by tabs. Relies on the module-level ``embeddings`` list and on
    ``number_to_binary_array``.

    Args:
        a_value: When not ``None``, only the addition rows ``a_value+b`` for
            ``b`` in 0-9 are produced. Otherwise the full 0-9 addition table
            is produced, followed by the full 0-9 multiplication table.
        split: When true, shuffle the rows and return separate training and
            test texts instead of a single text.
        train_ratio: Fraction of the shuffled rows placed in the training
            text; only used when ``split`` is true.

    Returns:
        A single string, or a ``(train_text, test_text)`` tuple when
        ``split`` is true. Every returned text starts with the header line.
    """

    def build_row(left, right, symbol, result):
        # NOTE(review): the third input field is the same for "+" and "*"
        # rows (number_to_binary_array(1, 30)), so a+b and a*b share identical
        # inputs -- confirm whether a distinct operation marker was intended.
        fields = [
            "_D:",
            f"{left}{symbol}{right}",
            # Input
            embeddings[left],
            embeddings[right],
            number_to_binary_array(1, 30),
            # Output
            embeddings[result],
        ]
        return "\t".join(fields)

    # 3 * 1 * 3 * 10 = 90 input columns; the first one carries the
    # "<4:3,1,3,10>" suffix (presumably the dimension declaration).
    input_columns = [
        f"%Input[4:{i},{j},{k},{l}]"
        for i in range(3)
        for j in range(1)
        for k in range(3)
        for l in range(10)
    ]
    input_columns[0] += "<4:3,1,3,10>"

    # 3 * 10 = 30 output columns; the first one carries the "<2:3,10>" suffix.
    output_columns = [
        f"%Output[2:{i},{j}]"
        for i in range(3)
        for j in range(10)
    ]
    output_columns[0] += "<2:3,10>"

    header = "\t".join(["_H:", "$Name"] + input_columns + output_columns)

    if a_value is not None:
        # A fixed left operand means only its addition table is generated.
        rows = [build_row(a_value, b, "+", a_value + b) for b in range(10)]
    else:
        # Full addition table first, then the full multiplication table.
        rows = [
            build_row(a, b, "+", a + b)
            for a in range(10)
            for b in range(10)
        ]
        rows += [
            build_row(a, b, "*", a * b)
            for a in range(10)
            for b in range(10)
        ]

    if not split:
        return header + "\n" + "\n".join(rows)

    # Shuffle in place, then cut the rows at the requested ratio.
    random.shuffle(rows)
    cut = int(len(rows) * train_ratio)
    train_text = header + "\n" + "\n".join(rows[:cut])
    test_text = header + "\n" + "\n".join(rows[cut:])
    return train_text, test_text

In [None]:
import os

# Write the unsplit dataset files. Each file is opened before its content is
# generated, in the order listed here.
for table_name, table_kwargs in (
    ("full-dataset.tsv", {}),
    ("train-dataset-1.tsv", {"a_value": 1}),
    ("train-dataset-2.tsv", {"a_value": 2}),
):
  with open(table_name, "w") as file:
    file.write(create_dataset(**table_kwargs))

# A single shuffled split supplies both the training and the test file, so
# the two files never share a row.
train, test = create_dataset(split=True)

for split_name, split_text in (
    ("train-dataset.tsv", train),
    ("test-dataset.tsv", test),
):
  with open(split_name, "w") as file:
    file.write(split_text)