In [1]:
import torch
import torch.nn as nn

In [2]:
# Toy setup: a table of v embedding vectors, each of width d.
v = 4
d = 2
# Random embedding table; requires_grad=True so autograd tracks everything derived from it.
E = torch.randn(v,d, requires_grad=True)
E

tensor([[-1.5891, -0.2638],
        [ 0.1915, -1.1254],
        [-0.3462, -0.9490],
        [-0.2087,  0.7255]], requires_grad=True)

In [3]:
def triangular(n):
    """Return the n-th triangular number, i.e. 1 + 2 + ... + n.

    This is also the number of unordered pairs (i <= j) that can be drawn
    from n items when an item may be paired with itself.
    """
    doubled = n * (n + 1)
    return doubled // 2

In [4]:
# Row count of the pair table: one row per unordered pair (i <= j) of the v embeddings.
v2 = triangular(v)
print(v2)

10


In [5]:
# Zero-initialised (v2, d) buffer; the loop in the following cell fills it in place.
E2 = torch.zeros((int(v2), d), requires_grad=False)
E2

tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])

In [6]:
# Verbose walkthrough of the pair-table fill.
# j is the write offset into E2; row i of E contributes v-i rows.
# NOTE: the update uses `+=` on the existing E2, so re-running this cell without
# re-creating E2 first accumulates on top of the previous values.
j = 0
for i, vec in enumerate(E):
    print(f"i: {i}")
    print(f"j: {j}")
    print(f"vec: {vec}")
    print(f"expanded vec: {vec.expand(v-i,d).shape}\n{vec.expand(v-i,d)}")
    print(f"spliced 2nd matrix: {E[i:,:].shape}\n{E[i:,:]}")
    print(f"[{j}:{j+v-i},:]")
    # Rows j .. j+v-i-1 receive E[i] + E[m] for m = i .. v-1 (the self-pair E[i] + E[i] comes first).
    E2[j:j+v-i,:] += E[i,:].expand(v-i,d) + E[i:,:]
    print(f"E2: {E2.shape}\n{E2}")
    # Advance past the block that was just written.
    j += v-i

i: 0
j: 0
vec: tensor([-1.5891, -0.2638], grad_fn=<UnbindBackward0>)
expanded vec: torch.Size([4, 2])
tensor([[-1.5891, -0.2638],
        [-1.5891, -0.2638],
        [-1.5891, -0.2638],
        [-1.5891, -0.2638]], grad_fn=<ExpandBackward0>)
spliced 2nd matrix: torch.Size([4, 2])
tensor([[-1.5891, -0.2638],
        [ 0.1915, -1.1254],
        [-0.3462, -0.9490],
        [-0.2087,  0.7255]], grad_fn=<SliceBackward0>)
[0:4,:]
E2: torch.Size([10, 2])
tensor([[-3.1781, -0.5276],
        [-1.3976, -1.3892],
        [-1.9353, -1.2127],
        [-1.7978,  0.4617],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000]], grad_fn=<CopySlices>)
i: 1
j: 4
vec: tensor([ 0.1915, -1.1254], grad_fn=<UnbindBackward0>)
expanded vec: torch.Size([3, 2])
tensor([[ 0.1915, -1.1254],
        [ 0.1915, -1.1254],
        [ 0.1915, -1.1254]], grad_fn=<ExpandBackward0>)
spliced 2nd matrix: torch.Size(

In [7]:
def combination_embedding(E):
    """Build the table of pairwise-sum embeddings for every unordered pair of rows.

    For an input of shape (v, d) the result has v*(v+1)//2 rows. Pairs are
    enumerated by first index, then second index:
    (0,0), (0,1), ..., (0,v-1), (1,1), ..., (v-1,v-1),
    and the row for pair (i, j) holds E[i] + E[j]. Self-pairs (i == j) are
    included.

    Args:
        E: 2-D tensor of shape (v, d).

    Returns:
        Tensor of shape (v*(v+1)//2, d) with the same dtype and device as E
        (the result is no longer forced to default-dtype CPU storage).
        Gradients flow back to E when E requires grad.

    Raises:
        ValueError: if E is not 2-D (raised when unpacking E.shape).
    """
    v, _ = E.shape
    # triu_indices walks the upper triangle (i <= j) row by row, which is the
    # same order in which a per-row loop would write its blocks.
    first, second = torch.triu_indices(v, v, device=E.device)
    return E[first] + E[second]

In [8]:
# Size check on a larger table: 65 base vectors of width 4 give 65*66//2 = 2145 pair rows.
test = torch.randn(65,4)
print(test)
# Rebinds E2 to the pair table of `test`.
E2 = combination_embedding(test)
print(E2.shape[0])

tensor([[ 0.1466, -1.3016, -0.5108, -1.2226],
        [-0.6532,  0.5549,  1.5002,  1.1963],
        [ 1.8579,  0.4681, -0.5286,  0.6174],
        [-0.2927,  0.8926,  0.9477,  0.5534],
        [-1.7084, -0.5552, -1.0848, -0.7154],
        [-0.1111, -0.8939,  0.8378, -1.3440],
        [ 0.1130, -0.8787,  0.3059,  0.1857],
        [-0.1648,  0.7757,  0.7516, -0.4718],
        [ 0.3098,  2.3650,  0.7328, -1.1582],
        [ 0.2621,  0.4907,  0.7943, -1.1838],
        [ 0.3898,  0.0448,  1.4809,  2.1894],
        [-1.1570, -0.5844,  1.6015, -1.2931],
        [-1.3966, -0.2929, -2.1745,  0.3359],
        [-0.7169,  1.2680, -0.0851, -0.9281],
        [ 0.6049, -1.2090, -0.1902,  1.7755],
        [ 0.2276, -0.4579,  0.6435, -0.9937],
        [ 0.4440,  0.4491, -0.7782, -0.4786],
        [ 1.3719,  0.0576, -0.2437, -0.9158],
        [ 0.6693,  0.2217, -1.1978, -2.0264],
        [-0.5351,  0.1084,  1.2621,  2.3545],
        [-1.6203, -0.4430,  0.4694,  2.3911],
        [-0.1002, -0.9977, -0.3544

In [9]:
# Apply the construction a second time to see how fast the table grows:
# 2145 rows -> 2145*2146//2 = 2,301,585 rows.
E3 = combination_embedding(E2)
print(E3.shape[0])

2301585


I'm thinking maybe I can start off with some absurdly huge supertoken setup made from bytes, and then use a method to trim it down to the more interesting token combinations, like in that one paper I just read, [Tokenization Is More Than Compression](https://arxiv.org/pdf/2402.18376.pdf), to make the size actually manageable.

OK, let's try a different idea I had recently that kind of brings us back to NCP. Let's say that at the bottom level we have some list of byte, token, or whatever embedding vectors. In a higher-level model, let's use a mechanism where we grab the top-k results from multiplying by the embedding matrix, take those top-k embedding vectors and add & RMSNorm them together, temporarily concatenate those new options to the end of our embedding matrix, and then multiply by the extended embedding matrix again to see if any of our new results are better.

In [2]:
from torch.nn import functional as F
# Second experiment: v base embeddings of width d.
v = 8
d = 4
# NOTE(review): normalized_shape=(v,d) makes layer_norm compute one mean/variance over the
# whole matrix, not one per embedding row. A per-vector norm would use normalized_shape=(d,).
# Confirm which is intended.
E = F.layer_norm(torch.randn(v,d), normalized_shape=(v,d), weight=None, bias=None) # LayerNorm used as a stand-in for RMSNorm; considered close enough for this experiment
E

tensor([[-1.3421,  0.4649, -0.0076, -0.7923],
        [ 0.1688,  1.9326,  0.1201, -0.2860],
        [ 0.8903,  0.4975, -1.1466,  0.3244],
        [-0.1451,  0.9796, -1.3276,  2.2594],
        [ 0.9164,  0.4881,  1.0235, -0.0794],
        [-0.7313, -1.4907, -0.9651, -1.4982],
        [ 0.2095, -1.0771,  1.6429, -0.5998],
        [-1.0176, -0.6422,  0.0439,  1.1870]])

In [3]:
# One normalized random vector of shape (1, 1, d) to score against the embedding table.
# The two leading dims are size 1, so normalizing over (1,1,d) is the same as normalizing over d.
x = F.layer_norm(torch.randn(1,1,d), normalized_shape=(1,1,d), weight=None, bias=None)
x

tensor([[[ 0.4521, -1.2818,  1.3657, -0.5361]]])

In [4]:
# Dot-product score of x against every embedding row: (1, 1, d) @ (d, v) -> (1, 1, v).
z = x @ E.t()
z

tensor([[[-0.7883, -2.0835, -1.9749, -4.3456,  1.2290,  1.0652,  4.0406,
          -0.2133]]])

In [6]:
# Indices (along the last dim of z) of the k highest-scoring embeddings.
k = 3
top = torch.topk(z, k).indices
top

tensor([[[6, 4, 5]]])

In [7]:
def triangular(n):
    """Return the n-th triangular number: the sum 1 + 2 + ... + n."""
    successor = n + 1
    return n * successor // 2

In [8]:
# Temporary extended table: v rows for the base embeddings, plus triangular(k-1) == k*(k-1)//2
# rows, one per unordered pair of distinct top-k entries.
E_temp = torch.zeros((v+triangular(k-1), d))
E_temp

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [9]:
# Put the base embeddings in the first v rows.
# NOTE: this is an in-place add onto zeros, so re-running the cell without re-creating
# E_temp adds E a second time.
E_temp[:v,:] += E
E_temp

tensor([[-1.3421,  0.4649, -0.0076, -0.7923],
        [ 0.1688,  1.9326,  0.1201, -0.2860],
        [ 0.8903,  0.4975, -1.1466,  0.3244],
        [-0.1451,  0.9796, -1.3276,  2.2594],
        [ 0.9164,  0.4881,  1.0235, -0.0794],
        [-0.7313, -1.4907, -0.9651, -1.4982],
        [ 0.2095, -1.0771,  1.6429, -0.5998],
        [-1.0176, -0.6422,  0.0439,  1.1870],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000]])

In [13]:
def create_tensor_x_no_self_addition(E, indices):
    """Sum every unordered pair of distinct selected rows of E.

    With k = len(indices), the result has k*(k-1)//2 rows. Pairs are taken by
    position in `indices`, ordered by first position then second:
    (0,1), (0,2), ..., (0,k-1), (1,2), ..., (k-2,k-1),
    and the row for positions (a, b) holds E[indices[a]] + E[indices[b]].
    A row is never paired with itself.

    Args:
        E: 2-D tensor of shape (v, d).
        indices: 1-D sequence or tensor of integer row indices into E.

    Returns:
        Tensor of shape (k*(k-1)//2, d) with the same dtype and device as E
        (the result now follows E's device instead of the default device).
        It is empty when k < 2.
    """
    k = len(indices)
    # dtype is given explicitly so that an empty Python list still yields integer indices.
    selected = torch.as_tensor(indices, dtype=torch.long, device=E.device)
    # offset=1 skips the diagonal, leaving only position pairs with a < b, row by row.
    left, right = torch.triu_indices(k, k, offset=1, device=E.device)
    return E[selected[left]] + E[selected[right]]

# Pairwise sums of the top-k embeddings, without self-pairs.
# The two squeezes drop the leading singleton dims of `top`, so a 1-D index tensor is passed.
E_comb = create_tensor_x_no_self_addition(E, top.squeeze(0).squeeze(0))
E_comb

tensor([[ 1.1260, -0.5890,  2.6663, -0.6793],
        [-0.5217, -2.5678,  0.6778, -2.0981],
        [ 0.1852, -1.0026,  0.0583, -1.5776]])

In [14]:
# Put the combination vectors in the rows after the v base embeddings (in-place add onto zeros).
E_temp[v:,:] += E_comb
E_temp

tensor([[-1.3421,  0.4649, -0.0076, -0.7923],
        [ 0.1688,  1.9326,  0.1201, -0.2860],
        [ 0.8903,  0.4975, -1.1466,  0.3244],
        [-0.1451,  0.9796, -1.3276,  2.2594],
        [ 0.9164,  0.4881,  1.0235, -0.0794],
        [-0.7313, -1.4907, -0.9651, -1.4982],
        [ 0.2095, -1.0771,  1.6429, -0.5998],
        [-1.0176, -0.6422,  0.0439,  1.1870],
        [ 1.1260, -0.5890,  2.6663, -0.6793],
        [-0.5217, -2.5678,  0.6778, -2.0981],
        [ 0.1852, -1.0026,  0.0583, -1.5776]])

In [15]:
# Re-score x against the extended table: the first v entries are the base scores,
# the remaining entries are the scores of the combination vectors.
z_ = x @ E_temp.t()
z_

tensor([[[-0.7883, -2.0835, -1.9749, -4.3456,  1.2290,  1.0652,  4.0406,
          -0.2133,  5.2696,  5.1058,  2.2942]]])

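Yoooo, this is cool: we now have a way of checking against combination vectors without having to keep some absurdly large tensor in memory. (Caveat: the combinations above are plain sums with no RMSNorm applied, so each new score is exactly the sum of its two parts' scores, e.g. 4.0406 + 1.2290 = 5.2696; normalizing the combined vectors is still to do.)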

So the question is: do we only ever keep the lowest level in memory, so that our combinations only ever come from that base level? Or do we let new vectors that actually get used be added to that level's matrix, so that it can grow dynamically over time in a kind of byte-pair-tokenization style? I don't know yet.