In [2]:
# my virtual environments are rarely properly connected to jupyter so this fixes that
import sys
import os
current_dir = os.getcwd()  # Get the current working directory
venv_dir = os.path.join(current_dir, 'venv') 
python_version = str(sys.version_info.major) + '.' + str(sys.version_info.minor)
site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')
sys.path.append(site_packages_path) 

# Importing pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

import random as r

In [3]:
v, d = 128, 256
embedder = nn.Embedding(v, d)
embedder.weight

Parameter containing:
tensor([[-0.8073,  1.1433, -0.1321,  ..., -2.6458, -0.6629, -0.1587],
        [ 0.7613,  0.9070,  0.8021,  ...,  0.4444,  0.9084,  1.0139],
        [ 0.1461, -1.1294,  0.2314,  ...,  0.1229, -1.9200,  1.8613],
        ...,
        [ 0.5094,  0.4575, -1.6483,  ...,  0.5640, -0.2304,  0.1097],
        [ 0.3923, -0.0513,  1.3227,  ..., -0.4111, -1.8965,  1.1194],
        [-0.1266,  1.0371,  0.1028,  ..., -0.8041,  1.1302,  0.3114]],
       requires_grad=True)

In [4]:
class Norm(torch.nn.Module):
    def __init__(self, embed_dim: int, norm_type: str = "cos", eps: float = 1e-6, norm_affine: bool = False, dropout_rate: float = 0.1):
        super().__init__()
        self.eps = eps
        self.affine = norm_affine
        self.dropout_rate = dropout_rate
        self.norm_type = norm_type

        # Initialize weight and bias parameters for affine transformation
        # We start with ones for weight to keep the original scale initially, and zeros for bias.
        self.w = nn.Parameter(torch.ones(embed_dim))
        self.b = nn.Parameter(torch.zeros(embed_dim))

        self.logging_enabled = False
    def enable_logging(self):
        self.logging_enabled = True
    def disable_logging(self):
        self.logging_enabled = False

    def CosineNorm(self, x):
        # normalize x by dividing by its L2 norm along the last dimension.
        # this places x on the unit hypersphere centered at the origin
        # Add a small constant to the denominator to avoid division by zero.
        return x / torch.norm(x, p=2, dim=-1, keepdim=True).clamp(min=self.eps)

    def RMSNorm(self, x):
        # normalize x by dividing by its root-mean-square along the last dimension
        # this places x on a hypersphere of radius sqrt(dimension) with no certain center
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def LayerNorm(self, x):
        # normalize x by subtracting by its mean then dividing by its variance
        # this places x on a hypersphere of radius sqrt(dimension) centered at the origin
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        return (x - mean) / torch.sqrt(var + self.eps)

    def splice_affine(self, weight, bias, d_i):
        return weight[:d_i], bias[:d_i]

    def forward(self, x: torch.Tensor, training: bool = False) -> torch.Tensor:
        # Normalize the input tensor
        if self.norm_type == "CosineNorm":
            x = self.CosineNorm(x)
        elif self.norm_type == "LayerNorm":
            x = self.LayerNorm(x)
        else: # defaults to RMSNorm bc that's the most commonly used nowadays
            x = self.RMSNorm(x)

        # Optionally apply the affine transformation with splicing
        if self.affine:
            w, b = self.splice_affine(self.w, self.b, x.shape[-1])
            x = x * w + b
            x = F.dropout(x, p=self.dropout_rate, training=training) # and dropout if we're training

        return x

In [5]:
# prompt: grab 2 random vectors from the embed and sum them

# Get two random indices within the vocabulary size
index1 = r.randint(0, v - 1)
index2 = r.randint(0, v - 1)

# Extract the corresponding vectors from the embedding layer
vector1 = embedder.weight[index1]
vector2 = embedder.weight[index2]

# create some noise
noise_sd = 0.3
noise = torch.randn(d) * noise_sd

# Sum the two vectors & add some noise
concept = vector1 + vector2 + noise


In [6]:
# prompt: Now initialize a `Norm` and use it on our summed vector

# Initialize a Norm layer with the same embedding dimension as the vectors
norm_layer = Norm(d)

# Apply the normalization to the sum vector
normalized_concept = norm_layer(concept)


In [7]:
# prompt: Initialize an nn.CosineSimilarity(dim=-1, eps=1e-6) and use it to compare normalized_concept to all the vectors in embedder

cosine_similarity = nn.CosineSimilarity(dim=-1, eps=1e-6)
similarity_scores = cosine_similarity(normalized_concept, embedder.weight)
similarity_scores

tensor([ 7.1494e-03,  1.1527e-02,  1.3576e-02,  3.2820e-02, -3.9479e-02,
        -9.5474e-02, -8.2890e-02,  1.5776e-02,  6.4283e-02, -2.1337e-02,
         5.5857e-03,  1.2009e-02,  9.4453e-02,  5.1899e-02, -2.5338e-02,
        -1.2715e-01,  8.1137e-02,  8.3458e-02, -7.1376e-03,  2.5764e-02,
         4.1137e-02,  7.3916e-02,  5.8518e-02, -9.9237e-03,  1.2152e-02,
        -2.6197e-03, -2.1903e-02,  1.9650e-02,  2.9031e-02,  8.6118e-03,
        -1.0277e-01, -7.3730e-02, -8.8745e-03, -4.1543e-02,  1.1397e-02,
         8.2652e-02, -7.3531e-02, -2.1971e-02,  7.2303e-01, -7.3195e-02,
         2.9334e-02,  4.9980e-02,  8.1096e-02,  3.1988e-02,  6.0336e-02,
         4.2632e-03,  2.0838e-02,  2.6068e-02,  3.0076e-02, -1.1351e-03,
        -3.0020e-02, -6.4855e-02,  4.4605e-02,  1.4552e-01, -3.7638e-02,
         3.0052e-02,  6.4689e-02, -4.8579e-02,  7.6461e-02,  1.1257e-01,
         3.9186e-04,  1.1170e-01, -1.0726e-01, -5.6641e-03,  9.7479e-02,
        -2.3225e-03, -6.9703e-02,  2.8242e-02,  5.7

In [8]:
# prompt: Grab the topk of those similarity scores with k=8 and print them for me to see

topk_values, topk_indices = torch.topk(similarity_scores, k=8)
print(topk_values)
print(topk_indices)


tensor([0.7230, 0.6540, 0.1558, 0.1455, 0.1126, 0.1117, 0.0976, 0.0975],
       grad_fn=<TopkBackward0>)
tensor([ 38, 124,  98,  53,  59,  61,  84,  64])


In [9]:
def create_concept_embeddings(E, indices):
    k = len(indices)
    d = E.size(1)
    X_size = (k - 1) * k // 2
    X = torch.empty((X_size, d), dtype=E.dtype)

    count = 0
    for i in range(k):
        for j in range(i + 1, k):
            X[count] = E[indices[i]] + E[indices[j]]
            count += 1

    return X

In [10]:
E_comb = create_concept_embeddings(embedder.weight, topk_indices)
E_comb.shape, E_comb

(torch.Size([28, 256]),
 tensor([[-1.4138,  3.0280,  2.0697,  ..., -0.2237, -1.1991,  1.5292],
         [-1.4550,  1.3894,  1.6838,  ..., -0.1591, -2.7218,  1.9800],
         [ 0.5843,  3.4711,  1.2170,  ..., -0.6952, -1.3263,  3.2264],
         ...,
         [-1.7019, -3.3933, -0.7097,  ...,  1.0141, -0.4978,  0.9503],
         [-4.4751, -3.0278,  0.1329,  ..., -0.7627, -0.7052,  1.9806],
         [-3.1139, -1.1019,  1.3278,  ..., -0.2597, -1.0889,  0.2060]],
        grad_fn=<CopySlices>))

In [11]:
degree2_similarity_scores = cosine_similarity(normalized_concept, E_comb)
degree2_similarity_scores[0]

tensor(0.9771, grad_fn=<SelectBackward0>)

In [12]:
def concept_matchup(c: torch.Tensor, embedding: torch.Tensor, combo: int, sample: int, greedy: bool = False, temp: float = 1.0):
    assert sample >= combo, 'you need to sample at least as many token embedding vectors as what you plan to combine'
    token_similarities = F.cosine_similarity(c, embedding, dim=-1)
    print(token_similarities.shape, token_similarities)
    topk_token_indices = torch.topk(token_similarities, k=sample).indices
    print(topk_token_indices.shape, topk_token_indices)
    concept_embeddings = create_concept_embeddings(embedding, topk_token_indices)
    print(concept_embeddings.shape, concept_embeddings)
    concept_similarities = F.cosine_similarity(c, concept_embeddings, dim=-1)
    print(concept_similarities.shape, concept_similarities)
    topk_concept_similarities, topk_concept_indices = torch.topk(concept_similarities, k=sample)
    print(topk_concept_similarities.shape, topk_concept_similarities)
    print(topk_concept_indices.shape, topk_concept_indices)
    if greedy:
        concept_idx = int(topk_concept_indices[0])
        print(concept_idx)
        return concept_embeddings[:,concept_idx]
    else:
        topk_concept_probs = F.softmax(topk_concept_similarities / temp, dim=-1)
        print(topk_concept_probs.shape, topk_concept_probs)
        concept_topk_idx = torch.multinomial(topk_concept_probs, num_samples = 1)
        print(concept_topk_idx)
        concept_idx = topk_concept_indices[concept_topk_idx]
        print(concept_idx)
        return concept_embeddings[concept_idx]


In [13]:
matched_concept = concept_matchup(concept, embedder.weight, 4, 8, greedy=False, temp=0.2)
matched_concept

torch.Size([128]) tensor([ 7.1494e-03,  1.1527e-02,  1.3576e-02,  3.2820e-02, -3.9479e-02,
        -9.5474e-02, -8.2890e-02,  1.5776e-02,  6.4283e-02, -2.1337e-02,
         5.5857e-03,  1.2009e-02,  9.4453e-02,  5.1899e-02, -2.5338e-02,
        -1.2715e-01,  8.1137e-02,  8.3458e-02, -7.1376e-03,  2.5764e-02,
         4.1137e-02,  7.3916e-02,  5.8518e-02, -9.9237e-03,  1.2152e-02,
        -2.6197e-03, -2.1903e-02,  1.9650e-02,  2.9031e-02,  8.6118e-03,
        -1.0277e-01, -7.3730e-02, -8.8745e-03, -4.1543e-02,  1.1397e-02,
         8.2652e-02, -7.3531e-02, -2.1971e-02,  7.2303e-01, -7.3195e-02,
         2.9334e-02,  4.9980e-02,  8.1096e-02,  3.1988e-02,  6.0336e-02,
         4.2632e-03,  2.0838e-02,  2.6068e-02,  3.0076e-02, -1.1350e-03,
        -3.0020e-02, -6.4855e-02,  4.4605e-02,  1.4552e-01, -3.7638e-02,
         3.0052e-02,  6.4689e-02, -4.8579e-02,  7.6461e-02,  1.1257e-01,
         3.9186e-04,  1.1170e-01, -1.0726e-01, -5.6641e-03,  9.7479e-02,
        -2.3225e-03, -6.9703e-02,

tensor([[-2.2043e+00, -1.0837e+00,  1.8537e-01,  2.1764e+00,  1.4846e+00,
          4.0453e+00,  3.8345e-02,  1.1215e+00, -1.3536e+00,  2.1843e+00,
         -9.8436e-01,  1.0102e+00,  2.1251e+00,  1.9812e+00, -6.1911e-01,
         -1.1599e+00, -5.7994e-01,  3.4874e-02,  8.1136e-01, -6.5791e-01,
         -1.3594e+00,  2.1432e+00,  2.1755e+00,  1.2292e+00, -1.8886e-03,
         -6.1713e-01,  3.9019e-01, -5.6896e-01, -1.3851e-01,  1.0843e+00,
         -2.6903e-01,  1.9566e-01, -1.9897e-01,  1.1603e+00, -4.8639e+00,
         -1.6356e-01, -3.4877e+00, -1.8800e+00,  2.3542e+00, -1.1411e+00,
         -1.1248e+00,  8.9675e-01, -1.6131e+00, -9.8432e-01, -1.4025e+00,
          7.7121e-01,  1.3408e+00, -2.4267e+00,  7.6668e-01,  1.5692e-01,
         -6.8639e-01, -1.0739e-01, -2.8405e+00, -1.4735e-01,  2.2883e+00,
         -4.3994e-02,  1.9398e-02, -1.4309e+00,  5.9651e-01,  6.7744e-01,
          4.4980e-01,  5.7838e-01, -2.4775e+00,  2.1585e+00,  6.7887e-02,
          5.7665e-01, -2.7616e-01, -3.

In [27]:
import torch
import random as r

# Assuming embedder is defined and has an attribute 'weight'
# v is the size of the vocabulary, d is the dimension of the embedding, and b is the batch size

v, d = 128, 256
embedder = nn.Embedding(v, d)
embedder.weight

b = 32  # Batch size

# Get two batches of random indices within the vocabulary size
indices1 = torch.randint(0, v, (b,))
indices2 = torch.randint(0, v, (b,))

# Extract the corresponding vectors from the embedding layer for each index in the batch
vectors1 = embedder.weight[indices1]
vectors2 = embedder.weight[indices2]

# Create some noise for each vector in the batch
noise_sd = 0.3
noise = torch.randn(b, d) * noise_sd

# Sum the two batches of vectors & add some noise to each
concept_batch = vectors1 + vectors2 + noise
concept_batch.shape, concept_batch

(torch.Size([32, 256]),
 tensor([[ 0.3397, -0.6837, -0.0673,  ...,  0.9303,  1.2032,  0.7619],
         [ 1.1240,  0.7185, -1.4923,  ...,  0.6689,  1.1480,  1.1124],
         [-2.9146,  2.2994,  0.1993,  ..., -4.3017,  1.4652,  1.2509],
         ...,
         [ 0.7449, -0.5608,  0.9285,  ..., -2.0668, -0.2718,  0.5154],
         [-1.8308,  3.0527,  1.5382,  ...,  0.1653, -1.6641,  0.5171],
         [-0.1733,  1.5868,  2.0756,  ...,  0.0222, -2.0070,  0.3852]],
        grad_fn=<AddBackward0>))

In [28]:
def concept_matchup(c: torch.Tensor, embedding: torch.Tensor, combo: int, sample: int, greedy: bool = False, temp: float = 1.0):
    assert sample >= combo, 'you need to sample at least as many token embedding vectors as what you plan to combine'
    if len(c.shape) == 2:
        embedding = embedding.unsqueeze(0)
        print(embedding.shape)
        embedding = embedding.expand(c.shape[0], -1, -1)
        print(embedding.shape)
    
    token_similarities = F.cosine_similarity(c, embedding, dim=-1)
    print(token_similarities.shape, token_similarities)
    topk_token_indices = torch.topk(token_similarities, k=sample).indices
    print(topk_token_indices.shape, topk_token_indices)
    concept_embeddings = create_concept_embeddings(embedding, topk_token_indices)
    print(concept_embeddings.shape, concept_embeddings)
    concept_similarities = F.cosine_similarity(c, concept_embeddings, dim=-1)
    print(concept_similarities.shape, concept_similarities)
    topk_concept_similarities, topk_concept_indices = torch.topk(concept_similarities, k=sample)
    print(topk_concept_similarities.shape, topk_concept_similarities)
    print(topk_concept_indices.shape, topk_concept_indices)
    if greedy:
        concept_idx = int(topk_concept_indices[0])
        print(concept_idx)
        return concept_embeddings[:,concept_idx]
    else:
        topk_concept_probs = F.softmax(topk_concept_similarities / temp, dim=-1)
        print(topk_concept_probs.shape, topk_concept_probs)
        concept_topk_idx = torch.multinomial(topk_concept_probs, num_samples = 1)
        print(concept_topk_idx)
        concept_idx = topk_concept_indices[concept_topk_idx]
        print(concept_idx)
        return concept_embeddings[concept_idx]

In [29]:
matched_concepts = concept_matchup(concept_batch, embedder.weight, 4, 8, greedy=False, temp=0.2)
matched_concepts.shape, matched_concepts

torch.Size([1, 128, 256])
torch.Size([32, 128, 256])


RuntimeError: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 1