In [15]:
# =========================
# GloVe Embedding Model
# =========================

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from itertools import product

# 1. Hyperparameters
embedding_dim = 10  # length of every word / context vector
x_max = 100  # cutoff in weighting function
alpha = 0.75  # exponent in weighting function
epochs = 25  # number of full-batch optimisation steps in the training loop
lr = 0.01  # learning rate handed to the Adam optimizer

# The embedding tables are randomly (Xavier) initialised, so without a fixed
# seed every "Restart & Run All" yields different losses and neighbours.
seed = 42
torch.manual_seed(seed);  # trailing ';' hides the returned Generator repr in the cell output


In [16]:
# 2. Toy corpus: 20 short, whitespace-tokenised sentences
corpus = [
    "the cat sat down",
    "the cat ate food",
    "the dog sat down",
    "the dog ate food",
    "a cat chased a mouse",
    "the dog chased the cat",
    "a dog barked loudly",
    "the cat meowed softly",
    "the bird sang sweetly",
    "a bird flew away",
    "the fish swam fast",
    "a fish jumped high",
    "the boy played ball",
    "the girl sang song",
    "a boy read book",
    "a girl wrote letter",
    "the sun shines bright",
    "the moon glows softly",
    "the stars twinkle bright",
    "a cat slept quietly",
]

# 3. Vocabulary: unique tokens in alphabetical order, plus lookups both ways
tokens = sorted({word for sentence in corpus for word in sentence.split()})
word2idx = {word: index for index, word in enumerate(tokens)}
idx2word = dict(enumerate(tokens))
V = len(tokens)  # vocabulary size


In [17]:

# 4. Co-occurrence Matrix
def build_cooccurrence_matrix(corpus, vocab_size, word2idx, window_size=2):
    """Build a dense, distance-weighted word/context co-occurrence matrix.

    Args:
        corpus: iterable of sentences; each one is split on whitespace.
        vocab_size: number of rows and columns of the returned matrix.
        word2idx: mapping from word to its row/column index.
        window_size: how many positions to the left and to the right of a
            word count as its context (never crossing sentence boundaries).

    Returns:
        A ``(vocab_size, vocab_size)`` float32 array in which entry
        ``[t, c]`` is the sum of ``1 / distance`` over every occurrence of
        word ``c`` inside the window around word ``t``.

    Raises:
        KeyError: if a word of the corpus is missing from ``word2idx``.
    """
    counts = np.zeros((vocab_size, vocab_size), dtype=np.float32)
    for sentence in corpus:
        ids = [word2idx[token] for token in sentence.split()]
        length = len(ids)
        for center, row in enumerate(ids):
            first = max(0, center - window_size)
            stop = min(length, center + window_size + 1)
            for pos in range(first, stop):
                if pos == center:
                    continue  # a word is not its own context
                # Nearer neighbours contribute more: weight is 1 / distance.
                counts[row, ids[pos]] += 1.0 / abs(center - pos)
    return counts

# Dense V x V matrix of distance-weighted counts (default window of 2 words).
cooccurrence_matrix = build_cooccurrence_matrix(
    corpus=corpus,
    vocab_size=V,
    word2idx=word2idx,
)


In [18]:

# 5. GloVe Model
class GloVe(nn.Module):
    """GloVe model: word/context embeddings and biases with a weighted least-squares loss.

    Args:
        vocab_size: number of distinct words (rows of every embedding table).
        embedding_dim: length of each word and context vector.
        x_max: co-occurrence value at which the weighting function saturates
            at 1. Must be positive.
        alpha: exponent of the weighting function.

    ``x_max`` and ``alpha`` are constructor arguments (instead of being read
    from module-level globals) so the class works outside this notebook.
    Their defaults, 100 and 0.75, are the values used in this notebook; pass
    them explicitly to train with different settings.

    Raises:
        ValueError: if ``x_max`` is not positive.
    """

    def __init__(self, vocab_size, embedding_dim, x_max=100, alpha=0.75):
        super().__init__()
        if x_max <= 0:
            raise ValueError(f"x_max must be positive, got {x_max}")
        self.x_max = x_max
        self.alpha = alpha
        self.w_embed = nn.Embedding(vocab_size, embedding_dim)  # Word embeddings
        self.c_embed = nn.Embedding(vocab_size, embedding_dim)  # Context embeddings
        self.w_bias = nn.Embedding(vocab_size, 1)  # Word biases
        self.c_bias = nn.Embedding(vocab_size, 1)  # Context biases
        self.init_weights()

    def init_weights(self):
        """Xavier-uniform initialise both embedding tables and zero both biases."""
        nn.init.xavier_uniform_(self.w_embed.weight)
        nn.init.xavier_uniform_(self.c_embed.weight)
        nn.init.zeros_(self.w_bias.weight)
        nn.init.zeros_(self.c_bias.weight)

    def forward(self, i, j, cooccurrence):
        """Return the mean weighted squared error over a batch of (word, context) pairs.

        Args:
            i: LongTensor ``[B]`` of word indices.
            j: LongTensor ``[B]`` of context indices.
            cooccurrence: float tensor ``[B]`` with the co-occurrence value of
                each ``(i, j)`` pair.

        Returns:
            A 0-d tensor: the mean over the batch of
            ``f(X_ij) * (w_i . c_j + b_i + b_j - log(X_ij + 1e-10)) ** 2``
            where ``f(x) = min(x / x_max, 1) ** alpha``.
        """
        w_i = self.w_embed(i)  # [B, D]
        w_j = self.c_embed(j)  # [B, D]
        # squeeze(-1) drops only the trailing size-1 axis, so a batch of one
        # stays [1] instead of collapsing to a 0-d tensor.
        b_i = self.w_bias(i).squeeze(-1)  # [B]
        b_j = self.c_bias(j).squeeze(-1)  # [B]
        x_ij = (w_i * w_j).sum(dim=1) + b_i + b_j  # [B]
        weights = torch.clamp(cooccurrence / self.x_max, max=1.0) ** self.alpha  # [B]
        # The small epsilon keeps log() finite when a co-occurrence is zero.
        loss = weights * (x_ij - torch.log(cooccurrence + 1e-10)) ** 2  # [B]
        return torch.mean(loss)

    def get_embeddings(self, word2idx, idx2word):
        """Return a dict mapping each word to its final embedding (NumPy array).

        The final embedding of a word is the sum of its word vector and its
        context vector.

        Args:
            word2idx: unused; kept so existing callers keep working.
            idx2word: mapping from index ``0 .. len(idx2word) - 1`` to word.
        """
        # detach() is the supported way to read parameter values without autograd.
        word_vectors = self.w_embed.weight.detach().cpu().numpy()
        context_vectors = self.c_embed.weight.detach().cpu().numpy()
        combined_embeddings = word_vectors + context_vectors
        return {idx2word[k]: combined_embeddings[k] for k in range(len(idx2word))}
    
    

model = GloVe(vocab_size=V, embedding_dim=embedding_dim)
# Report the model size: every element of every learnable tensor counts once.
total_params = sum(tensor.numel() for tensor in model.parameters())
print(f"Total number of parameters in the GloVe model: {total_params}")


Total number of parameters in the GloVe model: 924


In [19]:

# 6. Training Loop
optimizer = optim.Adam(model.parameters(), lr=lr)

# Keep only the non-zero cells of the co-occurrence matrix, as three parallel
# tensors: row (word) index, column (context) index and the cell value.
nonzero_rows, nonzero_cols = np.nonzero(cooccurrence_matrix)
cooccurrences = torch.tensor(
    cooccurrence_matrix[nonzero_rows, nonzero_cols], dtype=torch.float
)
i_indices = torch.tensor(nonzero_rows, dtype=torch.long)
j_indices = torch.tensor(nonzero_cols, dtype=torch.long)

# Full-batch training: each epoch is a single optimizer step over all pairs.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    loss = model(i_indices, j_indices, cooccurrences)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/25, Loss: 0.0085
Epoch 2/25, Loss: 0.0079
Epoch 3/25, Loss: 0.0073
Epoch 4/25, Loss: 0.0068
Epoch 5/25, Loss: 0.0063
Epoch 6/25, Loss: 0.0058
Epoch 7/25, Loss: 0.0054
Epoch 8/25, Loss: 0.0050
Epoch 9/25, Loss: 0.0046
Epoch 10/25, Loss: 0.0043
Epoch 11/25, Loss: 0.0039
Epoch 12/25, Loss: 0.0036
Epoch 13/25, Loss: 0.0033
Epoch 14/25, Loss: 0.0030
Epoch 15/25, Loss: 0.0028
Epoch 16/25, Loss: 0.0025
Epoch 17/25, Loss: 0.0022
Epoch 18/25, Loss: 0.0020
Epoch 19/25, Loss: 0.0018
Epoch 20/25, Loss: 0.0016
Epoch 21/25, Loss: 0.0014
Epoch 22/25, Loss: 0.0012
Epoch 23/25, Loss: 0.0010
Epoch 24/25, Loss: 0.0009
Epoch 25/25, Loss: 0.0007


In [20]:
# Word -> vector dictionary taken from the trained model
embeddings = model.get_embeddings(word2idx=word2idx, idx2word=idx2word)

# Example: Find top similar words for "cat"
from sklearn.metrics.pairwise import cosine_similarity

def find_top_similar_words(word, embeddings, top_n=5):
    """Print the ``top_n`` words whose embeddings are most cosine-similar to ``word``.

    Args:
        word: query word.
        embeddings: dict mapping each word to a 1-D NumPy vector.
        top_n: maximum number of neighbours to list.

    Returns:
        None. The result (or a "not in vocabulary" notice) is printed.

    The query word is excluded by identity rather than by assuming it is
    ranked first, so a word with an identical or parallel vector can never
    push the query into its own neighbour list. Ties keep the order of
    ``embeddings``.
    """
    if word not in embeddings:
        print(f"Word '{word}' not in vocabulary.")
        return

    query = np.asarray(embeddings[word], dtype=float).ravel()
    candidates = [w for w in embeddings if w != word]

    similar_words = []
    if candidates and top_n > 0:
        matrix = np.stack(
            [np.asarray(embeddings[w], dtype=float).ravel() for w in candidates]
        )
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
        norms[norms == 0] = 1.0  # zero vectors get similarity 0 instead of NaN
        similarities = (matrix @ query) / norms
        # A stable sort on the negated scores gives a deterministic descending order.
        ranked = np.argsort(-similarities, kind="stable")[:top_n]
        similar_words = [candidates[k] for k in ranked]

    print(f"Top {top_n} words similar to '{word}': {', '.join(similar_words)}")

# Print the nearest neighbours of a handful of vocabulary words
example_words = [
    "cat",
    "dog",
    "bird",
    "fish",
    "boy",
]
for word in example_words:
    find_top_similar_words(word, embeddings, top_n=5)

Top 5 words similar to 'cat': the, away, a, dog, high
Top 5 words similar to 'dog': the, away, letter, cat, jumped
Top 5 words similar to 'bird': softly, played, quietly, sun, meowed
Top 5 words similar to 'fish': book, wrote, barked, slept, jumped
Top 5 words similar to 'boy': sun, meowed, shines, stars, read
