In [None]:
import nltk
# Fetch the 'punkt' package into the local nltk_data directory; the call
# returns True and does nothing further when the package is already current.
# NOTE(review): presumably required by nltk's word_tokenize used later in this
# notebook; newer NLTK releases may expect 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Word2Vec Implementation
# Import necessary libraries

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from nltk.tokenize import word_tokenize

In [None]:
# Define the Word2Vec model class

class Word2Vec(nn.Module):
    """Two embedding tables: one for target words, one for context words.

    Attributes:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Length of every embedding vector.
        in_embed: Table indexed by target-word indices.
        out_embed: Table indexed by context-word indices.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create both ``vocab_size x embedding_dim`` embedding tables."""
        super().__init__()
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        # Creation order fixes both the parameter registration order and the
        # order in which the random initial weights are drawn.
        self.in_embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.out_embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, target_word, context_word):
        """Look up the embeddings for a target/context index pair.

        Args:
            target_word: Index tensor looked up in ``in_embed``.
            context_word: Index tensor looked up in ``out_embed``.

        Returns:
            Tuple ``(target_embed, context_embed)``.
        """
        return self.in_embed(target_word), self.out_embed(context_word)

In [None]:
class Vocabulary:
    """Bidirectional mapping between unique tokens and integer indices.

    Indices are assigned in order of first appearance in ``words``, so the
    same token sequence always produces the same mapping.

    Attributes:
        vocab: List of unique tokens; position in the list is the index.
        stoi: Dict mapping token -> index.
        itos: Dict mapping index -> token.
    """

    def __init__(self, words):
        """Build the vocabulary from an iterable of hashable tokens."""
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # A plain set() orders string tokens differently on every interpreter
        # run (hash randomisation), which makes the row indices of any saved
        # embedding matrix impossible to map back to words.
        self.vocab = list(dict.fromkeys(words))

        self.stoi = {word: idx for idx, word in enumerate(self.vocab)}
        self.itos = dict(enumerate(self.vocab))

    def __len__(self):
        """Return the number of unique tokens."""
        return len(self.stoi)

In [None]:
# Define the training function

def train_word2vec(corpus, window_size, embedding_dim, num_epochs, learning_rate):
    """Train a skip-gram style Word2Vec model on a raw text corpus.

    Each non-terminator token is paired with every non-terminator token that
    lies within ``window_size // 2`` positions on either side of it. For each
    pair the target's input embedding is scored against every word's output
    embedding, and a softmax cross-entropy loss pushes the observed context
    word to receive the highest score.

    Args:
        corpus: Raw text; tokenised with ``word_tokenize``.
        window_size: Total window width; ``window_size // 2`` neighbours are
            taken on each side of the target.
        embedding_dim: Length of each embedding vector.
        num_epochs: Number of passes over the training pairs.
        learning_rate: Learning rate passed to AdamW.

    Returns:
        Tuple ``(vocabulary, model)``.

    Raises:
        ValueError: If the corpus yields no (target, context) pairs, since
            there is nothing to train on and the average loss is undefined.

    Note:
        Only the terminator tokens themselves are skipped, so with a window
        wider than 3 a pair can still span a sentence boundary.
    """
    # Preprocess the corpus and build the vocabulary
    tokens = word_tokenize(corpus)
    v = Vocabulary(tokens)

    # Sentence terminators are never used as a target or as a context word.
    terminators = {'.', '!'}
    half_window = window_size // 2

    # Create the target-context word pairs
    training_pairs = []
    for t, target in enumerate(tokens):
        if target in terminators:
            continue

        first = max(t - half_window, 0)
        last = min(t + half_window + 1, len(tokens))
        for c in range(first, last):
            if c == t or tokens[c] in terminators:
                continue

            context = tokens[c]
            training_pairs.append((torch.tensor(v.stoi[target]), torch.tensor(v.stoi[context])))

    if not training_pairs:
        raise ValueError("corpus produced no (target, context) training pairs")

    # Initialize the Word2Vec model
    model = Word2Vec(len(v), embedding_dim)

    # Define the loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0.0
        for target_word, context_word in training_pairs:
            # Zero the gradients
            optimizer.zero_grad()
            # Forward pass: only the target's input embedding is needed here.
            target_embed, _ = model(target_word, context_word)
            # One logit per vocabulary entry: dot product of the target
            # embedding with every output embedding -> shape (vocab_size,).
            logits = torch.matmul(model.out_embed.weight, target_embed)
            # The context word's *index* is the class label. Feeding the
            # context embedding itself as the target would make the loss treat
            # arbitrary floats as class probabilities, which is unbounded
            # below and diverges to large negative values.
            loss = loss_fn(logits.unsqueeze(0), context_word.unsqueeze(0))
            # Backward pass
            loss.backward()
            # Update the model parameters
            optimizer.step()
            # Accumulate the loss
            total_loss += loss.item()

        # Print the average loss for the epoch
        print(f"Epoch {epoch+1} Loss: {total_loss/len(training_pairs):.3f}")

    # Return the trained Word2Vec model and vocab
    return (v, model)

In [None]:
def k_most_similar(goal, embeddings, stoi, k):
    """Return the ``k`` words most cosine-similar to ``goal``.

    Args:
        goal: Query word; must be a key of ``stoi``.
        embeddings: Indexable collection where ``embeddings[i]`` is the
            vector for index ``i``.
        stoi: Mapping from word to its index in ``embeddings``.
        k: Maximum number of neighbours to return.

    Returns:
        List of ``(word, similarity)`` tuples sorted by descending similarity,
        excluding ``goal`` itself. A pair involving a zero-length vector is
        scored ``0.0``.

    Raises:
        KeyError: If ``goal`` is not in ``stoi``.
    """
    goal_embed = embeddings[stoi[goal]]
    # The query norm does not change between candidates, so compute it once.
    goal_norm = np.linalg.norm(goal_embed, 2)

    scores = []
    for (w, idx) in stoi.items():
        if w == goal:
            continue

        candidate = embeddings[idx]
        denom = goal_norm * np.linalg.norm(candidate, 2)
        if denom == 0:
            # Cosine similarity is undefined for a zero vector; 0/0 would
            # yield NaN, and NaN keys make the sort order unpredictable.
            s = 0.0
        else:
            s = np.dot(goal_embed, candidate) / denom
        scores.append((w, s))

    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores[:k]

In [None]:
# Define the main function

def main():
    """Train a tiny Word2Vec model on a toy corpus, print results, save weights.

    Prints the per-epoch loss, the three nearest neighbours of a couple of
    query words, and every learned embedding, then writes the model's
    ``state_dict`` to ``./word2vec``.
    """
    # Set hyperparameters
    corpus = "I love to learn deep learning. It is fascinating!"
    window_size = 3
    embedding_dim = 10

    LR = 1e-2
    EPOCHS = 50
    SEED = 0

    # Embedding weights are randomly initialised; seeding makes the printed
    # losses, neighbours and saved weights repeatable across runs.
    torch.manual_seed(SEED)

    # Train the Word2Vec model
    vocab, model = train_word2vec(corpus, window_size, embedding_dim, EPOCHS, LR)
    embeddings = model.in_embed.weight.detach().numpy()

    # Inspect the trained model through cosine-similarity nearest neighbours
    for query in ("learn", "deep"):
        neighbours = k_most_similar(query, embeddings, vocab.stoi, 3)
        print(f"3 most similar words to '{query}':", neighbours)

    print()
    # Print the learned word embeddings
    for (w, idx) in vocab.stoi.items():
        print(w, embeddings[idx])

    # Save the trained model
    torch.save(model.state_dict(), './word2vec')

# Run the demo only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Epoch 1 Loss: -3.915
Epoch 2 Loss: -5.597
Epoch 3 Loss: -7.172
Epoch 4 Loss: -8.753
Epoch 5 Loss: -10.352
Epoch 6 Loss: -11.975
Epoch 7 Loss: -13.629
Epoch 8 Loss: -15.318
Epoch 9 Loss: -17.049
Epoch 10 Loss: -18.828
Epoch 11 Loss: -20.661
Epoch 12 Loss: -22.554
Epoch 13 Loss: -24.515
Epoch 14 Loss: -26.549
Epoch 15 Loss: -28.662
Epoch 16 Loss: -30.861
Epoch 17 Loss: -33.151
Epoch 18 Loss: -35.539
Epoch 19 Loss: -38.028
Epoch 20 Loss: -40.626
Epoch 21 Loss: -43.336
Epoch 22 Loss: -46.163
Epoch 23 Loss: -49.114
Epoch 24 Loss: -52.191
Epoch 25 Loss: -55.401
Epoch 26 Loss: -58.745
Epoch 27 Loss: -62.228
Epoch 28 Loss: -65.853
Epoch 29 Loss: -69.621
Epoch 30 Loss: -73.536
Epoch 31 Loss: -77.597
Epoch 32 Loss: -81.806
Epoch 33 Loss: -86.164
Epoch 34 Loss: -90.671
Epoch 35 Loss: -95.328
Epoch 36 Loss: -100.134
Epoch 37 Loss: -105.089
Epoch 38 Loss: -110.194
Epoch 39 Loss: -115.448
Epoch 40 Loss: -120.851
Epoch 41 Loss: -126.403
Epoch 42 Loss: -132.104
Epoch 43 Loss: -137.955
Epoch 44 Loss: -