In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
import re

class Word2Vec(nn.Module):
    """Skip-gram Word2Vec model trained with a negative-sampling loss.

    Two embedding tables are kept: ``in_embed`` is looked up for center
    words and ``out_embed`` for context and negative-sample words.

    Note:
        ``model_type`` is stored on the instance, but ``forward`` does not
        branch on it; only the skip-gram negative-sampling objective is
        implemented in this class.
    """

    def __init__(self, vocab_size, embedding_dim, model_type='skipgram'):
        """
        Args:
            vocab_size: number of rows in each embedding table
            embedding_dim: size of each embedding vector
            model_type: architecture label, stored as ``self.model_type``
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.model_type = model_type

        # Input embeddings (center word)
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        # Output embeddings (context words)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

        # Small symmetric uniform init, scaled by the embedding size
        bound = 0.5 / embedding_dim
        nn.init.uniform_(self.in_embed.weight, -bound, bound)
        nn.init.uniform_(self.out_embed.weight, -bound, bound)

    def forward(self, center, context, neg_samples):
        """Compute the mean negative-sampling loss for a batch.

        Args:
            center: center word indices [batch_size]
            context: context word indices [batch_size]
            neg_samples: negative sample indices [batch_size, num_neg_samples]

        Returns:
            Scalar tensor: the batch mean of
            ``-(log sigmoid(u_ctx . v_c) + sum_k log sigmoid(-u_neg_k . v_c))``.
        """
        # Get embeddings
        center_embed = self.in_embed(center)  # [batch_size, embed_dim]
        context_embed = self.out_embed(context)  # [batch_size, embed_dim]
        neg_embed = self.out_embed(neg_samples)  # [batch_size, num_neg, embed_dim]

        # Raw dot-product scores
        pos_logits = torch.sum(center_embed * context_embed, dim=1)  # [batch_size]
        neg_logits = torch.bmm(
            neg_embed, center_embed.unsqueeze(2)
        ).squeeze(2)  # [batch_size, num_neg]

        # logsigmoid is evaluated in a numerically stable way, so strongly
        # negative logits keep both their true value and a usable gradient
        # (log(sigmoid(x) + eps) saturates near log(eps) with zero gradient).
        pos_term = nn.functional.logsigmoid(pos_logits)  # [batch_size]
        neg_term = nn.functional.logsigmoid(-neg_logits).sum(dim=1)  # [batch_size]

        # Negative sampling loss
        return -(pos_term + neg_term).mean()


class Word2VecTrainer:
    """Build a vocabulary from raw sentences and train a ``Word2Vec`` model.

    Sentences are lower-cased and split on whitespace. Words seen fewer than
    ``min_count`` times are dropped. Training pairs are (center, context)
    skip-gram pairs taken from a symmetric window, and negatives are read
    sequentially from a table pre-sampled from the unigram distribution
    raised to the 3/4 power.

    Note:
        ``model_type`` is stored and forwarded to the model constructor;
        the pair generation in this class always produces skip-gram pairs.
    """

    def __init__(self, sentences, embedding_dim=100, window_size=5,
                 min_count=5, num_neg_samples=5, model_type='skipgram',
                 neg_table_size=1000000):
        """
        Args:
            sentences: iterable of strings, one sentence each
            embedding_dim: size of the embedding vectors
            window_size: max distance between center and context word
            min_count: minimum corpus frequency for a word to be kept
            num_neg_samples: negatives drawn per training pair
            model_type: architecture label passed through to the model
            neg_table_size: number of entries in the pre-sampled negative table

        Raises:
            ValueError: if no word reaches ``min_count``.
        """
        self.sentences = sentences
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.min_count = min_count
        self.num_neg_samples = num_neg_samples
        self.model_type = model_type
        self.neg_table_size = neg_table_size

        # Build vocabulary
        self.build_vocab()

        # Initialize model
        self.model = Word2Vec(len(self.word2idx), embedding_dim, model_type)

        # Prepare negative sampling table
        self.prepare_neg_sampling()

    @staticmethod
    def _tokenize(sentence):
        """Lower-case a sentence and split it on whitespace."""
        return sentence.lower().split()

    def build_vocab(self):
        """Build vocabulary, index mappings and frequency array from sentences.

        Sets ``vocab``, ``word2idx``, ``idx2word`` and ``word_freq``.

        Raises:
            ValueError: if no word reaches ``min_count``.
        """
        word_counts = Counter()
        for sentence in self.sentences:
            word_counts.update(self._tokenize(sentence))

        # Filter by min_count
        self.vocab = [word for word, count in word_counts.items()
                      if count >= self.min_count]
        if not self.vocab:
            # Fail early with a clear message instead of deep inside sampling
            raise ValueError(
                f"Vocabulary is empty: no word occurs at least "
                f"{self.min_count} times"
            )

        # Create mappings
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = dict(enumerate(self.vocab))

        # Store word frequencies for negative sampling
        self.word_freq = np.array([word_counts[word] for word in self.vocab])

        print(f"Vocabulary size: {len(self.vocab)}")

    def prepare_neg_sampling(self):
        """Prepare negative sampling distribution (raised to 3/4 power).

        Sets ``neg_sampling_probs`` and pre-samples ``neg_sampling_table``
        with ``neg_table_size`` entries; ``neg_sample_idx`` is the read
        cursor into that table.
        """
        freq = self.word_freq ** 0.75
        self.neg_sampling_probs = freq / freq.sum()
        self.neg_sampling_table = np.random.choice(
            len(self.vocab),
            size=self.neg_table_size,
            p=self.neg_sampling_probs
        )
        self.neg_sample_idx = 0

    def get_negative_samples(self, batch_size):
        """Read the next ``batch_size * num_neg_samples`` table entries.

        Entries are consumed in order, wrapping around at the end of the
        table, and the cursor is advanced. Samples are not filtered against
        the positive words of the batch.

        Returns:
            LongTensor of shape [batch_size, num_neg_samples].
        """
        table = self.neg_sampling_table
        count = batch_size * self.num_neg_samples

        # Modular positions reproduce the sequential wrap-around read
        positions = (self.neg_sample_idx + np.arange(count)) % len(table)
        self.neg_sample_idx = (self.neg_sample_idx + count) % len(table)

        samples = table[positions].reshape(batch_size, self.num_neg_samples)
        return torch.from_numpy(samples.astype(np.int64))

    def generate_training_data(self):
        """Generate (center, context) index pairs for Skip-gram.

        Out-of-vocabulary words are removed before windowing, so the window
        spans the remaining in-vocabulary words.

        Returns:
            List of ``(center_idx, context_idx)`` tuples.
        """
        pairs = []

        for sentence in self.sentences:
            indices = [self.word2idx[w] for w in self._tokenize(sentence)
                       if w in self.word2idx]

            for i, center in enumerate(indices):
                # Get context words within window
                start = max(0, i - self.window_size)
                end = min(len(indices), i + self.window_size + 1)
                pairs.extend(
                    (center, indices[j]) for j in range(start, end) if j != i
                )

        return pairs

    def train(self, epochs=5, batch_size=128, lr=0.025):
        """Train the Word2Vec model with Adam, printing the mean batch loss.

        Args:
            epochs: number of passes over the training pairs
            batch_size: number of pairs per optimizer step
            lr: Adam learning rate

        Raises:
            ValueError: if the corpus yields no training pairs.
        """
        optimizer = optim.Adam(self.model.parameters(), lr=lr)

        # Generate training data
        print("Generating training data...")
        training_pairs = self.generate_training_data()
        print(f"Training pairs: {len(training_pairs)}")
        if not training_pairs:
            raise ValueError("No training pairs could be generated from the corpus")

        # Column arrays let each batch be gathered with one fancy-index
        pair_array = np.asarray(training_pairs, dtype=np.int64)
        center_ids, context_ids = pair_array[:, 0], pair_array[:, 1]
        num_pairs = len(pair_array)

        self.model.train()

        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            order = np.random.permutation(num_pairs)

            for start in range(0, num_pairs, batch_size):
                picked = order[start:start + batch_size]

                centers = torch.from_numpy(center_ids[picked])
                contexts = torch.from_numpy(context_ids[picked])
                neg_samples = self.get_negative_samples(len(picked))

                optimizer.zero_grad()
                loss = self.model(centers, contexts, neg_samples)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                num_batches += 1

            # Average over the batches actually run (the last may be partial)
            avg_loss = total_loss / num_batches
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    def get_word_vector(self, word):
        """Get the input-embedding vector for a word.

        Returns:
            NumPy array for the word, or ``None`` if it is not in the vocabulary.
        """
        if word not in self.word2idx:
            return None
        idx = self.word2idx[word]
        return self.model.in_embed.weight[idx].detach().cpu().numpy()

    def most_similar(self, word, top_k=5):
        """Find most similar words using cosine similarity.

        Args:
            word: query word
            top_k: maximum number of neighbours to return

        Returns:
            List of ``(word, similarity)`` tuples sorted by decreasing
            similarity, never containing the query word itself; empty if the
            query word is not in the vocabulary.
        """
        if word not in self.word2idx:
            return []
        query_idx = self.word2idx[word]

        # Unit-normalize every embedding; the clamp avoids dividing by zero
        all_vecs = self.model.in_embed.weight.detach().cpu().numpy()
        norms = np.linalg.norm(all_vecs, axis=1, keepdims=True)
        unit_vecs = all_vecs / np.maximum(norms, 1e-12)

        similarities = np.dot(unit_vecs, unit_vecs[query_idx])

        # Exclude the query by index rather than assuming it ranks first:
        # a tied vector could otherwise push the query word into the results.
        ranked = np.argsort(-similarities, kind="stable")
        top_indices = [int(i) for i in ranked if i != query_idx][:top_k]

        return [(self.idx2word[i], similarities[i]) for i in top_indices]


# Demo: train on a tiny repeated corpus and print nearest neighbours
if __name__ == "__main__":
    # Ten short sentences, repeated 100 times for more training data
    corpus = 100 * [
        "the quick brown fox jumps over the lazy dog",
        "the dog is lazy and sleeps all day",
        "the fox is quick and clever",
        "a quick brown dog jumps high",
        "the lazy cat sleeps on the mat",
        "quick movements scare the lazy animals",
        "brown and white dogs play together",
        "the clever fox outsmarts the dog",
        "lazy afternoon with sleeping animals",
        "quick thinking saves the day",
    ]

    # Build the vocabulary, the model and the negative-sampling table
    print("Initializing Word2Vec trainer...")
    trainer = Word2VecTrainer(
        corpus,
        embedding_dim=50,
        window_size=3,
        min_count=1,
        num_neg_samples=5,
    )

    # Run the optimisation loop
    print("\nTraining Word2Vec model...")
    trainer.train(epochs=10, batch_size=32, lr=0.01)

    # Report the three nearest neighbours of a few probe words
    banner = "=" * 50
    print("\n" + banner)
    print("Testing word similarities:")
    print(banner)

    test_words = ['quick', 'lazy', 'dog', 'fox']
    for query in test_words:
        print(f"\nMost similar to '{query}':")
        for neighbour, cosine in trainer.most_similar(query, top_k=3):
            print(f"  {neighbour}: {cosine:.4f}")

Initializing Word2Vec trainer...
Vocabulary size: 32

Training Word2Vec model...
Generating training data...
Training pairs: 26400
Epoch 1/10, Loss: 2.0622
Epoch 2/10, Loss: 1.8783
Epoch 3/10, Loss: 1.8707
Epoch 4/10, Loss: 1.8637
Epoch 5/10, Loss: 1.8579
Epoch 6/10, Loss: 1.8637
Epoch 7/10, Loss: 1.8564
Epoch 8/10, Loss: 1.8502
Epoch 9/10, Loss: 1.8514
Epoch 10/10, Loss: 1.8531

Testing word similarities:

Most similar to 'quick':
  outsmarts: 0.6379
  high: 0.5505
  over: 0.4867

Most similar to 'lazy':
  mat: 0.5903
  cat: 0.4333
  on: 0.4141

Most similar to 'dog':
  clever: 0.6050
  jumps: 0.5332
  fox: 0.4984

Most similar to 'fox':
  clever: 0.5291
  a: 0.5200
  dog: 0.4984
