**Assignment 3**

These steps will be added later


**SVD**

In [1]:
"""
svd.py
------

Builds a co-occurrence matrix from the preprocessed Brown Corpus and applies SVD
to generate static word embeddings. This script:

1. Preprocesses the Brown corpus with:
   - Lowercasing, stopword removal, lemmatization, rare word removal.
   - Padding short sentences and chunking long ones.
2. Builds the vocabulary from the top-K most frequent words (default: 10,000).
3. Counts co-occurrences within a symmetric context window.
4. Runs SVD on CPU (to avoid memory issues on GPU).

Saves the resulting embeddings to 'svd.pt'.

Usage:
    python svd.py
"""

import torch
import nltk
from nltk.corpus import brown, stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict

def downloadNltkResources():
    """
    Fetches the NLTK data packages this script relies on.

    Requests 'brown', 'stopwords', and 'wordnet', in that order, through
    nltk.download.
    """
    for packageName in ('brown', 'stopwords', 'wordnet'):
        nltk.download(packageName)

def padSentence(sentenceTokens, targetLength, padToken="<PAD>"):
    """
    Centers a tokenized sentence inside a run of pad tokens.

    Pad tokens are split between the two ends; when the number of pads is odd
    the extra one goes on the right. Sentences that already have at least
    `targetLength` tokens come back as an unpadded copy.

    Args:
        sentenceTokens (list of str): The tokenized sentence.
        targetLength (int): Desired length after padding.
        padToken (str): Token used for padding.

    Returns:
        list of str: A new list holding the padded sentence.
    """
    missing = targetLength - len(sentenceTokens)
    if missing < 0:
        missing = 0

    # Floor half goes in front; the remainder (0 or 1) is added to the back.
    front, leftover = divmod(missing, 2)
    prefix = [padToken] * front
    suffix = [padToken] * (front + leftover)
    return prefix + sentenceTokens + suffix

def chunkSentence(sentenceTokens, chunkSize, padToken="<PAD>"):
    """
    Cuts a sentence into every contiguous window of `chunkSize` tokens.

    Consecutive windows are shifted by one token, so they overlap. A sentence
    with at most `chunkSize` tokens is not cut; it is padded to `chunkSize`
    and returned as the only element.

    Args:
        sentenceTokens (list of str): Tokenized sentence.
        chunkSize (int): Desired chunk size.
        padToken (str): Token used for padding.

    Returns:
        list of list of str: The windows, or a single padded sentence.
    """
    tokenCount = len(sentenceTokens)
    if tokenCount > chunkSize:
        windows = []
        lastStart = tokenCount - chunkSize
        for begin in range(lastStart + 1):
            windows.append(sentenceTokens[begin:begin + chunkSize])
        return windows

    # Short (or exactly fitting) sentence: delegate to the padding helper.
    return [padSentence(sentenceTokens, chunkSize, padToken)]

def preprocessBrownCorpus(minFreq=5, windowSize=5, maxChunkSize=11):
    """
    Turns the Brown corpus into cleaned, length-normalised token lists.

    Processing order:
      1. Download the required NLTK resources.
      2. Keep alphabetic, non-stopword tokens; lowercase and lemmatize them.
      3. Pad sentences shorter than 2 * windowSize + 1 tokens, and split
         sentences longer than maxChunkSize into overlapping chunks.
      4. Drop words seen fewer than minFreq times (the pad token is kept).

    Frequencies in step 4 are counted after padding/chunking, i.e. over the
    sentences produced by step 3.

    Args:
        minFreq (int): Minimum frequency for a word to remain in the dataset.
        windowSize (int): Context window size; fixes the padding target length.
        maxChunkSize (int): Maximum size of sentence chunks.

    Returns:
        list of list of str: The preprocessed tokenized sentences.
    """
    downloadNltkResources()
    padToken = "<PAD>"
    englishStopWords = set(stopwords.words('english'))
    wordnetLemmatizer = WordNetLemmatizer()

    # One target word plus windowSize neighbours on each side.
    fullWindow = 2 * windowSize + 1

    shapedSentences = []
    for brownSentence in brown.sents():
        lemmas = []
        for rawWord in brownSentence:
            if not rawWord.isalpha():
                continue
            lowered = rawWord.lower()
            if lowered in englishStopWords:
                continue
            lemmas.append(wordnetLemmatizer.lemmatize(lowered))

        if not lemmas:
            continue

        tokenCount = len(lemmas)
        if tokenCount < fullWindow:
            # Too short for a full window: pad up to the window length.
            shapedSentences.append(padSentence(lemmas, fullWindow, padToken))
        elif tokenCount > maxChunkSize:
            # Too long: replace by its overlapping chunks.
            shapedSentences.extend(chunkSentence(lemmas, maxChunkSize, padToken))
        else:
            shapedSentences.append(lemmas)

    # Count every token of the shaped sentences (pad tokens included).
    frequencies = Counter()
    for sentence in shapedSentences:
        frequencies.update(sentence)

    keptWords = set()
    for word, count in frequencies.items():
        if word == padToken or count >= minFreq:
            keptWords.add(word)

    return [
        [word for word in sentence if word in keptWords]
        for sentence in shapedSentences
    ]

class SVDModel:
    """
    Builds a word co-occurrence matrix and derives SVD-based embeddings from it.

    Intended call order: buildVocabulary -> buildCooccurrenceMatrix -> fitSVD.
    """
    def __init__(self, windowSize=5, embeddingDim=100, maxVocabSize=10000):
        """
        Initializes the SVDModel.

        Args:
            windowSize (int): Number of words to consider on each side of a target word.
            embeddingDim (int): Embedding dimension for the final embeddings.
            maxVocabSize (int): Maximum vocabulary size (top-K by frequency).
        """
        self.windowSize = windowSize
        self.embeddingDim = embeddingDim
        self.maxVocabSize = maxVocabSize
        self.wordToIdx = {}
        self.idxToWord = {}
        self.cooccurMatrix = None
        self.vocabSize = 0

    def buildVocabulary(self, sentences):
        """
        Creates a truncated vocabulary from the preprocessed sentences.

        The maxVocabSize most frequent words are kept and indexed in
        alphabetical order.

        Args:
            sentences (list of list of str): Preprocessed sentences.
        """
        wordCounter = Counter(word for sent in sentences for word in sent)
        mostCommon = wordCounter.most_common(self.maxVocabSize)
        vocab = sorted(word for word, _ in mostCommon)

        self.wordToIdx = {word: idx for idx, word in enumerate(vocab)}
        self.idxToWord = dict(enumerate(vocab))
        self.vocabSize = len(vocab)
        print(f"Vocabulary size: {self.vocabSize}")

    def buildCooccurrenceMatrix(self, sentences):
        """
        Builds a co-occurrence matrix using a symmetric context window.

        Entry [t, c] counts how often vocabulary word c appears within
        windowSize positions of vocabulary word t. Out-of-vocabulary tokens
        are not counted but still occupy a position in the window.

        Args:
            sentences (list of list of str): Preprocessed sentences.
        """
        cooccurDict = defaultdict(float)

        for sent in sentences:
            length = len(sent)
            for i, targetWord in enumerate(sent):
                if targetWord not in self.wordToIdx:
                    continue
                targetIdx = self.wordToIdx[targetWord]
                start = max(0, i - self.windowSize)
                end = min(length, i + self.windowSize + 1)
                for j in range(start, end):
                    if j == i:
                        continue
                    contextIdx = self.wordToIdx.get(sent[j])
                    if contextIdx is not None:
                        cooccurDict[(targetIdx, contextIdx)] += 1.0

        self.cooccurMatrix = torch.zeros(self.vocabSize, self.vocabSize)
        if cooccurDict:
            # Write all counts with one indexed assignment instead of one
            # scalar tensor write per pair; dict keys are unique, so no index
            # is assigned twice.
            rows, cols = zip(*cooccurDict.keys())
            values = torch.tensor(
                list(cooccurDict.values()), dtype=self.cooccurMatrix.dtype
            )
            self.cooccurMatrix[torch.tensor(rows), torch.tensor(cols)] = values

        print("Co-occurrence matrix built.")

    def fitSVD(self):
        """
        Performs SVD on the co-occurrence matrix and returns the resulting embeddings.

        The embedding of a word is its row of U restricted to the top
        embeddingDim singular directions, with each column scaled by the
        square root of the corresponding singular value.

        Returns:
            Tensor: The computed word embeddings of shape [vocabSize, embeddingDim]
            (fewer columns if the vocabulary is smaller than embeddingDim).

        Raises:
            RuntimeError: If buildCooccurrenceMatrix has not been called yet.
        """
        if self.cooccurMatrix is None:
            raise RuntimeError(
                "buildCooccurrenceMatrix must be called before fitSVD."
            )

        print("Performing SVD on CPU...")
        # torch.svd is deprecated; torch.linalg.svd with full_matrices=False
        # is its reduced-SVD replacement.
        U, S, _ = torch.linalg.svd(self.cooccurMatrix.cpu(), full_matrices=False)

        # Scale the kept columns directly rather than materialising a full
        # diag(sqrt(S)) matrix and slicing it.
        k = self.embeddingDim
        embeddings = U[:, :k] * torch.sqrt(S[:k])
        print("SVD complete.")
        return embeddings

def main():
    """
    Runs the SVD embedding pipeline end to end and writes 'svd.pt'.

    Steps: preprocess the Brown corpus, build the vocabulary and the
    co-occurrence matrix, factorise with SVD, then save the embeddings
    together with both word/index mappings.
    """
    print("Preprocessing Brown corpus...")
    corpus = preprocessBrownCorpus(minFreq=5, windowSize=5, maxChunkSize=11)

    model = SVDModel(windowSize=5, embeddingDim=100, maxVocabSize=10000)

    print("Building vocabulary...")
    model.buildVocabulary(corpus)

    print("Building co-occurrence matrix...")
    model.buildCooccurrenceMatrix(corpus)

    print("Fitting SVD...")
    vectors = model.fitSVD()

    # Store the lookup tables next to the vectors so rows can be decoded later.
    payload = dict(
        embeddings=vectors,
        wordToIdx=model.wordToIdx,
        idxToWord=model.idxToWord,
    )
    torch.save(payload, 'svd.pt')
    print("Saved embeddings to 'svd.pt'.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Preprocessing Brown corpus...


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Building vocabulary...
Vocabulary size: 10000
Building co-occurrence matrix...
Co-occurrence matrix built.
Fitting SVD...
Performing SVD on CPU...
SVD complete.
Saved embeddings to 'svd.pt'.


**CBOW**

In [2]:
"""
cbow.py
-------

Implements the Continuous Bag-of-Words (CBOW) model with Negative Sampling using PyTorch.
This script:

1. Preprocesses the Brown corpus by:
   - Lowercasing, stopword removal, lemmatization, rare word removal, etc.
2. Builds a vocabulary (top-K words by frequency).
3. Trains the CBOW model and saves the learned embeddings to 'cbow.pt'.

Usage:
    python cbow.py
"""

import nltk
import torch
from torch import nn
from nltk.corpus import brown, stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# ------------------------ Preprocessing Helpers ------------------------ #

def downloadNltkResources():
    """
    Requests the NLTK packages needed for preprocessing.

    The packages 'brown', 'stopwords', and 'wordnet' are passed to
    nltk.download one after another.
    """
    requiredPackages = ['brown', 'stopwords', 'wordnet']
    for package in requiredPackages:
        nltk.download(package)

def filterAndLemmatizeSentence(rawSentence, stopWords, lemmatizer):
    """
    Cleans one sentence: drops noise tokens, lowercases, and lemmatizes.

    A token is kept only if it is purely alphabetic and its lowercase form is
    not a stopword. Kept tokens are lowercased before being lemmatized.

    Args:
        rawSentence (list of str): Original sentence tokens from Brown corpus.
        stopWords (set of str): Set of stopwords to remove.
        lemmatizer (WordNetLemmatizer): NLTK lemmatizer instance.

    Returns:
        list of str: The processed tokens, in their original order.
    """
    kept = []
    for token in rawSentence:
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stopWords:
            continue
        kept.append(lemmatizer.lemmatize(lowered))
    return kept

def preprocessBrownCorpus(
    maxChunkSize=50,
    minFreq=5,
    topVocab=10000
):
    """
    Preprocesses the Brown corpus by:
      1. Downloading necessary NLTK resources.
      2. Filtering and lemmatizing tokens (stopwords removed).
      3. Splitting sentences longer than maxChunkSize into consecutive,
         non-overlapping blocks of at most maxChunkSize tokens.
      4. Removing words below a minimum frequency threshold.
      5. Limiting the vocabulary to top 'topVocab' words.

    Args:
        maxChunkSize (int): Maximum number of tokens per sentence block.
        minFreq (int): Minimum frequency for a word to remain in the dataset.
        topVocab (int): Maximum vocabulary size (top-K words).

    Returns:
        list of list of str: The preprocessed tokenized sentences (no empty ones).
        Counter: Word -> frequency after the minFreq filter. It is computed
            before the top-K cut, so it may hold words absent from the
            returned sentences.
    """
    downloadNltkResources()
    stopWords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    processedSentences = []

    # Step 1: Read the Brown corpus and preprocess each sentence
    for rawSentence in brown.sents():
        tokens = filterAndLemmatizeSentence(rawSentence, stopWords, lemmatizer)
        if not tokens:
            continue
        if len(tokens) > maxChunkSize:
            # Split overly long sentences into fixed-size blocks
            for i in range(0, len(tokens), maxChunkSize):
                processedSentences.append(tokens[i : i + maxChunkSize])
        else:
            processedSentences.append(tokens)

    # Step 2: Remove words below minFreq
    fullCounter = Counter(word for sent in processedSentences for word in sent)
    filteredWords = {w for w, c in fullCounter.items() if c >= minFreq}

    filteredSentences = [
        [w for w in sent if w in filteredWords]
        for sent in processedSentences
    ]

    # Step 3: Pick the top vocabulary by frequency. A set is used because the
    # vocabulary is only needed for membership tests below; testing against a
    # list would scan up to topVocab entries for every token in the corpus.
    freqCounter = Counter(word for sent in filteredSentences for word in sent)
    vocabSet = {w for w, _ in freqCounter.most_common(topVocab)}

    # Step 4: Re-filter sentences to keep only words in the final top vocab
    finalSentences = [
        [w for w in sent if w in vocabSet]
        for sent in filteredSentences
    ]

    # Remove empty sentences
    finalSentences = [sent for sent in finalSentences if sent]

    return finalSentences, freqCounter

# ------------------------ CBOW Dataset & Model ------------------------ #

class CBOWDataset(torch.utils.data.Dataset):
    """
    CBOW training samples: a fixed-size list of context indices per target index.

    Only positions with a full window on both sides produce a sample, so every
    context list has exactly 2 * windowSize entries.
    """
    def __init__(self, sentences, wordToIdx, windowSize=2):
        """
        Builds all (context, target) samples up front.

        Args:
            sentences (list of list of str): Tokenized, preprocessed sentences.
            wordToIdx (dict): Mapping from words to indices.
            windowSize (int): Size of the context window on each side.
        """
        super().__init__()
        self.windowSize = windowSize
        self.wordToIdx = wordToIdx
        self.data = []

        shortestUsable = 2 * windowSize + 1
        for sentence in sentences:
            # Translate to indices once, skipping out-of-vocabulary words.
            ids = [wordToIdx[token] for token in sentence if token in wordToIdx]
            if len(ids) < shortestUsable:
                continue

            for center in range(windowSize, len(ids) - windowSize):
                neighbours = [
                    ids[pos]
                    for pos in range(center - windowSize, center + windowSize + 1)
                    if pos != center
                ]
                self.data.append((neighbours, ids[center]))

    def __len__(self):
        """Returns the number of (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Returns sample `idx` as a (context, target) pair of long tensors."""
        contextIds, targetId = self.data[idx]
        return (
            torch.tensor(contextIds, dtype=torch.long),
            torch.tensor(targetId, dtype=torch.long),
        )

class CBOWModel(nn.Module):
    """
    CBOW model with negative sampling.

    Holds two embedding tables: `inEmbedding` for context words and
    `outEmbedding` for target / negative words.
    """
    def __init__(self, vocabSize, embeddingDim):
        """
        Initializes the CBOW model.

        Args:
            vocabSize (int): Number of words in the vocabulary.
            embeddingDim (int): Dimensionality of the embeddings.
        """
        super().__init__()
        self.vocabSize = vocabSize
        self.embeddingDim = embeddingDim

        # Embeddings for input (context) and output (target)
        self.inEmbedding = nn.Embedding(vocabSize, embeddingDim)
        self.outEmbedding = nn.Embedding(vocabSize, embeddingDim)

        # Initialize both tables uniformly in [-0.5/D, 0.5/D]
        initRange = 0.5 / embeddingDim
        self.inEmbedding.weight.data.uniform_(-initRange, initRange)
        self.outEmbedding.weight.data.uniform_(-initRange, initRange)

    def forward(self, contextIndices, targetIndices, negativeIndices):
        """
        Forward pass computes the loss using negative sampling.

        The averaged context embedding is scored against the true target
        (pushed towards sigmoid = 1) and against each negative sample
        (pushed towards sigmoid = 0).

        Args:
            contextIndices (Tensor): [batchSize, contextWindow] context word indices.
            targetIndices (Tensor): [batchSize] target word indices.
            negativeIndices (Tensor): [batchSize, numNegSamples] negative sample indices.

        Returns:
            loss (Tensor): Scalar mean loss over the batch.
        """
        # [B, contextWindow, D] -> mean over the window -> [B, D]
        contextEmbeds = self.inEmbedding(contextIndices).mean(dim=1)

        # Positive scores: dot(context, target) -> [B]
        targetEmbeds = self.outEmbedding(targetIndices)
        positiveScores = torch.sum(contextEmbeds * targetEmbeds, dim=1)

        # Negative scores: [B, K, D] x [B, D, 1] -> [B, K]
        negEmbeds = self.outEmbedding(negativeIndices)
        negativeScores = torch.bmm(negEmbeds, contextEmbeds.unsqueeze(2)).squeeze(2)

        # logsigmoid is evaluated stably for large |score|, whereas
        # log(sigmoid(x) + eps) caps the loss near -log(eps) and loses its
        # gradient once sigmoid underflows.
        positiveLoss = -nn.functional.logsigmoid(positiveScores)
        negativeLoss = -nn.functional.logsigmoid(-negativeScores).sum(dim=1)
        return torch.mean(positiveLoss + negativeLoss)

def generateNegativeSamples(batchSize, numNegSamples, vocabSize, wordFreqs):
    """
    Draws negative-sample word indices from a frequency distribution.

    All indices are sampled in one call (with replacement) and then arranged
    as one row per batch element.

    Args:
        batchSize (int): Number of samples in the batch.
        numNegSamples (int): Number of negative samples per instance.
        vocabSize (int): Total number of words in the vocabulary (not used
            by the sampling itself).
        wordFreqs (Tensor): Word frequency distribution for sampling.

    Returns:
        Tensor: Negative sample indices with shape [batchSize, numNegSamples].
    """
    totalDraws = batchSize * numNegSamples
    flatSamples = torch.multinomial(wordFreqs, totalDraws, replacement=True)
    return flatSamples.view(batchSize, numNegSamples)

# ----------------------------- Main Function ----------------------------- #

def main():
    """
    Trains CBOW embeddings on the Brown corpus and writes them to 'cbow.pt'.

    Pipeline: preprocess the corpus, index the vocabulary alphabetically,
    build the negative-sampling distribution (counts raised to 0.75, then
    normalised), train with Adam for a fixed number of epochs, and save both
    embedding tables together with the word/index mappings.
    """
    print("Preprocessing Brown corpus for CBOW...")
    # Preprocessing hyperparameters
    maxChunkSize, minFreq, topVocab = 50, 5, 10000

    # The second return value (pre-cut frequencies) is not needed here.
    corpus, _ = preprocessBrownCorpus(
        maxChunkSize=maxChunkSize,
        minFreq=minFreq,
        topVocab=topVocab,
    )

    # Count words over the final sentences and index them alphabetically
    vocabCounter = Counter()
    for sentence in corpus:
        vocabCounter.update(sentence)
    vocabList = sorted(word for word, _ in vocabCounter.most_common(topVocab))
    vocabSize = len(vocabList)
    wordToIdx = dict(zip(vocabList, range(vocabSize)))
    idxToWord = dict(enumerate(vocabList))
    print(f"Vocabulary size after preprocessing: {vocabSize}")

    # Negative-sampling distribution: count^0.75, normalised to sum to 1
    smoothedCounts = torch.tensor(
        [vocabCounter[word] for word in vocabList], dtype=torch.float
    ).pow(0.75)
    wordFreqs = smoothedCounts / smoothedCounts.sum()

    # Dataset and batched loader
    windowSize = 2
    trainingSet = CBOWDataset(corpus, wordToIdx, windowSize=windowSize)
    loader = torch.utils.data.DataLoader(
        trainingSet,
        batch_size=64,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )

    # Model, placed on the GPU when one is available
    embeddingDim = 100
    model = CBOWModel(vocabSize, embeddingDim)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on device: {device}")
    model.to(device)

    # Optimizer and training hyperparameters
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    epochs = 5
    numNegSamples = 5
    print("Starting CBOW training...")

    for epoch in range(epochs):
        runningLoss = 0.0
        for contextBatch, targetBatch in loader:
            contextBatch = contextBatch.to(device)
            targetBatch = targetBatch.to(device)

            # Fresh negatives for every batch, sized to the actual batch
            # (the last batch may be smaller than 64).
            negatives = generateNegativeSamples(
                batchSize=contextBatch.size(0),
                numNegSamples=numNegSamples,
                vocabSize=vocabSize,
                wordFreqs=wordFreqs,
            ).to(device)

            batchLoss = model(contextBatch, targetBatch, negatives)
            optimizer.zero_grad()
            batchLoss.backward()
            optimizer.step()
            runningLoss += batchLoss.item()

        avgLoss = runningLoss / len(loader)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avgLoss:.4f}")

    # Save both embedding tables (moved to CPU) plus the lookup tables
    checkpoint = {
        'inEmbedding': model.inEmbedding.weight.data.cpu(),
        'outEmbedding': model.outEmbedding.weight.data.cpu(),
        'wordToIdx': wordToIdx,
        'idxToWord': idxToWord,
    }
    torch.save(checkpoint, 'cbow.pt')
    print("CBOW embeddings saved to 'cbow.pt'.")

# Start CBOW training only when this file is run directly, not when imported.
if __name__ == "__main__":
    main()


Preprocessing Brown corpus for CBOW...


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Vocabulary size after preprocessing: 10000
Training on device: cuda
Starting CBOW training...
Epoch 1/5, Average Loss: 2.8530
Epoch 2/5, Average Loss: 2.5681
Epoch 3/5, Average Loss: 2.3535
Epoch 4/5, Average Loss: 2.1866
Epoch 5/5, Average Loss: 2.0362
CBOW embeddings saved to 'cbow.pt'.


**SkipGram**

In [3]:
"""
skipgram.py
-----------

Implements the Skip-Gram model with Negative Sampling using PyTorch. This script:

1. Preprocesses the Brown corpus by:
   - Lowercasing, stopword removal, lemmatization, rare word removal, etc.
2. Builds a vocabulary (top-K words by frequency).
3. Trains the Skip-Gram model and saves the learned embeddings to 'skipgram.pt'.

Usage:
    python skipgram.py
"""

import nltk
import torch
from torch import nn
from nltk.corpus import brown, stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# ------------------------ Preprocessing Helpers ------------------------ #

def downloadNltkResources():
    """
    Makes sure the NLTK data used by this script has been requested.

    Calls nltk.download for 'brown', then 'stopwords', then 'wordnet'.
    """
    for resource in ('brown', 'stopwords', 'wordnet'):
        nltk.download(resource)

def filterAndLemmatizeSentence(rawSentence, stopWords, lemmatizer):
    """
    Normalises a raw sentence into a list of lemmas.

    Tokens that contain non-alphabetic characters, or whose lowercase form is
    a stopword, are discarded. The remaining tokens are lowercased and passed
    through the lemmatizer.

    Args:
        rawSentence (list of str): Original sentence tokens from Brown corpus.
        stopWords (set of str): Set of stopwords to remove.
        lemmatizer (WordNetLemmatizer): NLTK lemmatizer instance.

    Returns:
        list of str: The processed tokens, in their original order.
    """
    lemmas = []
    for word in rawSentence:
        if word.isalpha():
            lowerWord = word.lower()
            if lowerWord not in stopWords:
                lemmas.append(lemmatizer.lemmatize(lowerWord))
    return lemmas

def preprocessBrownCorpus(
    maxChunkSize=50,
    minFreq=5,
    topVocab=10000
):
    """
    Preprocesses the Brown corpus by:
      1. Downloading necessary NLTK resources.
      2. Filtering and lemmatizing tokens (stopwords removed).
      3. Splitting sentences longer than maxChunkSize into consecutive,
         non-overlapping blocks of at most maxChunkSize tokens.
      4. Removing words below a minimum frequency threshold.
      5. Limiting the vocabulary to top 'topVocab' words.

    Args:
        maxChunkSize (int): Maximum number of tokens per sentence block.
        minFreq (int): Minimum frequency for a word to remain in the dataset.
        topVocab (int): Maximum vocabulary size (top-K words).

    Returns:
        list of list of str: The preprocessed tokenized sentences (no empty ones).
        Counter: Word -> frequency after the minFreq filter. It is computed
            before the top-K cut, so it may hold words absent from the
            returned sentences.
    """
    downloadNltkResources()
    stopWords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    processedSentences = []

    # Step 1: Read the Brown corpus and preprocess each sentence
    for rawSentence in brown.sents():
        tokens = filterAndLemmatizeSentence(rawSentence, stopWords, lemmatizer)
        if not tokens:
            continue
        if len(tokens) > maxChunkSize:
            # If the sentence is extremely long, chunk it
            for i in range(0, len(tokens), maxChunkSize):
                processedSentences.append(tokens[i : i + maxChunkSize])
        else:
            processedSentences.append(tokens)

    # Step 2: Remove words below minFreq
    fullCounter = Counter(word for sent in processedSentences for word in sent)
    filteredWords = {w for w, c in fullCounter.items() if c >= minFreq}

    filteredSentences = [
        [w for w in sent if w in filteredWords]
        for sent in processedSentences
    ]

    # Step 3: Pick the top vocabulary by frequency. Kept as a set: it is only
    # used for membership tests, and a list lookup would cost a scan of up to
    # topVocab entries per corpus token.
    freqCounter = Counter(word for sent in filteredSentences for word in sent)
    vocabSet = {w for w, _ in freqCounter.most_common(topVocab)}

    # Step 4: Re-filter sentences to keep only words in the final top vocab
    finalSentences = [
        [w for w in sent if w in vocabSet]
        for sent in filteredSentences
    ]

    # Remove empty sentences
    finalSentences = [sent for sent in finalSentences if sent]

    return finalSentences, freqCounter

# ------------------------ Skip-Gram Dataset & Model ------------------------ #

class SkipGramDataset(torch.utils.data.Dataset):
    """
    Skip-Gram training pairs: one (center, context) index pair per sample.

    Windows are clipped at sentence boundaries, so words near the edges simply
    contribute fewer pairs.
    """
    def __init__(self, sentences, wordToIdx, windowSize=2):
        """
        Builds all (center, context) pairs up front.

        Args:
            sentences (list of list of str): Tokenized, preprocessed sentences.
            wordToIdx (dict): Mapping from words to indices.
            windowSize (int): Size of the context window on each side.
        """
        super().__init__()
        self.windowSize = windowSize
        self.wordToIdx = wordToIdx
        self.data = []

        for sentence in sentences:
            # Translate to indices once, skipping out-of-vocabulary words.
            ids = [wordToIdx[token] for token in sentence if token in wordToIdx]
            tokenCount = len(ids)
            if tokenCount < 2:
                # A lone word has no context to pair with.
                continue

            for pos, centerIdx in enumerate(ids):
                lo = max(0, pos - windowSize)
                hi = min(tokenCount, pos + windowSize + 1)
                self.data.extend(
                    (centerIdx, ids[other])
                    for other in range(lo, hi)
                    if other != pos
                )

    def __len__(self):
        """Returns the number of (center, context) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns pair `idx` as (centerIndex, contextIndex) long tensors.
        """
        centerIdx, contextIdx = self.data[idx]
        return (
            torch.tensor(centerIdx, dtype=torch.long),
            torch.tensor(contextIdx, dtype=torch.long),
        )

class SkipGramModel(nn.Module):
    """
    Skip-Gram model with Negative Sampling.

    Holds two embedding tables: `inEmbedding` for center words and
    `outEmbedding` for context / negative words.
    """
    def __init__(self, vocabSize, embeddingDim):
        """
        Initializes the Skip-Gram model.

        Args:
            vocabSize (int): Number of words in the vocabulary.
            embeddingDim (int): Dimensionality of the embeddings.
        """
        super().__init__()
        self.vocabSize = vocabSize
        self.embeddingDim = embeddingDim

        # inEmbedding -> center word embedding
        # outEmbedding -> context (target) word embedding
        self.inEmbedding = nn.Embedding(vocabSize, embeddingDim)
        self.outEmbedding = nn.Embedding(vocabSize, embeddingDim)

        # Initialize both tables uniformly in [-0.5/D, 0.5/D]
        initRange = 0.5 / embeddingDim
        self.inEmbedding.weight.data.uniform_(-initRange, initRange)
        self.outEmbedding.weight.data.uniform_(-initRange, initRange)

    def forward(self, centerIndices, contextIndices, negativeIndices):
        """
        Forward pass computes the Skip-Gram loss using negative sampling.

        The center embedding is scored against the true context word (pushed
        towards sigmoid = 1) and against each negative sample (pushed towards
        sigmoid = 0).

        Args:
            centerIndices (Tensor): [batchSize] center word indices.
            contextIndices (Tensor): [batchSize] context word indices.
            negativeIndices (Tensor): [batchSize, numNegSamples] negative word indices.

        Returns:
            loss (Tensor): Scalar mean loss over the batch.
        """
        centerEmbeds = self.inEmbedding(centerIndices)      # [B, D]
        contextEmbeds = self.outEmbedding(contextIndices)   # [B, D]

        # Positive scores: dot(center, context) -> [B]
        positiveScores = torch.sum(centerEmbeds * contextEmbeds, dim=1)

        # Negative scores: [B, K, D] x [B, D, 1] -> [B, K]
        negEmbeds = self.outEmbedding(negativeIndices)
        negativeScores = torch.bmm(negEmbeds, centerEmbeds.unsqueeze(2)).squeeze(2)

        # logsigmoid is evaluated stably for large |score|, whereas
        # log(sigmoid(x) + eps) caps the loss near -log(eps) and loses its
        # gradient once sigmoid underflows.
        positiveLoss = -nn.functional.logsigmoid(positiveScores)
        negativeLoss = -nn.functional.logsigmoid(-negativeScores).sum(dim=1)
        return torch.mean(positiveLoss + negativeLoss)

def generateNegativeSamples(batchSize, numNegSamples, vocabSize, wordFreqs):
    """
    Samples negative word indices for a Skip-Gram batch.

    Indices are drawn with replacement according to `wordFreqs` and laid out
    with one row of `numNegSamples` indices per batch element.

    Args:
        batchSize (int): Number of samples in the batch.
        numNegSamples (int): Number of negative samples per instance.
        vocabSize (int): Total number of words in the vocabulary (not used
            by the sampling itself).
        wordFreqs (Tensor): Word frequency distribution.

    Returns:
        Tensor: Negative sample indices with shape [batchSize, numNegSamples].
    """
    drawCount = batchSize * numNegSamples
    sampled = torch.multinomial(wordFreqs, drawCount, replacement=True)
    return sampled.view(batchSize, numNegSamples)

# ----------------------------- Main Function ----------------------------- #

def main():
    """
    Trains Skip-Gram embeddings on the Brown corpus and writes 'skipgram.pt'.

    Pipeline: preprocess the corpus, index the vocabulary alphabetically,
    build the negative-sampling distribution (counts raised to 0.75, then
    normalised), train with Adam for a fixed number of epochs, and save both
    embedding tables together with the word/index mappings.
    """
    print("Preprocessing Brown corpus for Skip-Gram...")
    # Preprocessing hyperparameters
    maxChunkSize, minFreq, topVocab = 50, 5, 10000

    # The second return value (pre-cut frequencies) is not needed here.
    corpus, _ = preprocessBrownCorpus(
        maxChunkSize=maxChunkSize,
        minFreq=minFreq,
        topVocab=topVocab,
    )

    # Count words over the final sentences and index them alphabetically
    vocabCounter = Counter()
    for sentence in corpus:
        vocabCounter.update(sentence)
    vocabList = sorted(word for word, _ in vocabCounter.most_common(topVocab))
    vocabSize = len(vocabList)
    wordToIdx = dict(zip(vocabList, range(vocabSize)))
    idxToWord = dict(enumerate(vocabList))
    print(f"Vocabulary size after preprocessing: {vocabSize}")

    # Negative-sampling distribution: count^0.75, normalised to sum to 1
    smoothedCounts = torch.tensor(
        [vocabCounter[word] for word in vocabList], dtype=torch.float
    ).pow(0.75)
    wordFreqs = smoothedCounts / smoothedCounts.sum()

    # Dataset and batched loader
    windowSize = 2
    trainingSet = SkipGramDataset(corpus, wordToIdx, windowSize=windowSize)
    loader = torch.utils.data.DataLoader(
        trainingSet,
        batch_size=64,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )

    # Model, placed on the GPU when one is available
    embeddingDim = 100
    model = SkipGramModel(vocabSize, embeddingDim)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on device: {device}")
    model.to(device)

    # Optimizer and training hyperparameters
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    epochs = 5
    numNegSamples = 5
    print("Starting Skip-Gram training...")

    for epoch in range(epochs):
        runningLoss = 0.0
        for centerBatch, contextBatch in loader:
            # Read the batch size first; the last batch may be smaller than 64.
            batchSize = centerBatch.size(0)
            centerBatch = centerBatch.to(device)
            contextBatch = contextBatch.to(device)

            negatives = generateNegativeSamples(
                batchSize=batchSize,
                numNegSamples=numNegSamples,
                vocabSize=vocabSize,
                wordFreqs=wordFreqs,
            ).to(device)

            batchLoss = model(centerBatch, contextBatch, negatives)
            optimizer.zero_grad()
            batchLoss.backward()
            optimizer.step()
            runningLoss += batchLoss.item()

        avgLoss = runningLoss / len(loader)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avgLoss:.4f}")

    # Save both embedding tables (moved to CPU) plus the lookup tables
    checkpoint = {
        'inEmbedding': model.inEmbedding.weight.data.cpu(),
        'outEmbedding': model.outEmbedding.weight.data.cpu(),
        'wordToIdx': wordToIdx,
        'idxToWord': idxToWord,
    }
    torch.save(checkpoint, 'skipgram.pt')
    print("Skip-Gram embeddings saved to 'skipgram.pt'.")

# Start Skip-Gram training only when this file is run directly, not when imported.
if __name__ == "__main__":
    main()


Preprocessing Brown corpus for Skip-Gram...


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Vocabulary size after preprocessing: 10000
Training on device: cuda
Starting Skip-Gram training...
Epoch 1/5, Average Loss: 2.6347
Epoch 2/5, Average Loss: 2.3579
Epoch 3/5, Average Loss: 2.2121
Epoch 4/5, Average Loss: 2.1088
Epoch 5/5, Average Loss: 2.0386
Skip-Gram embeddings saved to 'skipgram.pt'.
