In [1]:
import os
import numpy as np
import random
import re
import pandas as pd
from collections import defaultdict, Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [2]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

torch.device("cuda" if torch.cuda.is_available() else "cpu")

True
NVIDIA GeForce RTX 3050 Laptop GPU


device(type='cuda')

In [3]:
def load_and_tokenize_texts(folder_path=".", num_files=12):
    all_words = []
    for i in range(1, num_files + 1):
        file_path = os.path.join(folder_path, f"{i}.txt")
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read().lower()
            text = re.sub(r"[^a-z\s]", "", text)
            words = text.split()
            all_words.extend(words)
    print(f"Number of tokens: {len(all_words)}, Size: {len(all_words) * 5 / (1024 * 1024):.2f} MB")
    return all_words

In [4]:
def build_vocab(words, min_count=5):
    word_freq = Counter(words)
    vocab = {word for word, freq in word_freq.items() if freq >= min_count}
    word2idx = {word: i for i, word in enumerate(vocab)}
    return word2idx


In [5]:
def generate_skipgram_pairs(words, word2idx, window_size=2):
    pairs = []
    for i, center in enumerate(words):
        if center not in word2idx:
            continue
        window = random.randint(1, window_size)
        for j in range(i - window, i + window + 1):
            if j != i and 0 <= j < len(words):
                context = words[j]
                if context in word2idx:
                    pairs.append((word2idx[center], word2idx[context]))
    return pairs

In [6]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.input_embeddings = nn.Linear(vocab_size, embedding_dim, bias=False)
        self.output_embeddings = nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, x):
        h = self.input_embeddings(x)
        out = self.output_embeddings(h)
        return out

    def get_embeddings(self):
        return self.input_embeddings.weight.detach().T


In [16]:
def train_skipgram_model(pairs, vocab_size, embedding_dim=100, epochs=5, batch_size=512):
    model = SkipGramModel(vocab_size, embedding_dim)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        random.shuffle(pairs)
        losses = []
        for i in tqdm(range(0, len(pairs), batch_size)):
            batch = pairs[i:i+batch_size]
            inputs, targets = zip(*batch)
            x = torch.zeros(len(inputs), vocab_size)
            for i, idx in enumerate(inputs):
                x[i, idx] = 1.0
            y = torch.tensor(targets)

            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        print(f"Epoch {epoch+1}, Loss: {np.mean(losses):.4f}")
    return model

In [17]:
def load_wordsim353(filepath="combined.csv"):
    df = pd.read_csv(filepath)
    df["similarity"] = MinMaxScaler().fit_transform(df[["Human (mean)"]])
    return df[["Word 1", "Word 2", "similarity"]]


In [18]:
def check_wordsim_coverage(wordsim, word2idx):
    in_vocab = 0
    total = 0
    for _, row in wordsim.iterrows():
        w1, w2 = row["Word 1"].lower(), row["Word 2"].lower()
        if w1 in word2idx and w2 in word2idx:
            in_vocab += 1
        total += 1
    print(f"WordSim-353 coverage: {in_vocab}/{total} pairs ({in_vocab / total * 100:.2f}%)")


In [19]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-8)


In [20]:
def evaluate_model(model, word2idx, wordsim):
    embeddings = model.get_embeddings().numpy()
    preds, targets = [], []
    for _, row in wordsim.iterrows():
        w1, w2 = row["Word 1"].lower(), row["Word 2"].lower()
        if w1 in word2idx and w2 in word2idx:
            v1 = embeddings[word2idx[w1]]
            v2 = embeddings[word2idx[w2]]
            sim = cosine_similarity(v1, v2)
            preds.append(sim)
            targets.append(row["similarity"])
    mse = mean_squared_error(targets, preds)
    return mse

In [21]:
def build_bow_vectors(words, word2idx, window_size=2):
    bow = defaultdict(lambda: np.zeros(len(word2idx)))
    for i in range(len(words)):
        if words[i] not in word2idx:
            continue
        center = word2idx[words[i]]
        for j in range(i - window_size, i + window_size + 1):
            if j != i and 0 <= j < len(words) and words[j] in word2idx:
                context = word2idx[words[j]]
                bow[center][context] += 1
    return bow


In [22]:
def evaluate_bow_model(bow_vectors, word2idx, wordsim):
    preds, targets = [], []
    for _, row in wordsim.iterrows():
        w1, w2 = row["Word 1"].lower(), row["Word 2"].lower()
        if w1 in word2idx and w2 in word2idx:
            v1 = bow_vectors[word2idx[w1]]
            v2 = bow_vectors[word2idx[w2]]
            sim = cosine_similarity(v1, v2)
            preds.append(sim)
            targets.append(row["similarity"])
    mse = mean_squared_error(targets, preds)
    return mse

In [23]:
if __name__ == "__main__":
    words = load_and_tokenize_texts(folder_path=".", num_files=12)

    word2idx = build_vocab(words)
    print("Vocabulary size:", len(word2idx))

    pairs = generate_skipgram_pairs(words, word2idx, window_size=2)
    print("Number of skip-gram pairs:", len(pairs))

    model = train_skipgram_model(pairs, vocab_size=len(word2idx), embedding_dim=100)

    wordsim = load_wordsim353("combined.csv")
    check_wordsim_coverage(wordsim, word2idx)

    mse_embed = evaluate_model(model, word2idx, wordsim)
    print(f"Word2Vec model MSE: {mse_embed:.4f}")

    bow_vectors = build_bow_vectors(words, word2idx, window_size=2)
    mse_bow = evaluate_bow_model(bow_vectors, word2idx, wordsim)
    print(f"Bag-of-Words model MSE: {mse_bow:.4f}")


Number of tokens: 2128473, Size: 10.15 MB
Vocabulary size: 17940
Number of skip-gram pairs: 5987956


100%|██████████| 11696/11696 [13:02<00:00, 14.96it/s]


Epoch 1, Loss: 6.5821


100%|██████████| 11696/11696 [08:00<00:00, 24.36it/s]


Epoch 2, Loss: 6.3076


100%|██████████| 11696/11696 [07:59<00:00, 24.37it/s]


Epoch 3, Loss: 6.2282


100%|██████████| 11696/11696 [07:58<00:00, 24.43it/s]


Epoch 4, Loss: 6.1758


100%|██████████| 11696/11696 [07:57<00:00, 24.49it/s]


Epoch 5, Loss: 6.1364
WordSim-353 coverage: 230/353 pairs (65.16%)
Word2Vec model MSE: 0.1169
Bag-of-Words model MSE: 0.0820
