## Model


In [10]:
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import transformers

## Naive Tokenizer

In [41]:
# Generate word counts (OPTIONAL)
# Counts every whitespace-separated word in text8.txt and writes the result,
# most frequent first, to word_counts.csv.
from collections import Counter

# Read inside a context manager so the file handle is closed once loaded.
with open("text8.txt") as text_file:
    all_words = [w for w in text_file.read().split(" ") if len(w) > 0]

print("Total word counts: {}".format(len(all_words)))

# Counter is a dict subclass, so it can still be used as a plain word -> count mapping.
word_counts = Counter(all_words)

word_counts_df = pd.DataFrame(
    {
        "word": list(word_counts.keys()),
        "count": list(word_counts.values()),
    }
).sort_values("count", ascending=False)

word_counts_df.to_csv("word_counts.csv", index=False)


Total word counts: 17005207


In [46]:
# Generate vocab (OPTIONAL)
# Builds vocab.csv from word_counts.csv: every word at or above the count
# threshold keeps its own row, all rarer words are folded into one [UNK] row.

# keep_default_na=False: with the defaults, read_csv parses strings such as
# "nan", "null" and "NA" as missing values, which would replace those words
# with NaN instead of keeping them as vocabulary entries.
word_counts_df = pd.read_csv("word_counts.csv", keep_default_na=False)

TOP_WORDS = 45000  # NOTE(review): not referenced in this cell; the cut is made by count threshold
UNKNOWN_TOKEN = "[UNK]"

WORD_COUNT_THRESHOLD = 10
is_frequent = word_counts_df["count"] >= WORD_COUNT_THRESHOLD
other_words = word_counts_df[~is_frequent]
# The unknown token inherits the combined count of every word below the threshold.
unknown_count = other_words["count"].sum()

vocab = pd.concat(
    [
        word_counts_df[is_frequent],
        pd.DataFrame([{"word": UNKNOWN_TOKEN, "count": unknown_count}]),
    ],
    ignore_index=True,
)

# Token indices follow row order, so the unknown token gets the last index.
vocab["token_index"] = range(len(vocab))
vocab = vocab.set_index("word")

print(vocab.tail(5))
vocab.to_csv("vocab.csv")


                count  token_index
word                              
thaws               5        71286
bosonic             5        71287
ginnungagap         5        71288
videocassette       5        71289
[UNK]          286363        71290


In [76]:
# Load vocabulary
# keep_default_na=False: with the defaults, read_csv parses strings such as
# "nan", "null" and "NA" as missing values, so those words would lose their
# vocabulary entry (and collapse onto a single NaN key in the lookup).
vocab = pd.read_csv("vocab.csv", keep_default_na=False).set_index("word")
UNKNOWN_TOKEN = "[UNK]"
UNKNOWN_INDEX = int(vocab.loc[UNKNOWN_TOKEN, "token_index"])

# word -> token index; built straight from the column instead of a row-by-row iterrows() loop.
vocab_lookup = vocab["token_index"].to_dict()
def word_to_token_index(word):
    """Return the token index stored in `vocab_lookup` for `word`.

    Words that are not in `vocab_lookup` map to `UNKNOWN_INDEX`.
    """
    return vocab_lookup.get(word, UNKNOWN_INDEX)

# Quick sanity check of the lookup on two sample words.
print("\"hello world\" is: [{}, {}]".format(
    *(word_to_token_index(sample_word) for sample_word in ("hello", "world"))
))

"hello world" is: [6425, 70]


In [78]:
# Create subsampled text

import math
import random

# Sum of all counts in the vocabulary (the [UNK] row carries the rare words' counts).
total_word_count = vocab["count"].sum()
# word -> count; built straight from the column instead of a row-by-row iterrows() loop.
vocab_count = vocab["count"].to_dict()

def probability_to_keep_word(word, threshold=0.00001):
    """Return the probability of keeping `word` when subsampling the corpus.

    The probability is sqrt(threshold / frequency), capped at 1.0, where
    frequency is the word's count in `vocab_count` divided by
    `total_word_count`.

    Args:
        word: The word to look up in `vocab_count`.
        threshold: Frequency threshold used in the formula; words whose
            frequency is at or below it are always kept.

    Returns:
        A float in (0, 1]. Words missing from `vocab_count` (or with a zero
        count) are always kept, i.e. 1.0.
    """
    count = vocab_count.get(word)
    if not count:
        # Unknown word or zero count: nothing to down-weight, and a zero
        # frequency would otherwise divide by zero below.
        return 1.0
    word_frequency = count / total_word_count
    return min(1.0, math.sqrt(threshold / word_frequency))

print("Probability to keep the={} wheelbarrow={}".format(probability_to_keep_word("the"), probability_to_keep_word("wheelbarrow")))

print("Total words: {}".format(total_word_count))

# A dedicated, seeded generator makes the subsample reproducible across
# Restart & Run All without touching the global `random` state.
SUBSAMPLE_SEED = 0
subsample_rng = random.Random(SUBSAMPLE_SEED)

# Read inside a context manager so the file handle is closed once loaded.
with open("text8.txt") as text_file:
    source_words = text_file.read().split(" ")

# Keep each non-empty word with its own keep-probability.
cleaned_words = [
    w
    for w in source_words
    if len(w) > 0 and subsample_rng.random() <= probability_to_keep_word(w)
]

print("Total words (before subsampling): {}".format(total_word_count))
print("Total words (after subsampling): {}".format(len(cleaned_words)))

# Single-space separated, with no leading or trailing separator.
with open("text8-subsampled.txt", "w") as f:
    f.write(" ".join(cleaned_words))


Probability to keep the=0.012657625384224533 wheelbarrow=1.0
Total words: 17005207
Total words (before cleaning): 17005207
Total words (after cleaning): 4978951


In [None]:
# PyTorch Model
from torch.utils.data import StackDataset, TensorDataset, DataLoader

# Read the subsampled corpus inside a context manager so the file handle is
# closed once the words are loaded.
with open("text8-subsampled.txt") as subsampled_file:
    subsampled_words = [w for w in subsampled_file.read().split(" ") if len(w) > 0]

# Number of context words taken on each side of a target word.
TAKE_WORDS_BEFORE = 2
TAKE_WORDS_AFTER = 2
# Index of the last word that still has TAKE_WORDS_AFTER words following it.
# NOTE(review): not referenced by the other cells visible in this notebook.
MAX_OFFSET = len(subsampled_words) - 1 - TAKE_WORDS_AFTER

def to_tokens(words):
    """Map each word in `words` to its token index, returning a list in the same order."""
    return list(map(word_to_token_index, words))

def generate_data_set(token_source: list[int], token_count_before: int, token_count_after: int):
    min_offset = token_count_before
    max_offset = len(token_source) - 1 - token_count_after

    bags = torch.zeros(max_offset - min_offset, token_count_before + token_count_after, dtype=torch.long)
    targets = []

    ## TODO >> PICK UP FROM HERE
    ## I'm tweaking the below to create the tensor directly for efficiency
    for target_offset in range(min_offset, max_offset):
        target_token = token_source[target_offset]
        token_bag = []
        before_offset = target_offset - token_count_before
        for i in range(token_count_before):
            token_bag.append(token_source[before_offset + i])
        after_offset = target_offset + 1
        for i in range(token_count_after):
            token_bag.append(token_source[after_offset + i])

        bags.append(torch.tensor(token_bag))
        targets.append(torch.tensor(target_token))

    return StackDataset(
        bags=TensorDataset(torch.tensor(bags)),
        targets=TensorDataset(torch.tensor(targets))
    )


# class TokenBagDatasetGenerator(IterableDataset):
#     def __init__(self, token_source: list[int], token_count_before: int, token_count_after: int) -> None:
#         super().__init__()
#         self.token_source = token_source
#         self.token_count_before = token_count_before
#         self.token_count_after = token_count_after
#         self.min_offset = token_count_before
#         self.max_offset = len(token_source) - 1 - token_count_after
#
#     def generate(self):
#         for target_offset in range(self.min_offset, self.max_offset):
#             target_offset = random.randint(self.min_offset, self.max_offset)
#             target_token = self.token_source[target_offset]
#             token_bag = []
#             before_offset = target_offset - self.token_count_before
#             for i in range(self.token_count_before):
#                 token_bag.append(self.token_source[before_offset + i])
#             after_offset = target_offset + 1
#             for i in range(self.token_count_after):
#                 token_bag.append(self.token_source[after_offset + i])
#
#             yield {
#                 "bag": torch.tensor(token_bag),
#                 "target": target_token,
#             }
#
#     def __iter__(self):
#         return iter(self.generate())

# test_loader = TokenBagLoader(
#     token_source=to_tokens("the quick brown fox jumps over the lazy dog".split(" ")),
#     token_count_before=2,
#     token_count_after=1,
# )
# test_loader_iter = test_loader.__iter__()
# print(test_loader_iter.__next__())
# print(test_loader_iter.__next__())


# Fraction of the subsampled corpus (taken from the start) used for training.
TRAIN_FRACTION = 0.8

training_count = int(len(subsampled_words) * TRAIN_FRACTION)
training_words = subsampled_words[:training_count]

# Window sizes come from the TAKE_WORDS_* constants rather than repeating the literals.
train_data_set = generate_data_set(
    token_source=to_tokens(training_words),
    token_count_before=TAKE_WORDS_BEFORE,
    token_count_after=TAKE_WORDS_AFTER,
)
train_loader = DataLoader(train_data_set, batch_size=64, shuffle=True)

# => Create a DataLoader for training samples
# => Define a model
# => Training loop
# => Saving/loading the parameters


In [108]:
import torch.optim as optim

class Word2VecNetwork(nn.Module):
    """Predicts a target token from the averaged embeddings of its context bag.

    Args:
        vocab_size: Number of distinct token indices; sizes both the embedding
            table and the output layer.
        embedding_dim: Width of each token embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.prediction = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for each bag in the batch.

        Args:
            x: Tensor of token indices with dimensions (batch, bag_item).

        Returns:
            Tensor of logits with dimensions (batch, vocab_size).
        """
        # Input dimensions: (batch, bag_item) => TokenIndex
        embeddings = self.embedding(x) # Dimensions: (batch, bag_item, embedding) => FeatureWeight
        # Average over the bag axis (dim=1). Averaging over dim=0 would mix
        # different samples of the batch together and drop the batch axis.
        average_embedding = torch.mean(embeddings, dim=1) # Dimensions: (batch, embedding) => FeatureWeight
        return self.prediction(average_embedding) # Dimensions: (batch, vocab_size) => Logit

BEFORE_WORDS = 2
AFTER_WORDS = 2
BAG_SIZE = BEFORE_WORDS + AFTER_WORDS
EMBEDDING_DIM = 300
# The embedding table must cover every token index, so size it from the highest
# index in the vocabulary rather than from the number of keys in vocab_lookup
# (which is smaller if any two rows ever share a key).
VOCAB_SIZE = int(vocab["token_index"].max()) + 1

net = Word2VecNetwork(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [110]:
NUM_EPOCHS = 50
LOG_EVERY = 1000  # number of mini-batches between loss reports

for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(data["bags"])
        loss = criterion(outputs, data["targets"])
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % LOG_EVERY == LOG_EVERY - 1:
            # Average over exactly the batches accumulated since the last report.
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / LOG_EVERY:.3f}')
            running_loss = 0.0

print('Finished Training')

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list