## Model


In [10]:
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import transformers

## Naive Tokenizer

In [41]:
# Generate word counts (OPTIONAL)
# Counts every space-separated token of the raw corpus and caches the result
# in word_counts.csv (columns: word, count), sorted by descending count.
from collections import Counter

# Context manager so the corpus file handle is closed as soon as it is read.
with open("text8.txt") as corpus_file:
    # Splitting on a single space yields empty strings for consecutive,
    # leading or trailing spaces; those are dropped.
    all_words = [w for w in corpus_file.read().split(" ") if len(w) > 0]

print("Total word counts: {}".format(len(all_words)))

# Counter is a dict subclass and keeps first-occurrence insertion order,
# so the resulting frame has the same rows as a hand-rolled counting loop.
word_counts = Counter(all_words)

word_counts_df = pd.DataFrame(
    {
        "word": list(word_counts.keys()),
        "count": list(word_counts.values()),
    }
).sort_values("count", ascending=False)

word_counts_df.to_csv("word_counts.csv", index=False)


Total word counts: 17005207


In [46]:
# Generate vocab (OPTIONAL)
# Builds vocab.csv from word_counts.csv: every word seen at least
# WORD_COUNT_THRESHOLD times keeps its own row, and all rarer words are
# folded into a single UNKNOWN_TOKEN row appended at the end. Each row gets a
# consecutive integer token_index.

# keep_default_na=False: by default pandas parses strings such as "nan" and
# "null" as missing values, which would turn those corpus words into NaN.
word_counts_df = pd.read_csv("word_counts.csv", keep_default_na=False)

TOP_WORDS = 45000  # NOTE(review): not referenced in any visible cell
UNKNOWN_TOKEN = "[UNK]"

# NOTE(review): the saved output under this cell lists words with count 5,
# which this threshold would exclude -- it looks stale; re-run to refresh.
WORD_COUNT_THRESHOLD = 10
is_frequent = word_counts_df["count"] >= WORD_COUNT_THRESHOLD
other_words = word_counts_df[~is_frequent]
unknown_count = other_words["count"].sum()

vocab = pd.concat(
    [
        word_counts_df[is_frequent],
        pd.DataFrame(
            [
                {
                    "word": UNKNOWN_TOKEN,
                    "count": unknown_count,
                }
            ]
        ),
    ],
    # The positional index is discarded below, so avoid duplicate labels.
    ignore_index=True,
)

# Token indices follow row order: most frequent word first, [UNK] last.
vocab["token_index"] = range(len(vocab))
vocab = vocab.set_index("word")

print(vocab.tail(5))
vocab.to_csv("vocab.csv")


                count  token_index
word                              
thaws               5        71286
bosonic             5        71287
ginnungagap         5        71288
videocassette       5        71289
[UNK]          286363        71290


In [76]:
# Load vocabulary
# keep_default_na=False: by default pandas parses strings such as "nan" and
# "null" as missing values, so those words would lose their vocabulary entry
# and silently fall back to the unknown token.
vocab = pd.read_csv("vocab.csv", keep_default_na=False)
vocab = vocab.set_index("word")
UNKNOWN_TOKEN = "[UNK]"
# int(...) so the fallback index is a plain Python int like the dict values.
UNKNOWN_INDEX = int(vocab.loc[UNKNOWN_TOKEN, "token_index"])

# word -> token index, built column-wise instead of row-by-row via iterrows().
vocab_lookup = dict(zip(vocab.index, vocab["token_index"].tolist()))


def word_to_token_index(word):
    """Return the token index for `word`.

    Words that are not present in `vocab_lookup` map to `UNKNOWN_INDEX`.
    """
    return vocab_lookup.get(word, UNKNOWN_INDEX)


print("\"hello world\" is: [{}, {}]".format(
    word_to_token_index("hello"),
    word_to_token_index("world")
))

"hello world" is: [6425, 70]


In [78]:
# Create subsampled text
# Randomly drops frequent words from the raw corpus and writes the surviving
# words, separated by single spaces, to text8-subsampled.txt.

import math
import random

# Frequency threshold t in the keep probability min(1, sqrt(t / f(word))).
SUBSAMPLE_THRESHOLD = 0.00001
# Fixed seed so re-running this cell produces the same subsampled corpus.
SUBSAMPLE_SEED = 0

total_word_count = vocab["count"].sum()
# word -> corpus count, built column-wise instead of row-by-row via iterrows().
vocab_count = dict(zip(vocab.index, vocab["count"].tolist()))


def probability_to_keep_word(word, threshold=SUBSAMPLE_THRESHOLD):
    """Return the probability (a float in (0, 1]) of keeping `word`.

    For a word in `vocab_count` the probability is
    min(1, sqrt(threshold / frequency)), where frequency is the word's count
    divided by `total_word_count`. Words missing from `vocab_count` are
    always kept (probability 1.0).
    """
    if word not in vocab_count:
        return 1.0
    word_frequency = vocab_count[word] / total_word_count
    return min(1.0, math.sqrt(threshold / word_frequency))


print("Probability to keep the={} wheelbarrow={}".format(probability_to_keep_word("the"), probability_to_keep_word("wheelbarrow")))

print("Total words: {}".format(total_word_count))

random.seed(SUBSAMPLE_SEED)
# Context manager so the corpus file handle is closed as soon as it is read.
with open("text8.txt") as corpus_file:
    # Empty strings (from repeated spaces) are skipped before any random draw,
    # so they do not consume random numbers.
    cleaned_words = [
        w
        for w in corpus_file.read().split(" ")
        if len(w) > 0 and random.random() <= probability_to_keep_word(w)
    ]

print("Total words (before subsampling): {}".format(total_word_count))
print("Total words (after subsampling): {}".format(len(cleaned_words)))

# Single-space separated, no leading or trailing separator.
with open("text8-subsampled.txt", "w") as f:
    f.write(" ".join(cleaned_words))


Probability to keep the=0.012657625384224533 wheelbarrow=1.0
Total words: 17005207
Total words (before cleaning): 17005207
Total words (after cleaning): 4978951


In [None]:
# PyTorch Model

# Load the subsampled corpus as a list of non-empty words. The context
# manager closes the file handle as soon as the text has been read.
with open("text8-subsampled.txt") as subsampled_file:
    subsampled_words = [w for w in subsampled_file.read().split(" ") if len(w) > 0]

# TODO: remaining steps for this cell
# => Create a DataLoader for training samples
# => Define a model
# => Training loop
# => Saving/loading the parameters