## Model


In [10]:
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import transformers

## Naive Tokenizer

In [14]:
# Generate word counts (OPTIONAL)
all_text = []
all_words = [w for w in open("text8.txt").read().split(" ") if len(w) > 0]

word_counts = {}
for word in all_words:
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1

word_counts_df = pd.DataFrame(
    {
        "word": word_counts.keys(),
        "count": word_counts.values(),
    }
)
word_counts_df.sort_values("count", ascending=False, inplace=True)

word_counts_df.to_csv("word_counts.csv", index=False)


In [33]:
# Generate vocab (OPTIONAL)
word_counts_df = pd.read_csv("word_counts.csv")

TOP_WORDS = 30000
UNKNOWN_TOKEN = "[UNK]"

other_words = word_counts_df.iloc[:TOP_WORDS]
unknown_count = other_words["count"].sum()

vocab = pd.concat([
    word_counts_df.head(TOP_WORDS),
    pd.DataFrame([
        {
            "word": UNKNOWN_TOKEN,
            "count": unknown_count,
        }
    ])
])

vocab["token_index"] = range(len(vocab))
vocab.set_index("word", inplace=True)

print(vocab.tail(5))
vocab.to_csv("vocab.csv")

def word_to_token_index(word):
    if word in vocab:
        return vocab.loc[word]["token_index"]
    else:
        return UNKNOWN_INDEX


               count  token_index
word                             
prosthetic        22        29996
sedative          22        29997
mnemonics         22        29998
lise              22        29999
[UNK]       16315126        30000


In [40]:
# Load vocabulary
vocab = pd.read_csv("vocab.csv")
vocab.set_index("word", inplace=True)
UNKNOWN_TOKEN = "[UNK]"
UNKNOWN_INDEX = vocab.loc[UNKNOWN_TOKEN, "token_index"]

def word_to_token_index(word):
    if word in vocab.index:
        return vocab.loc[word]["token_index"]
    else:
        return UNKNOWN_INDEX


print("Hello world is: {} {}".format(word_to_token_index("hello"), word_to_token_index("world")))

Hello world is: 6425 70
