<a href="https://colab.research.google.com/github/dsogden/Bigram/blob/main/Bigram_Toy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
# Read the dataset: one name per line, with the line terminator removed.
file_path = '/content/names.txt'
with open(file_path, 'r') as f:
    words = [line.replace('\n', '') for line in f]

In [3]:
# Vocabulary = every distinct character seen in any name, in sorted order.
vocab = sorted({char for word in words for char in word})
print(vocab, len(vocab))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 26


In [4]:
# Character <-> integer lookup tables.
# Index 0 is reserved for '.', so the vocabulary characters occupy
# indices 1..len(vocab).
encoder = {char: idx + 1 for idx, char in enumerate(vocab)}
encoder['.'] = 0
# Build the decoder by inverting the encoder so the two tables always agree
# (decoder[encoder[c]] == c for every character, including '.').
decoder = {idx: char for char, idx in encoder.items()}

In [5]:
# Build (context, target) pairs: each context holds the `offset` indices that
# precede the target character, left-padded with 0 at the start of a word.
offset = 3
sample = words
contexts, targets = [], []
for word in sample:
    context = [0] * offset
    # '.' is appended so the final target of every word is the 0 token.
    for idx in (encoder[char] for char in word + '.'):
        contexts.append(context)
        targets.append(idx)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], idx]
X = torch.tensor(contexts)
y = torch.tensor(targets)

In [6]:
# Sanity check: shapes of the context tensor X and the target tensor y.
X.shape, y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [7]:
# Peek at the first ten context windows and the target index paired with each.
X[:10], y[:10]

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22]]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9]))

In [8]:
class Bigram(nn.Module):
    """Fixed-context character model: embedding -> tanh hidden layer -> logits.

    Args:
        block_size: number of context indices fed in per sample.
        num_embeddings: size of the embedding table (number of distinct indices).
        embedding_dim: width of each embedding vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output logits per sample.
    """

    def __init__(
            self, block_size, num_embeddings, embedding_dim, hidden_dim, output_dim
        ):
        super().__init__()
        # The hidden layer consumes all `block_size` embeddings laid side by side.
        flat_dim = embedding_dim * block_size
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.hidden = nn.Linear(flat_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):
        """Map a (batch, block_size) tensor of indices to (batch, output_dim) logits."""
        embedded = self.embedding(inputs)
        # Join the per-position embeddings of each sample into one row,
        # keeping position order (position 0 first).
        flat = embedded.flatten(start_dim=1)
        activated = torch.tanh(self.hidden(flat))
        return self.output(activated)

In [9]:
from tqdm import tqdm

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on CPU-only runtimes. A bare `torch.cuda.device(...)`
# call only constructs a context manager and selects nothing, so it is omitted.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Train on the leading 10% of the samples to keep this toy run short.
train_fraction = 0.1
n_train = int(train_fraction * X.shape[0])
X_train = X[:n_train]
y_train = y[:n_train]

block_size = offset
num_embeddings = len(vocab) + 1  # vocabulary characters plus the '.' token
embedding_dim = 5
hidden_dim = 100
output_dim = len(vocab) + 1  # one logit per possible next token
model = Bigram(
    block_size, num_embeddings, embedding_dim, hidden_dim, output_dim
).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=3e-4)
epochs = 200
batch_size = 128
for epoch in tqdm(range(epochs)):
    running_loss = 0.0
    # Walk the training split in non-overlapping mini-batches; the final
    # batch may be shorter than `batch_size`.
    for start in range(0, n_train, batch_size):
        x_ = X_train[start: start + batch_size].to(device)
        y_ = y_train[start: start + batch_size].to(device)

        optimizer.zero_grad()
        preds = model(x_)
        loss = loss_fn(preds, y_)
        loss.backward()
        optimizer.step()

        # The loss is a per-batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average.
        running_loss += loss.item() * x_.shape[0]
    running_loss /= max(n_train, 1)

    if epoch % 10 == 0:
        print(f'Epoch: {epoch}, loss = {running_loss:0.4f}')