In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tiktoken

In [63]:
# Batching / logging configuration.
block_size = 8            # length of each input window drawn by get_batch
batch_size = 32           # number of windows stacked into one batch
eval_interval = 100_000   # training loop prints progress every this many examples

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device", device)

device cuda


In [38]:
# Read the whole training corpus from input.txt into one UTF-8 decoded string.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [19]:
# Tokenize the raw corpus with tiktoken's "cl100k_base" encoding.
enc = tiktoken.get_encoding("cl100k_base")
# Encode `text` (the string read from input.txt). `data` is only created
# later, as a tensor of remapped ids, so referring to it here fails on a
# fresh kernel.
old_tokens = enc.encode(text)
# Show the first 100 original tiktoken ids as a quick sanity check.
old_tokens[:100]

[5451,
 47317,
 512,
 10438,
 584,
 10570,
 904,
 4726,
 11,
 6865,
 757,
 6604,
 382,
 2460,
 512,
 96945,
 11,
 6604,
 382,
 5451,
 47317,
 512,
 2675,
 527,
 682,
 20250,
 4856,
 311,
 2815,
 1109,
 311,
 2138,
 819,
 1980,
 2460,
 512,
 66494,
 13,
 20250,
 382,
 5451,
 47317,
 512,
 5451,
 11,
 499,
 1440,
 356,
 2192,
 355,
 2947,
 5979,
 355,
 374,
 10388,
 9354,
 311,
 279,
 1274,
 382,
 2460,
 512,
 1687,
 1440,
 956,
 11,
 584,
 1440,
 956,
 382,
 5451,
 47317,
 512,
 10267,
 603,
 5622,
 1461,
 11,
 323,
 584,
 3358,
 617,
 14095,
 520,
 1057,
 1866,
 3430,
 627,
 3957,
 956,
 264,
 36543,
 1980,
 2460,
 512,
 2822,
 810,
 7556,
 389,
 956]

In [21]:
# Compact the sparse tiktoken ids into the dense range 0..(number of distinct
# ids - 1). Ids are handed out in set-iteration order, so the mapping is only
# reproducible as long as that order is.
old2new = {old_id: new_id for new_id, old_id in enumerate(set(old_tokens))}
# Inverse lookup: compact id -> original tiktoken id.
new2old = {new_id: old_id for old_id, new_id in old2new.items()}
# The corpus re-expressed in compact ids.
tokens = [old2new[old_id] for old_id in old_tokens]

In [22]:
# Round-trip check: translate the first 100 compact ids back to their
# original tiktoken ids and decode them to text.
enc.decode([new2old[x] for x in tokens[:100]])

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't"

In [23]:
# Wrap the compact token ids in a 1-D int64 tensor; every later cell slices
# its training / validation windows out of this tensor.
data = torch.tensor(tokens, dtype=torch.long)
print(data.shape, data.dtype)
# NOTE(review): printing 1000 ids produces a long output; a shorter slice
# would probably be enough for a sanity check.
print(data[:1000])

torch.Size([301829]) torch.int64
tensor([ 3488,  7161,   349,  5685,   408,  5738,   678,  3131,     5,  4145,
          552,  4028,   233,  1797,   349, 11796,     5,  4028,   233,  3488,
         7161,   349,  1922,   360,   493,  8942,  3201,   164,  2015,   844,
          164,  1604,   604,  1501,  1797,   349,   723,     7,  8942,   233,
         3488,  7161,   349,  3488,     5,   339,  1095,   207,  1639,   206,
         2099,  3747,   206,   225,  5666,  5242,   164,   136,   970,   233,
         1797,   349,  1285,  1095,   722,     5,   408,  1095,   722,   233,
         3488,  7161,   349,  5618,   423,  3568,  1110,     5,   175,   408,
         2351,   434,  7008,   354,   798,  1407,  2398,   443,  2695,   722,
          121,  2590,  1501,  1797,   349,  2019,   596,  4459,   240,   722,
           12,   832,   278,   238,  2063,    11,  2255,     5,  2255,  1687,
         7671,  7161,   349,  2763,  2437,     5,  1292,  5710,   233,  3488,
         7161,   349,  1285,   

In [25]:
# Hold out the tail of the token stream for validation: the first 90% of
# `data` is used for training, the remaining 10% for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [26]:

# Peek at one training window: block_size input tokens plus the one extra
# token needed for the shifted-by-one targets built in get_batch.
print(train_data[:block_size+1])

tensor([3488, 7161,  349, 5685,  408, 5738,  678, 3131,    5])


In [27]:
# Vocabulary size = number of distinct compact token ids. It sizes the
# embedding tables and the model's output layer.
unique_tokens = list(set(tokens))
vocab_size = len(unique_tokens)
vocab_size

12111

In [24]:
# Sanity check: compact ids come from enumerate(), so the largest one should
# equal vocab_size - 1.
max(tokens)

12110

In [28]:
# Small stand-alone embedding table (5 values per token) with random initial
# weights. In the cells shown here it is only used for the lookup demo; the
# n-gram model defined later creates its own embedding layer.
embed_size = 5
embeds = nn.Embedding(vocab_size, embed_size)

In [30]:
# Embedding lookup demo. This only works for strings whose tiktoken ids all
# occur in the Shakespeare corpus: any other id is missing from old2new and
# raises KeyError.
my_str = "Before we proceed any further, hear me speak."
embeds(torch.tensor([old2new[x] for x in enc.encode(my_str)]))

tensor([[-0.0955,  1.1293,  0.4822, -1.2058, -0.6945],
        [-1.1285,  0.5376, -0.7270, -0.5792, -1.4454],
        [-0.6640,  0.0040,  0.7976, -1.7570,  0.1647],
        [ 0.0354, -0.9974, -1.7675,  0.5055, -0.4660],
        [-0.4685, -0.8378,  1.4380,  0.6492, -0.8009],
        [-1.7285, -0.7581, -0.1139, -0.0851,  0.0438],
        [-0.7459, -3.4173, -0.7927,  0.1880,  0.1063],
        [-0.4173,  0.9136,  0.3821, -0.1080, -0.1367],
        [-0.5877, -0.2802,  0.3289,  0.1856, -0.0176],
        [-0.5717, -0.1055,  0.9487,  1.4674, -1.1212]],
       grad_fn=<EmbeddingBackward0>)

In [33]:
def get_batch(split):
    """Draw a random batch of input windows and their next-token targets.

    Args:
        split: "train" samples from ``train_data``; any other value samples
            from ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each with ``batch_size`` rows of
        ``block_size`` token ids, both moved to ``device``. Row ``k`` of
        ``y`` is row ``k`` of ``x`` shifted one position to the right in the
        underlying token stream.
    """
    source = train_data if split == "train" else val_data
    # The upper bound leaves room for the block_size + 1 tokens each window needs.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [34]:
context_size = 2  # number of preceding tokens the n-gram model sees per prediction
embedding_dim = 10  # number of values in each token embedding of the n-gram model


In [35]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model over token ids.

    The ``context_size`` context token ids are embedded, their embeddings are
    concatenated, and a two-layer MLP turns them into log-probabilities over
    the vocabulary for the next token.

    Args:
        vocab_size: number of distinct token ids (rows of the embedding table
            and width of the output layer).
        embedding_dim: number of values in each token embedding.
        context_size: number of context tokens per example.
        hidden_dim: width of the hidden layer; defaults to 128.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return next-token log-probabilities for one or many contexts.

        Args:
            inputs: integer tensor of token ids, either a single context of
                shape ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-probabilities
            (``batch`` is 1 for a single un-batched context).
        """
        # Promote a single context to a batch of one so both cases share the
        # same code path below.
        if inputs.dim() < 2:
            inputs = inputs.reshape(1, -1)
        # (batch, context, embedding_dim) -> (batch, context * embedding_dim);
        # keeping the batch axis is what makes batches larger than one work.
        embeds = self.embeddings(inputs).flatten(start_dim=1)
        hidden = F.relu(self.linear1(embeds))
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)

In [77]:
def encode(s):
    """Tokenize ``s`` with ``enc`` and map every id through ``old2new``.

    Returns a list of compact token ids. Raises ``KeyError`` when the string
    produces a tiktoken id that has no entry in ``old2new``.
    """
    compact_ids = []
    for raw_id in enc.encode(s):
        compact_ids.append(old2new[raw_id])
    return compact_ids

def decode(s):
    """Map compact token ids back through ``new2old`` and decode them to text.

    ``s`` is an iterable of compact ids; the result is whatever ``enc.decode``
    returns for the corresponding original ids.
    """
    raw_ids = [new2old[compact_id] for compact_id in s]
    return enc.decode(raw_ids)

In [66]:
# Build (context, target) training pairs: each context is the context_size
# tokens immediately before position `pos`, the target is the token at `pos`.
# Only the first 1/100th of train_data is used (the `// 100` bound).
ngrams = [
    (train_data[pos - context_size:pos], train_data[pos])
    for pos in range(context_size, len(train_data) // 100)
]

In [51]:
# Inspect the first few (context, target) pairs; contexts are 1-D tensors of
# context_size ids and targets are 0-dim tensors.
ngrams[:4]

[(tensor([3488, 7161]), tensor(349)),
 (tensor([7161,  349]), tensor(5685)),
 (tensor([ 349, 5685]), tensor(408)),
 (tensor([5685,  408]), tensor(5738))]

In [67]:
# Number of training examples the loop will visit per epoch.
len(ngrams)

2714

In [39]:
# Example: a single word can map to more than one compact token id.
encode("thou")

[191, 140]

In [59]:
# Scratch check: view(1) turns a 0-dim tensor into a 1-element 1-D tensor,
# the same reshaping the training loop applies to each target.
torch.tensor(349).view(1)

tensor([349])

In [None]:
# Fresh model plus the loss / optimizer used by the training loop.
model = NGramLanguageModeler(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    context_size=context_size,
)
# NLLLoss consumes log-probabilities, which is what the model's forward()
# returns (it ends in log_softmax).
loss_function = nn.NLLLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.001)
# Summed training loss of every epoch, appended to by the training loop.
losses = []

In [None]:
# Optionally resume from a checkpoint written by an earlier run (the
# torch.save cell further down creates it). On a clean checkout the file does
# not exist yet, so fall back to the freshly initialised weights instead of
# aborting a Restart & Run All.
# NOTE(security): torch.load unpickles the file; only load checkpoints that
# you created yourself.
try:
    checkpoint = torch.load("embedding_model.pt")
except FileNotFoundError:
    print("embedding_model.pt not found; training from freshly initialised weights")
else:
    model.load_state_dict(checkpoint)

In [69]:

# Train with per-example SGD for 50 passes over `ngrams`, recording the
# summed loss of every epoch in `losses`.
for epoch in range(50):
    total_loss = 0
    for i, (context, target) in enumerate(ngrams):

        if i % eval_interval == 0:
            # i / len(ngrams) is a fraction in [0, 1); scale it so the number
            # matches the "percent" label.
            print(f"percent done w/ epoch: {100 * i / len(ngrams):.1f}%")

        # Gradients accumulate across backward() calls, so clear the ones
        # left over from the previous example first.
        model.zero_grad()

        # Forward pass. `context` already is a tensor of token ids, so it is
        # passed to the model directly; the result holds log-probabilities
        # over the next token.
        log_probs = model(context)

        # `target` is a 0-dim tensor; view(1) gives the 1-element batch of
        # class indices that pairs with the single row of log_probs.
        loss = loss_function(log_probs, target.view(1))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # .item() extracts the Python number from the 1-element loss tensor.
        total_loss += loss.item()
    losses.append(total_loss)

    print(f"epoch {epoch} loss: {total_loss}")
print(losses)  # per-epoch summed losses; these should trend downward


percent done w/ epoch: 0.0
epoch 0 loss: 25284.066875457764
percent done w/ epoch: 0.0
epoch 1 loss: 24546.339502334595
percent done w/ epoch: 0.0
epoch 2 loss: 23708.1665725708
percent done w/ epoch: 0.0
epoch 3 loss: 22777.464068889618
percent done w/ epoch: 0.0
epoch 4 loss: 21875.829502224922
percent done w/ epoch: 0.0
epoch 5 loss: 21077.029460787773
percent done w/ epoch: 0.0
epoch 6 loss: 20386.74539422989
percent done w/ epoch: 0.0
epoch 7 loss: 19792.52147525549
percent done w/ epoch: 0.0
epoch 8 loss: 19270.021094292402
percent done w/ epoch: 0.0
epoch 9 loss: 18793.828703939915
percent done w/ epoch: 0.0
epoch 10 loss: 18352.76360039413
percent done w/ epoch: 0.0
epoch 11 loss: 17943.050711661577
percent done w/ epoch: 0.0
epoch 12 loss: 17561.66039097309
percent done w/ epoch: 0.0
epoch 13 loss: 17204.89176040888
percent done w/ epoch: 0.0
epoch 14 loss: 16870.611566111445
percent done w/ epoch: 0.0
epoch 15 loss: 16557.886550411582
percent done w/ epoch: 0.0
epoch 16 loss:

In [79]:
# Persist the model's parameters (its state_dict) to embedding_model.pt so a
# later session can restore them with model.load_state_dict.
torch.save(model.state_dict(), "embedding_model.pt")

In [72]:
# Compact token id(s) for the word whose embedding is inspected next.
encode("Before")

[5685]

In [73]:

# Look up the learned embedding row(s) of a particular word, here "Before".
# Indexing with the list returned by encode() yields one row per token id.
print(model.embeddings.weight[encode("Before")])

tensor([[ 0.6201, -0.4573,  1.1093,  0.8587, -0.7115, -0.6778, -0.8076, -1.2620,
         -0.6081,  0.3496]], grad_fn=<IndexBackward0>)


In [76]:
# Difference between the embeddings of two words.
# NOTE(review): encode() can return several token ids per word, so this is a
# row-wise difference between sub-word token embeddings rather than a single
# word-level vector. It also only works when both words yield the same number
# of tokens (or broadcastable shapes). Consider pooling rows per word first.
model.embeddings.weight[encode("king")] - model.embeddings.weight[encode("prince")]

tensor([[-0.1101, -0.3134,  1.4419,  1.5636,  0.1379, -1.6653, -0.1676,  0.4602,
          1.1939,  0.5346],
        [-0.2401,  1.8910,  1.4143,  0.6424,  1.7328, -2.9237, -0.3027, -1.7330,
         -0.4693,  0.7559]], grad_fn=<SubBackward0>)