In [None]:
# Read the whole corpus into memory as one string.
with open('quixote.txt', mode='r', encoding='utf-8') as corpusFile:
  text = corpusFile.read()

In [None]:
# Report the corpus size in characters.
print(f"Length of dataset in characters: {len(text)}")

Length of dataset in characters: 2166999


In [None]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocabSize = len(chars)

print(''.join(chars))
print(vocabSize)


 !"'(),-.01246:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
71


In [None]:
# Lookup tables between characters and their index in the sorted vocabulary.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
  """Return the list of integer ids for the characters of string `s`."""
  return [stoi[c] for c in s]


def decode(l):
  """Return the string spelled by the integer ids in `l`."""
  return ''.join(itos[i] for i in l)


# Round-trip check: decoding an encoded string gives the string back.
print(encode("Hello World"))
print(decode(encode("Hello World")))

[26, 49, 56, 56, 59, 1, 41, 59, 62, 56, 48]
Hello World


To make the dataset usable for training the model, every character must be converted into its integer index in the sorted list of characters found in the text. `torch.tensor` then stores those indices in a tensor (a multi-dimensional array) of the dtype we specify, `torch.long`.

In [None]:
# Turn the whole corpus into a tensor of integer token ids
import torch

data = torch.tensor(encode(text), dtype=torch.int64)

# Show the shape and dtype, then peek at the first 1000 ids.
print(f"{data.shape} {data.dtype}")
print(data[:1000])

torch.Size([2166999]) torch.int64
tensor([21, 26, 19, 34, 38, 23, 36,  1, 27,  9,  0,  0, 41, 26, 27, 21, 26,  1,
        38, 36, 23, 19, 38, 37,  1, 33, 24,  1, 38, 26, 23,  1, 21, 26, 19, 36,
        19, 21, 38, 23, 36,  1, 19, 32, 22,  1, 34, 39, 36, 37, 39, 27, 38, 37,
         1, 33, 24,  1, 38, 26, 23,  1, 24, 19, 31, 33, 39, 37,  1, 25, 23, 32,
        38, 30, 23, 31, 19, 32,  1, 22, 33, 32,  0, 35, 39, 27, 42, 33, 38, 23,
         1, 33, 24,  1, 30, 19,  1, 31, 19, 32, 21, 26, 19,  0,  0,  0, 27, 58,
         1, 45,  1, 66, 53, 56, 56, 45, 51, 49,  1, 59, 50,  1, 30, 45,  1, 31,
        45, 58, 47, 52, 45,  7,  1, 64, 52, 49,  1, 58, 45, 57, 49,  1, 59, 50,
         1, 67, 52, 53, 47, 52,  1, 27,  1, 52, 45, 66, 49,  1, 58, 59,  1, 48,
        49, 63, 53, 62, 49,  1, 64, 59,  1, 47, 45, 56, 56,  1, 64, 59,  0, 57,
        53, 58, 48,  7,  1, 64, 52, 49, 62, 49,  1, 56, 53, 66, 49, 48,  1, 58,
        59, 64,  1, 56, 59, 58, 51,  1, 63, 53, 58, 47, 49,  1, 59, 58, 49,  1,
      

In [None]:
# Train on the first 90% of the tokens; hold out the remainder for validation.
n = int(0.9 * len(data))
trainData, valData = data[:n], data[n:]

In [None]:
# Context length used when slicing training chunks out of the token stream.
blockSize = 8
# Peek at the first chunk: blockSize + 1 tokens, i.e. one more token than the context length.
trainData[:blockSize + 1]

tensor([21, 26, 19, 34, 38, 23, 36,  1, 27])

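A single chunk of `blockSize + 1` tokens packs `blockSize` training examples: one for every context length from 1 up to `blockSize`, each paired with the token that follows it. This is efficient, and it also gets the transformer network used to seeing contexts of different lengths.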

In [None]:
# Inputs are the first blockSize tokens; targets are the same window shifted by one,
# so y[t] is the token that follows the context x[:t + 1].
x = trainData[:blockSize]
y = trainData[1:blockSize + 1]

for t in range(blockSize):
  context, target = x[:t + 1], y[t]
  print(f"When input is {context}, the target is: {target}")

When input is tensor([21]), the target is: 26
When input is tensor([21, 26]), the target is: 19
When input is tensor([21, 26, 19]), the target is: 34
When input is tensor([21, 26, 19, 34]), the target is: 38
When input is tensor([21, 26, 19, 34, 38]), the target is: 23
When input is tensor([21, 26, 19, 34, 38, 23]), the target is: 36
When input is tensor([21, 26, 19, 34, 38, 23, 36]), the target is: 1
When input is tensor([21, 26, 19, 34, 38, 23, 36,  1]), the target is: 27


In [None]:
batchSize = 4 # Independent sequences processed in parallel
blockSize = 8 # Maximum context length for predictions

def getBatch(split):
  """Sample a random batch of (input, target) windows from one data split.

  Args:
    split: "train" selects `trainData`; any other value selects `valData`.

  Returns:
    A tuple (x, y) of stacked tensors with batchSize rows of blockSize tokens
    each. y is x shifted one position to the right, so y[b, t] is the token
    that follows the context x[b, :t + 1].
  """
  # The notebook defines the splits as trainData / valData; the snake_case
  # names used here before were never defined and raised NameError.
  source = trainData if split == "train" else valData
  # Start offsets are drawn from [0, len(source) - blockSize), which leaves
  # room for the target window that ends one token later.
  idx = torch.randint(len(source) - blockSize, (batchSize,))
  x = torch.stack([source[i:i + blockSize] for i in idx])
  y = torch.stack([source[i + 1:i + blockSize + 1] for i in idx])
  return x, y

# Draw one training batch and print it.
xb, yb = getBatch('train')
print(xb)
print(yb)

# Spell out every (context -> target) pair contained in the batch.
for b in range(batchSize):
  for t in range(blockSize):
    context, target = xb[b][:t + 1], yb[b][t]
    print(f"When input is {context.tolist()}, the target is: {target}")

tensor([[49, 48,  7,  1, 67, 52, 59,  1],
        [59, 64, 52, 45, 62, 53, 59,  1],
        [59, 65, 16,  3,  1, 45, 58, 48],
        [49,  1, 45, 64,  1, 59, 58, 47]])
tensor([[48,  7,  1, 67, 52, 59,  1, 47],
        [64, 52, 45, 62, 53, 59,  1, 50],
        [65, 16,  3,  1, 45, 58, 48,  1],
        [ 1, 45, 64,  1, 59, 58, 47, 49]])
When input is [49], the target is: 48
When input is [49, 48], the target is: 7
When input is [49, 48, 7], the target is: 1
When input is [49, 48, 7, 1], the target is: 67
When input is [49, 48, 7, 1, 67], the target is: 52
When input is [49, 48, 7, 1, 67, 52], the target is: 59
When input is [49, 48, 7, 1, 67, 52, 59], the target is: 1
When input is [49, 48, 7, 1, 67, 52, 59, 1], the target is: 47
When input is [59], the target is: 64
When input is [59, 64], the target is: 52
When input is [59, 64, 52], the target is: 45
When input is [59, 64, 52, 45], the target is: 62
When input is [59, 64, 52, 45, 62], the target is: 53
When input is [59, 64, 52, 45, 

Implement Bigram Language Model

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as Fn

class BigramLM(nn.Module):
  """Bigram language model: next-token logits are looked up from the current token alone."""

  def __init__(self, vocabSize):
    """Build the model.

    Args:
      vocabSize: Number of distinct tokens. The lookup table has one row per
        token, and each row holds vocabSize logits for the next token.
    """
    super().__init__()

    # Have each token read off the logits for next token from a lookup table
    self.tokenEmbeddingTable = nn.Embedding(vocabSize, vocabSize)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B, T) tensor of integer token ids.
      targets: Optional (B, T) tensor of integer token ids to predict.

    Returns:
      A tuple (logits, loss). Without targets, logits has shape (B, T, C) and
      loss is None. With targets, logits is returned flattened to (B * T, C)
      and loss is the cross-entropy between those logits and the flattened
      targets.
    """
    logits = self.tokenEmbeddingTable(idx)

    if targets is None:
      loss = None
    else:
      # cross_entropy wants (N, C) logits and (N,) targets, so merge the
      # batch and time dimensions.
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = Fn.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad() # Sampling never backpropagates, so skip building the autograd graph
  def generate(self, idx, maxNewTokens):
    """Extend each sequence in idx by maxNewTokens sampled tokens.

    Args:
      idx: (B, T) tensor of integer token ids used as the starting context.
      maxNewTokens: Number of tokens to sample and append.

    Returns:
      A (B, T + maxNewTokens) tensor: the original idx followed by the samples.
    """
    for _ in range(maxNewTokens):
      logits = self(idx)[0] # The loss is None here because no targets are passed
      logits = logits[:, -1, :] # Focus on last time step and becomes (B, C)
      probs = Fn.softmax(logits, dim=-1)

      nextIdx = torch.multinomial(probs, num_samples=1) # (B, 1) because one target for each batch dimension
      idx = torch.cat((idx, nextIdx), dim=1) # Create (B, T + 1) by concatenating sample idx to running sequence
    return idx

# Run the untrained model on the current batch and report the logits shape and loss.
m = BigramLM(vocabSize)
logits, loss = m(xb, yb)

print(logits.shape)
print(f"Loss: {loss}")

# Sample from the untrained model, starting from a single token with id 0.
print(decode(
  m.generate(idx=torch.zeros(1, 1, dtype=torch.int64), maxNewTokens=100)[0].tolist()
))

torch.Size([32, 71])
Loss: 4.954416275024414

ArB
=RfO(lY.M=uqkqVFmFj!y;J)tlyQivNPUMwgYPX,2(=1uoFvrsAmbCnIV.WsY-CKDOGqYl'PX,0??Gjo"b4(njenw=f6Ex?:


Train the model

In [None]:
# Optimise the lookup table with AdamW on randomly sampled mini-batches.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batchSize = 32
for steps in range(20000):
  xb, yb = getBatch("train")

  # Clear old gradients, then forward pass, backward pass and parameter update.
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()

# This is the loss of the final mini-batch only, not an average over the run.
print(loss.item())

2.3388075828552246


In [None]:
# Sample 500 new tokens from the trained model, starting from a single token with id 0.
print(decode(
  m.generate(idx=torch.zeros(1, 1, dtype=torch.int64), maxNewTokens=500)[0].tolist()
))


 lifom VIND le basareoth t h alis fothaiand gono whe gay Shou uciponos owanancldsussthinds ar maito
ite,'s bouche witor
w an ivere, pyo trre we halisticallleangh anoved d muthenenthetha ailernd th
mpl w iomous g nd s pa r and case p
m Que aily.
ot h,
se an, thosatyor mnd t plllindllake me'Yalin o ad of It I CH m nglllear buttoshe t Sano weaiall t ine heredine

Qus r noveraty ff pllyed
l by, slure, gin hesthed t f by rmera woffe

d mpio, findopam led thiz

an ind therenler
windin plfon to K
tht g
