Setup and Data Loading

In [1]:
import requests

# Download King James Bible from Project Gutenberg
url = "https://www.gutenberg.org/cache/epub/10/pg10.txt"
response = requests.get(url)
text = response.text

# Extract only Genesis
start = text.find("1:1 In the beginning God created the heaven and the earth.")
end = text.find("The Second Book of Moses:  Called Exodus")
genesis_text = text[start:end]

print("Characters in Genesis text:", len(genesis_text))
print(genesis_text[:500])

Characters in Genesis text: 4448806
1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Day, and the darkness he called Night.
And the evening and the morning were the first day.

1:6 An


Build Vocabulary

In [2]:
chars = sorted(list(set(genesis_text)))
vocab_size = len(chars)
print("Vocabulary size:", vocab_size)
print("Characters:", ''.join(chars))

Vocabulary size: 85
Characters: 
 !$%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz—‘’“”•™


Create a mapping from characteres to integers (Chars Tokenization)

In [3]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


(Tokenization + Detokenization)

In [4]:
encode = lambda s: [stoi[c] for c in s]   # string → list of ints
decode = lambda l: ''.join([itos[i] for i in l])  # list of ints → string

print(stoi)
print(itos)

{'\n': 0, '\r': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, 'a': 52, 'b': 53, 'c': 54, 'd': 55, 'e': 56, 'f': 57, 'g': 58, 'h': 59, 'i': 60, 'j': 61, 'k': 62, 'l': 63, 'm': 64, 'n': 65, 'o': 66, 'p': 67, 'q': 68, 'r': 69, 's': 70, 't': 71, 'u': 72, 'v': 73, 'w': 74, 'x': 75, 'y': 76, 'z': 77, '—': 78, '‘': 79, '’': 80, '“': 81, '”': 82, '•': 83, '™': 84}
{0: '\n', 1: '\r', 2: ' ', 3: '!', 4: '$', 5: '%', 6: '(', 7: ')', 8: '*', 9: ',', 10: '-', 11: '.', 12: '/', 13: '0', 14: '1', 15: '2', 16: '3', 17: '4', 18: '5', 19: '6', 20: '7', 21: '8', 22: '9', 23: ':', 24: ';', 25: '?', 26: 'A', 27: 'B

- encode("Hello") → [id1, id2, id3, id4, id5]
- decode([id1, id2, id3, id4, id5]) → "Hello"

In [5]:
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("And God said, Let there be light."))
print(decode(encode("And God said, Let there be light.")))

[26, 65, 55, 2, 32, 66, 55, 2, 70, 52, 60, 55, 9, 2, 37, 56, 71, 2, 71, 59, 56, 69, 56, 2, 53, 56, 2, 63, 60, 58, 59, 71, 11]
And God said, Let there be light.


Train/Validation Split

- Let's now split up the data into train and validation sets

In [11]:
import torch

data = torch.tensor(encode(genesis_text), dtype=torch.long)
n = int(0.9 * len(data)) # 90% to training

train_data = data[:n]
val_data = data[n:] # 10% to validate
len(train_data), len(val_data)

print(data.shape, data.dtype)
print(data[:100])

torch.Size([4448806]) torch.int64
tensor([14, 23, 14,  2, 34, 65,  2, 71, 59, 56,  2, 53, 56, 58, 60, 65, 65, 60,
        65, 58,  2, 32, 66, 55,  2, 54, 69, 56, 52, 71, 56, 55,  2, 71, 59, 56,
         2, 59, 56, 52, 73, 56, 65,  2, 52, 65, 55,  2, 71, 59, 56,  2, 56, 52,
        69, 71, 59, 11,  1,  0,  1,  0, 14, 23, 15,  2, 26, 65, 55,  2, 71, 59,
        56,  2, 56, 52, 69, 71, 59,  2, 74, 52, 70,  2, 74, 60, 71, 59, 66, 72,
        71,  2, 57, 66, 69, 64,  9,  2, 52, 65])


Batching Function

In [8]:
block_size = 8
batch_size = 4

def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,)) # random training
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([4, 8]) torch.Size([4, 8])


Demonstration to map input - target

In [10]:
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input (x) is {context.tolist()} the target (y) is: {target}")

when input (x) is [58] the target (y): 69
when input (x) is [58, 69] the target (y): 52
when input (x) is [58, 69, 52] the target (y): 65
when input (x) is [58, 69, 52, 65] the target (y): 52
when input (x) is [58, 69, 52, 65, 52] the target (y): 71
when input (x) is [58, 69, 52, 65, 52, 71] the target (y): 56
when input (x) is [58, 69, 52, 65, 52, 71, 56] the target (y): 70
when input (x) is [58, 69, 52, 65, 52, 71, 56, 70] the target (y): 24
when input (x) is [52] the target (y): 65
when input (x) is [52, 65] the target (y): 55
when input (x) is [52, 65, 55] the target (y): 2
when input (x) is [52, 65, 55, 2] the target (y): 55
when input (x) is [52, 65, 55, 2, 55] the target (y): 74
when input (x) is [52, 65, 55, 2, 55, 74] the target (y): 56
when input (x) is [52, 65, 55, 2, 55, 74, 56] the target (y): 63
when input (x) is [52, 65, 55, 2, 55, 74, 56, 63] the target (y): 71
when input (x) is [52] the target (y): 74
when input (x) is [52, 74] the target (y): 52
when input (x) is [52,

In [12]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[63, 56, 71, 59, 11,  2, 32, 66],
        [63, 63,  2, 71, 59, 56, 70, 56],
        [70, 56, 63, 73, 56, 70,  1,  0],
        [ 2, 26, 59, 60, 63, 72, 55,  2]])
targets:
torch.Size([4, 8])
tensor([[56, 71, 59, 11,  2, 32, 66,  2],
        [63,  2, 71, 59, 56, 70, 56,  2],
        [56, 63, 73, 56, 70,  1,  0, 74],
        [26, 59, 60, 63, 72, 55,  2, 74]])
----
when input is [63] the target: 56
when input is [63, 56] the target: 71
when input is [63, 56, 71] the target: 59
when input is [63, 56, 71, 59] the target: 11
when input is [63, 56, 71, 59, 11] the target: 2
when input is [63, 56, 71, 59, 11, 2] the target: 32
when input is [63, 56, 71, 59, 11, 2, 32] the target: 66
when input is [63, 56, 71, 59, 11, 2, 32, 66] the target: 2
when input is [63] the target: 63
when input is [63, 63] the target: 2
when input is [63, 63, 2] the target: 71
when input is [63, 63, 2, 71] the target: 59
when input is [63, 63, 2, 71, 59] the target: 56
when input is [63

Bigram Model Definition

- The model is a very simple neural network:

- Each character in the vocabulary is represented by an embedding.
- But here, unlike common embeddings, the embedding dimension = vocabulary size (vocab_size). In other words, each token is mapped directly to a vector of logits that already indicate the probabilities of the next token.
- That's why it's called Bigram: it only looks at one character in context to predict the next, without using a larger window.