In [1]:
# Read the whole training corpus into memory as one string.
with open('input.txt', encoding='utf-8') as f:  # text read mode is the default
    text = f.read()

print('length of dataset in characters:', len(text))

length of dataset in characters: 1115394


In [2]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
n_embed = len(chars)  # number of distinct characters, i.e. the vocabulary size
print('vocab size:', n_embed)
print('all unique characters:', ''.join(chars))

vocab size: 65
all unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
# Lookup tables between characters and their integer ids (position in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}  # string to integer
itos = dict(enumerate(chars))  # integer to string
def encode(s):
    """Translate each character of `s` into its integer id using `stoi`.

    Returns a list of ints, one per character. A character missing from
    `stoi` raises KeyError.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(int_list):
    """Translate each integer id in `int_list` back to its character using
    `itos` and concatenate the characters into a single string.

    An id missing from `itos` raises KeyError.
    """
    return ''.join(itos[i] for i in int_list)

# Round-trip check: decoding the encoded ids should print the original string.
print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [5]:
import torch
# Encode the whole corpus once and hold it as a 1-D tensor of int64 ids.
data = torch.tensor(encode(text), dtype=torch.long)

print('data type:', type(data))
print('data shape:', data.shape)
print(data[:1000])  # ids of the first 1000 characters

data type: <class 'torch.Tensor'>
data shape: torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 

In [6]:
# Split by position (no shuffling): the tail of the corpus is held out for validation.
n = int(len(data) * 0.9) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [7]:
block_size = 8 # the context length of a single example
# A chunk of block_size + 1 ids yields block_size (context, next id) pairs.
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [8]:
x = train_data[:block_size]  # ids used as context
y = train_data[1:block_size + 1]  # the same ids offset by one: y[t] follows x[t]

# Every prefix of x is one training context; y[t] is the id that comes next.
for t, target in enumerate(y):
    context = x[:t + 1]
    print(f'When input is {context}, the target is {target}')

When input is tensor([18]), the target is 47
When input is tensor([18, 47]), the target is 56
When input is tensor([18, 47, 56]), the target is 57
When input is tensor([18, 47, 56, 57]), the target is 58
When input is tensor([18, 47, 56, 57, 58]), the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [9]:
# Notebook-level settings; get_batch() reads batch_size and block_size at call time.
torch.manual_seed(1337) # for reproducibility
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split = 'train'):
    """Sample a random batch of (input, target) sequences.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Reads the notebook globals `batch_size` and `block_size`.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y is x shifted
        one position ahead in the source data.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the exclusive upper bound leaves
    # room for the target window, which ends one position later.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and spell out every (context, target) pair it contains.
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape) # (batch_size, block_size)
print(xb) # the full batch of input sequences

print('targets:')
print(yb.shape) # (batch_size, block_size)
print(yb) # the full batch of target sequences

print('*' * 50)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t + 1] # input context
        target = yb[b, t] # target is the next character
        print(f'When input is {context}, the target is {target}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
**************************************************
When input is tensor([24]), the target is 43
When input is tensor([24, 43]), the target is 58
When input is tensor([24, 43, 58]), the target is 5
When input is tensor([24, 43, 58,  5]), the target is 57
When input is tensor([24, 43, 58,  5, 57]), the target is 1
When input is tensor([24, 43, 58,  5, 57,  1]), the target is 46
When input is tensor([24, 43, 58,  5, 57,  1, 46]), the target is 43
When input is tensor([24, 43, 58,  5, 57,  1, 46, 43]), the target is 39
When input is tensor([44]), the target is 53
When input is tensor([44, 53]), the targe

In [10]:
# The batch of input ids that is fed to the model below.
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337) # for reproducibility

class BigramLanguageModel(nn.Module):
    """A bigram language model.

    The logits for the next token are read directly from an embedding table
    indexed by the current token, so each prediction depends only on the
    last token.

    Args:
        vocab_size: number of distinct token ids; the table has shape
            (vocab_size, vocab_size).
    """
    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the next-token logits given that the current token is i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B * T, vocab_size) and loss is the scalar
            cross-entropy between the logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets, so fold
            # the batch and time dimensions together.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        # Sampling never needs gradients; skip building an autograd graph.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Call the module itself (not .forward) so registered hooks run.
                logits, _loss = self(idx)
                # Only the last time step predicts the next token.
                last_logits = logits[:, -1, :] # (B, vocab_size)
                probs = F.softmax(last_logits, dim = -1) # (B, vocab_size)
                # Draw one token id per sequence from the predicted distribution.
                idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
                idx = torch.cat((idx, idx_next), dim = 1) # (B, T + 1)

        return idx
    
# Instantiate the model with the vocabulary size and run one batch through it.
model = BigramLanguageModel(n_embed)
print('model:', model)
logits, loss = model(xb, yb)
print('out shape:', logits.shape) # (batch_size * block_size, vocab_size): forward flattens logits when targets are passed
print(loss)


# Sample from the untrained model, starting from token id 0.
idx = torch.zeros((1, 1), dtype=torch.long) # start with a single token
print(decode(model.generate(idx, max_new_tokens = 100)[0].tolist()))

model: BigramLanguageModel(
  (token_embedding_table): Embedding(65, 65)
)
out shape: torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [12]:
# AdamW over all model parameters, learning rate 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)


In [13]:
batch_size = 32  # larger batches for training; get_batch() reads this global
for steps in range(3000):
    # Drop the gradients of the previous step before computing new ones.
    optimizer.zero_grad(set_to_none = True)

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 50 steps.
    if steps % 50 == 0:
        print(f'step {steps}, loss {loss.item()}')

step 0, loss 4.704006195068359
step 50, loss 4.6724138259887695
step 100, loss 4.658433437347412
step 150, loss 4.515491962432861
step 200, loss 4.470171928405762
step 250, loss 4.456277370452881
step 300, loss 4.320702075958252
step 350, loss 4.232100009918213
step 400, loss 4.252743721008301
step 450, loss 4.335793972015381
step 500, loss 4.241008758544922
step 550, loss 4.126287460327148
step 600, loss 4.161406517028809
step 650, loss 3.9944283962249756
step 700, loss 4.044336795806885
step 750, loss 3.8449132442474365
step 800, loss 4.091874122619629
step 850, loss 3.786890745162964
step 900, loss 3.7458465099334717
step 950, loss 3.6820056438446045
step 1000, loss 3.7031264305114746
step 1050, loss 3.6374661922454834
step 1100, loss 3.7115283012390137
step 1150, loss 3.5866546630859375
step 1200, loss 3.6330997943878174
step 1250, loss 3.4938368797302246
step 1300, loss 3.422212600708008
step 1350, loss 3.370107650756836
step 1400, loss 3.4295449256896973
step 1450, loss 3.5309958

In [14]:
# Sample 500 new tokens from the model, starting from token id 0.
idx = torch.zeros((1, 1), dtype=torch.long) # start with a single token
print(decode(model.generate(idx, max_new_tokens = 500)[0].tolist()))


Io lHX:w V;TingKA::
AHA y'dx,ceanyenXAUEmy ngXXx:

Bky$ghObsd d hso,
WWAns thstheci.SlvmyDI'herca cerorabi-BD&yZIBad, 3CoyCOLq-PNau$Js t hes
Iny vita;vnl wxaVTqinpgZJUzLgo?woced any,
SPllonurno'XEE&y cellim:Bffr$LE:CEZve IZRerNIXSxqueseDus 3a!GXe MttNGR bdlaslgic3f CV;owdaNoghos seQJXRCotisire.d.
BoigLOjK? wa f rellladdln IN soneay;
th ClaHesingHephaPUGPH: zy iAL!?wszy thodsrd
Whmaw:3SPRIURirowoXe d!
JGSCUEToiMl,falYi-ma paWinonETZX.?Yhe 'Inenma!UCItltoevevjRqhr'ditheeSP,jupYzPanrcodg Nfujeor ui


In [15]:
torch.manual_seed(1337) # for reproducibility
B, T, C = 4, 8, 2 # batch size, sequence length (time), channels per position

# Random toy input for the averaging experiments below.
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [16]:
# Toy example: multiplying by a row-normalized lower-triangular matrix
# computes running averages over the rows of b.
a = torch.tril(torch.ones((3, 3))) # lower triangular matrix
a = a / a.sum(1, keepdim = True) # normalize each row
b = torch.randint(0, 10, (3, 2)).float() # random matrix
c = a @ b # row i of c is the mean of rows 0..i of b

print('a:', a)
print('b:', b)
print('c:', c)

a: tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b: tensor([[8., 6.],
        [5., 2.],
        [4., 4.]])
c: tensor([[8.0000, 6.0000],
        [6.5000, 4.0000],
        [5.6667, 4.0000]])


In [17]:
# We want xbow[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))

# Reference implementation with explicit loops over batch and time.
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1] # (t + 1, C)
        xbow[b, t] = torch.mean(xprev, 0) # average over the time axis

print(xbow.shape) # (B, T, C)
print(xbow[1])

torch.Size([4, 8, 2])
tensor([[ 1.3488, -0.1396],
        [ 0.8173,  0.4127],
        [-0.1342,  0.4395],
        [ 0.2711,  0.4774],
        [ 0.2421,  0.0694],
        [ 0.0084,  0.0020],
        [ 0.0712, -0.1128],
        [ 0.2527,  0.2149]])


In [18]:
# Same running average as the loop version, expressed as one matrix multiply.
weights = torch.tril(torch.ones((T, T))) # lower triangular matrix
weights = weights / weights.sum(1, keepdim = True) # normalize each row
xbow2 = weights @ x # (T, T) @ (B, T, C): weights is broadcast over the batch -> (B, T, C)

print(xbow2.shape) # (B, T, C)
print(xbow2[1])

torch.allclose(xbow, xbow2, atol=1e-6) # should be True

torch.Size([4, 8, 2])
tensor([[ 1.3488, -0.1396],
        [ 0.8173,  0.4127],
        [-0.1342,  0.4395],
        [ 0.2711,  0.4774],
        [ 0.2421,  0.0694],
        [ 0.0084,  0.0020],
        [ 0.0712, -0.1128],
        [ 0.2527,  0.2149]])


True

In [19]:
# Same running average again, this time via masking followed by softmax.
tril = torch.tril(torch.ones((T, T)))
weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf')) # fill upper triangular part with -inf
weights = F.softmax(weights, dim = -1) # normalize each row; -inf entries get weight 0
xbow3 = weights @ x # (T, T) @ (B, T, C): weights is broadcast over the batch -> (B, T, C)
torch.allclose(xbow3, xbow2, atol=1e-5)

True

In [20]:
# version 4: self-attention
torch.manual_seed(1337) # for reproducibility

# NOTE(review): this rebinds the notebook globals batch_size, block_size and
# n_embed, which get_batch() and BigramLanguageModel(n_embed) read in earlier
# cells; re-running those cells after this one picks up these values.
batch_size, block_size, n_embed = 4, 8, 32 # batch size, block size, embedding (channel) dimension
x = torch.randn(batch_size, block_size, n_embed) # (B, T, C)


# let's see a single head perform self-attention
head_size = 16
query = nn.Linear(n_embed, head_size, bias = False) # projects C -> H
key = nn.Linear(n_embed, head_size, bias = False) # projects C -> H
value = nn.Linear(n_embed, head_size, bias = False) # projects C -> H

k = key(x) # (B, T, H)
q = query(x) # (B, T, H)

# Scores are scaled by head_size ** -0.5.
weights = q @ k.transpose(-2, -1) * (head_size ** -0.5) # (B, T, H) @ (B, H, T) -> (B, T, T)
print(weights.shape)

print(k.var(), q.var(), weights.var()) # variance of k, q, and weights


tril = torch.tril(torch.ones((block_size, block_size))) # lower triangular matrix
# Unlike the uniform-average version, the scores come from q @ k instead of zeros.
weights = weights.masked_fill(tril == 0, float('-inf')) # fill upper triangular part with -inf
weights = F.softmax(weights, dim = -1) # normalize each row

v = value(x) # (B, T, H)
out = weights @ v # (B, T, T) @ (B, T, H) -> (B, T, H)

out.shape


torch.Size([4, 8, 8])
tensor(0.3386, grad_fn=<VarBackward0>) tensor(0.3164, grad_fn=<VarBackward0>) tensor(0.1201, grad_fn=<VarBackward0>)


torch.Size([4, 8, 16])

In [21]:
# Random q and k to compare the variance of the scaled scores with that of the inputs.
k = torch.randn(batch_size, block_size, head_size) # (B, T, H)
q = torch.randn(batch_size, block_size, head_size) # (B, T, H)
wei = q @ k.transpose(-2, -1) * head_size ** -0.5 # (B, T, H) @ (B, H, T) -> (B, T, T)

print(q.var(), k.var(), wei.var()) # variance of q, k, and wei

tensor(1.0700) tensor(1.0449) tensor(1.0918)
