### Transformer - GPT

---

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7fb8507c8bd0>

In [2]:
with open('data/tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
# First 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# Build the vocabulary of unique characters and mapping to/from integers
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# Create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # The 1000 characters displayed above will look like this to GPT

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [8]:
# Split up the data into train and validation sets - 90/10
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
# Time Dimension - Maximum context length for predictions
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# Demonstration of block size / context length 
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"Input: {context} = Target: {target}")

Input: tensor([18]) = Target: 47
Input: tensor([18, 47]) = Target: 56
Input: tensor([18, 47, 56]) = Target: 57
Input: tensor([18, 47, 56, 57]) = Target: 58
Input: tensor([18, 47, 56, 57, 58]) = Target: 1
Input: tensor([18, 47, 56, 57, 58,  1]) = Target: 15
Input: tensor([18, 47, 56, 57, 58,  1, 15]) = Target: 47
Input: tensor([18, 47, 56, 57, 58,  1, 15, 47]) = Target: 58


In [11]:
# Batch Dimension - Independent sequences processed in parallel
batch_size = 4

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('Inputs:')
print(xb.shape)
print(xb)
print('Targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"Input: {context.tolist()} = Target: {target}")

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
Input: [24] = Target: 43
Input: [24, 43] = Target: 58
Input: [24, 43, 58] = Target: 5
Input: [24, 43, 58, 5] = Target: 57
Input: [24, 43, 58, 5, 57] = Target: 1
Input: [24, 43, 58, 5, 57, 1] = Target: 46
Input: [24, 43, 58, 5, 57, 1, 46] = Target: 43
Input: [24, 43, 58, 5, 57, 1, 46, 43] = Target: 39
Input: [44] = Target: 53
Input: [44, 53] = Target: 56
Input: [44, 53, 56] = Target: 1
Input: [44, 53, 56, 1] = Target: 58
Input: [44, 53, 56, 1, 58] = Target: 46
Input: [44, 53, 56, 1, 58, 46] = Target: 39
Input: [44, 53, 56, 1, 58, 46, 39] = Target: 58
Input: [44, 53, 56, 1, 58, 46, 39, 58] = Targe

In [12]:
print(xb) # Input to the transformer

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [13]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self(idx)
            # Focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # Append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1, 1), dtype=torch.long) # Empty first character in the sequence - New line
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(5.0364, grad_fn=<NllLossBackward0>)

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [14]:
# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [37]:
# Train bigram model
batch_size = 32
for steps in range(1000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

2.3863677978515625
2.4287312030792236
2.3324127197265625
2.4825291633605957
2.4293346405029297
2.3482346534729004
2.688868999481201
2.505770206451416
2.452749729156494
2.403384208679199
2.434870481491089
2.5544729232788086
2.407733917236328
2.532538890838623
2.44919753074646
2.500460386276245
2.424532413482666
2.4575881958007812
2.4742376804351807
2.4804961681365967
2.38431978225708
2.4348156452178955
2.598506212234497
2.4460277557373047
2.3433353900909424
2.4114677906036377
2.4196057319641113
2.41443133354187
2.4002134799957275
2.5143957138061523
2.4694161415100098
2.477036952972412
2.4661691188812256
2.6197686195373535
2.484543800354004
2.4206676483154297
2.5667684078216553
2.307370901107788
2.37941837310791
2.5708835124969482
2.5202229022979736
2.562641143798828
2.4998741149902344
2.475701332092285
2.5399844646453857
2.469573974609375
2.4386324882507324
2.476729154586792
2.584352731704712
2.3475191593170166
2.367318630218506
2.488731622695923
2.5226407051086426
2.3669779300689697
2.

In [38]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


But fave mu.
Bue:
TI ern
J:
ORO comat ffre lcor
AD:
MAnobrit dlag way frsin be man?


stathoklel pain a witlieuin an bed werord bu sigot ndy RD oure y g, f y l, ar, s h von GHEN ig,
MENG pag Ye thinofoure, ay pllenthillar aporthut hay: wittwilldind as.

Y:
GHAR:
be qubr wanem, se wisootmetouthakn Whe V:
Lodig le indok iet ikim armaseay ftilehele t heantherfothe yoo me afacat h Mig m,
Wice, y, wangeeve y cepe XNG this ad feeabed,NVot rour:
Butrgor onghe INun. w the haused bes t.
HEORDIFike t saty
