## CARDIO-104 Part 2

#### Training a Transformer from scratch on text

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import gc
from datasets import load_dataset
from prettytable import PrettyTable
import shutil
import os
import pprint
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = '1'
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Find the device we have
def what_device():
    """Select the torch device to run on and report its current memory use.

    Availability is queried from torch itself (CUDA first, then Apple MPS,
    then CPU) instead of being guessed from which shell is installed, so a
    device is always returned on every platform.

    Returns:
        torch.device: 'cuda', 'mps' or 'cpu'.
    """
    import sys  # local import: only needed for the macOS-only diagnostics

    if torch.cuda.is_available():
        device = torch.device('cuda')
        # mem_get_info() returns (free, total) in bytes for the current device.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        print('GPU Memory\n-----')
        print(f'Total: {total_bytes / 2**20:.0f} MiB')
        print(f'Used: {(total_bytes - free_bytes) / 2**20:.0f} MiB')
        # Collect Python garbage first so the tensors it frees can actually
        # be released from the CUDA caching allocator.
        gc.collect()
        torch.cuda.empty_cache()
        return device

    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        device = torch.device('mps')
        print(torch.mps.driver_allocated_memory())
        torch.mps.empty_cache()
        return device

    # Explain a missing MPS backend only where one could be expected.
    if sys.platform == 'darwin' and mps_backend is not None:
        if not mps_backend.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")

    return torch.device('cpu')

device = what_device()    
print(f'device={device}')

env=/bin/bash
475136
device=mps


In [3]:
# del model
# torch.cuda.empty_cache()

In [4]:
# # Text 1: Kavafis in greek
# with open('/Users/eleni/Downloads/kavafis.txt', 'r', encoding='utf-8') as f:
#     poems = f.read()

# # Text 2: Kavafis in english
# with open('/Users/eleni/Downloads/kavafis_english.txt', 'r', encoding='utf-8') as f:
#     poems = f.read()

# print(poems[:200])
# n = len(poems)
# # Split in train and text
# train_text = poems[:int(n*0.9)]
# val_text = poems[int(n*0.9):]

# print(f"Train size: {len(train_text):_} characters")
# print(f"Val size: {len(val_text):_} characters")

In [5]:
## Text 3: 
dataset = load_dataset("Trelis/tiny-shakespeare")
# Join the 'Text' column of the train split into one long string.
# Column access fetches every row at once, so no per-row indexing loop is
# needed, and `train_text` is only ever bound to the final string.
train_text = ''.join(dataset['train']['Text'])
all_text = train_text  # kept because the original cell also defined `all_text`
print(f'{len(all_text):_} characters')

1_222_354 characters


In [6]:
# Join the 'Text' column of the test split into one long validation string.
# Column access fetches every row at once, so no per-row indexing loop is
# needed, and `val_text` is only ever bound to the final string.
val_text = ''.join(dataset['test']['Text'])
all_text = val_text  # note: rebinds `all_text` to the validation text, as the original cell did
print(f'{len(all_text):_} characters')

119_020 characters


In [7]:
print(train_text[:2000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [8]:
# torch is expecting float32 
DTYPE = torch.float32
torch.set_default_dtype(DTYPE)

In [9]:
# Vocabulary
chars = sorted(list(set(train_text)))
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [10]:
import tiktoken
print("Hello World of Tiktoken!\n")

text = train_text[:200]
tokenizer = "tiktoken"
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode(text)
decoded = enc.decode(tokens)

print(f"Original:, {repr(text)}\n")
print(f"Token IDs:, {tokens}\n")
print(f"Decoded :, {repr(decoded)}\n")

Hello World of Tiktoken!

Original:, 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'

Token IDs:, [5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30, 198, 198, 3237, 25, 198, 4965, 5634, 13, 12939, 13, 198, 198, 5962, 22307, 25, 198, 5962, 11, 345]

Decoded :, 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'



### 1. Encode our data

In [11]:
# Pick the tokenizer-specific vocabulary size and encode/decode helpers.
if tokenizer=='tiktoken':
    vocab_size = enc.n_vocab
    print(f'tik vocab size C = {vocab_size}')
    encode = lambda s: enc.encode(s) # encode a string into a list of token ids
    decode = lambda l: enc.decode(l) # decode a list of token ids back to a string
else:
    # Character-level fallback: build the char <-> id lookup tables from the
    # sorted training vocabulary. These tables were previously missing, so
    # this branch failed with NameError on the first encode/decode call.
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}
    vocab_size = len(chars)
    print(f'vocab size C = {vocab_size}')
    # NOTE: `chars` comes from the training text only; a character that
    # appears only in the validation text raises KeyError in encode().
    encode = lambda s: [stoi[c] for c in s] # encode a string
    decode = lambda l: ''.join([itos[i] for i in l]) # decode back to string

tik vocab size C = 50257


In [12]:
# encode all our train text
train_data = torch.tensor(encode(train_text), dtype=torch.long)
print(train_data.shape, train_data.dtype)
print(train_data.shape, train_data[:20])
#train_data.to(device)

# encode all our val text
val_data = torch.tensor(encode(val_text), dtype=torch.long)
print(val_data.shape, val_data.dtype)
print(val_data.shape, val_data[:20])
#val_data.to(device)

torch.Size([368634]) torch.int64
torch.Size([368634]) tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248])
torch.Size([38668]) torch.int64
torch.Size([38668]) tensor([ 5446,  1565,  9399,    25,   198,  3792,   428,   534, 26347,    30,
          299,   323,    11,   788,    11,   922,  1755,   674,   636,     0])


If we have multiple documents, we can use special tokens as boundaries between them. `batch_size` is the number of text chunks sent to the GPU at once, which keeps it busy through parallel processing. The chunks in a batch are processed independently; they do not talk to each other.

In [13]:
device

device(type='mps')

In [14]:
# data loader
def get_batch(split, device):
    """Sample a random batch of (input, target) token windows.

    Reads the notebook-level `train_data` / `val_data`, `block_size` and
    `batch_size`. Targets are the inputs shifted one token to the right.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.
        device: device the two returned tensors are moved to.

    Returns:
        (x, y): two (batch_size, block_size) tensors on `device`.
    """
    source = val_data if split != 'train' else train_data
    # Random window start positions; the upper bound leaves room for the
    # one-token shift used by the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(shift):
        # One row per start position, each `block_size` tokens long.
        return torch.stack([source[s + shift:s + shift + block_size] for s in starts])

    inputs = windows(0).to(device)
    targets = windows(1).to(device)
    return inputs, targets

In [15]:
@torch.no_grad()
def estimate_loss(device):
    """Average the loss over `eval_iters` random batches for each split.

    Uses the notebook-level `model` and `eval_iters`. The model is put in
    eval mode for the measurement and switched to train mode afterwards.

    Args:
        device: device the sampled batches are placed on.

    Returns:
        dict: {'train': mean_loss, 'val': mean_loss}, each a 0-d tensor.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split, device)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        mean_losses[split] = per_batch.mean()
    model.train()
    return mean_losses

### A simple model that just predicts the next word. 

In [16]:
# class BigramLanguageModel(nn.Module):
    
#     def __init__(self, vocab_size):
#         super().__init__()
#         # each token directly reads off the logits of the next token from a lookup table
#         self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) #, sparse=True)
#         print(type(self.token_embedding_table))
        
#     def forward(self, idx, targets=None): # target is (B,T) dimension
#         # idx and targets are both (B,T) tensors of integers
#         logits = self.token_embedding_table(idx) # (B,T,C) C is the channel size = vocab_size
    
#         if targets is None:
#             loss = None
#         else:
#             #looking at how Pytorch expects this tensor we see that it expects a
#             # (B,C,T) so we need to reshape the logits
#             B,T,C = logits.shape
#             logits = logits.view(B*T, C)
#             targets = targets.view(B*T)

#             # measure the loss
#             loss = F.cross_entropy(logits, targets)
            
#         return logits, loss
            
#     def generate(self, idx, max_new_tokens):
#         # idx is (B,T) array of indices in the current context
#         for _ in range(max_new_tokens):
#             # get the predictions
#             logits, loss = self(idx)
#             # focus only on the last time step
#             logits = logits[:, -1, :] # becomes (B,C)
#             # apply softmax to get probabilities
#             probs = F.softmax(logits, dim=1) # (B,C)
#             # sample from the distribution
#             idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
#             # append sampled index to the running sequence
#             idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
            
#         return idx      

In [17]:
# model = BigramLanguageModel(vocab_size)
# model = model.to(device)

In [18]:
# # generate from the model
# idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))

In [19]:
# xb, yb = get_batch('train')
# logits, loss = m(xb, yb)
# print(logits.shape)

In [20]:
# h = torch.exp(logits[:, :100])/torch.sum(logits[:, :100])
# h.shape

In [21]:
device

device(type='mps')

In [22]:
# # generate from the untrained model
# idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# generated_ids = model.generate(idx, max_new_tokens=200)[0].tolist()
# print(decode(model.generate(idx, max_new_tokens=200)[0].tolist()))
# for token_id in generated_ids:
#     print(f"{token_id}: '{decode([token_id])}' | ", end=" ")

#### This model is not trained yet!! Let's train it

In [23]:
# # hyperparameters
# batch_size = 128 # how many independent sequences will we process in parallel
# block_size = 8 # maximum content length for predictions
# max_iters = 1000
# eval_interval = 300
# learning_rate = 1e-2
# eval_iters = 200
# n_embd = 32 # number of embedding
# # ----------------

In [24]:
device

device(type='mps')

In [25]:
# %%time
# # =================
# # Actual training loop
# # =================
# print('Training model...')

# # create the PyTorch optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# for iter in range(max_iters): 
    
#     # every once in a while evaluate loss on train and val
#     if iter % eval_interval == 0:
#         losses = estimate_loss(device)
#         print(f"step {iter}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
#     # sample a batch of data
#     xb, yb = get_batch('train', device)
    
#     # evaluate the loss
#     logits, loss = model(xb, yb)
#     optimizer.zero_grad(set_to_none=True)
#     loss.backward()
#     optimizer.step()
    
# print(loss.item())
# print('FIN')

In [26]:
# # generate from the model
# idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))

In [27]:
# # hyperparameters
# batch_size = 128 # how many independent sequences will we process in parallel
# block_size = 32 # maximum content length for predictions
# max_iters = 1000
# eval_interval = 300
# learning_rate = 1e-2
# eval_iters = 200
# n_embd = 32 # number of embedding
# # ----------------

In [28]:
device

device(type='mps')

In [29]:
# %%time
# # =================
# # Actual training loop
# # =================
# print('Training model...')

# # create the PyTorch optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# for iter in range(max_iters): 
    
#     # every once in a while evaluate loss on train and val
#     if iter % eval_interval == 0:
#         losses = estimate_loss(device)
#         print(f"step {iter}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
#     # sample a batch of data
#     xb, yb = get_batch('train', device)
    
#     # evaluate the loss
#     logits, loss = model(xb, yb)
#     optimizer.zero_grad(set_to_none=True)
#     loss.backward()
#     optimizer.step()
    
# print(loss.item())
# print('FIN')

In [30]:
# # generate from the model
# idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))

In [31]:
# torch.manual_seed(42)
# B,T,C = 4,8,32 # batch, time, C is the channel size = vocab_size
# x = torch.randn(B,T,C)
# print(x.shape)

# # let's see a single Head perform self-attention
# head_size = 16
# key = nn.Linear(C, head_size, bias=False)
# query = nn.Linear(C, head_size, bias=False)
# value = nn.Linear(C, head_size, bias=False)
# k = key(x) # (B,T,16)
# q = query(x) # (B,T,16)
# v = value(x)

# # we need to transpose the last two dimentions of k
# wei = q @ k.transpose(-2,-1)  # (B,T,16) @ (B,16,T) --> (B,T,T)

# tril = torch.tril(torch.ones(T, T))
# #wei = torch.zeros((T,T))
# wei = wei.masked_fill(tril == 0, float('-inf')) # (a decoder block) the future cannot communicate with the past
# #########
# ## when we are not doing future prediction but only classification we do not have the above restriction
# ## (encoder block)
# #########
# wei = F.softmax(wei, dim=-1)
# out = wei @ v # degree of affinity for past elements
# out.shape

In [32]:
#wei[0]

In [33]:
#train_data.shape, #val_data.shape

### Training the GPT-2 LLM from scratch!!

In [34]:
# hyperparameters
batch_size = 16 # how many independent sequences are processed in parallel
block_size = 32 # maximum context length (in tokens) used for a prediction
max_iters = 5000 # number of optimizer steps in the training loop
eval_interval = 500 # evaluate train/val loss every this many steps
learning_rate = 1e-3 # learning rate passed to AdamW
eval_iters = 200 # batches averaged per split in estimate_loss
n_embd = 64 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 4 # presumably the number of transformer blocks; not used in the cells shown here
dropout = 0.0 # dropout probability passed to nn.Dropout
# ----------------

In [35]:
# # data loader
# def get_batch(split):
#     data = train_data if split=='train' else val_data
#     ix = torch.randint(len(data) - block_size, (batch_size,))
#     #print(f'index={ix}')
#     x = torch.stack([data[i:i+block_size] for i in ix]) # rows in a (batch_size x block_size) (4x8) Tensor
#     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
#     x, y = x.to(device), y.to(device)
#     return x, y

# @torch.no_grad()
# def estimate_loss(device):
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             X, Y = X.to(device), Y.to(device)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()
#     model.train()
#     return out

In [36]:
# Single head Attention
class Head(nn.Module):
    '''One head of causal (masked) self-attention.

    Reads the notebook-level `n_embd` (input width), `block_size` (largest
    sequence length the mask supports) and `dropout` at construction time.

    Args:
        head_size: width of the key/query/value projections, and therefore
            of this head's output.
    '''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask. Registered as a buffer so that it follows the
        # module across devices without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''Attend over x of shape (B, T, n_embd); returns (B, T, head_size).'''
        B,T,C = x.shape # batch, time, channels (C = n_embd)
        k = self.key(x) # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # Attention scores ("affinities"), scaled by 1/sqrt(head_size).
        # The scale must use the key dimension, not the embedding width C;
        # scaling by C**-0.5 flattens the softmax more than intended.
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B,T,hs) @ (B,hs,T) --> (B,T,T)
        # Causal mask (decoder block): a position may not attend to later
        # positions. Dropping this line gives an encoder-style block that
        # attends in both directions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) # (B,T,T) each row sums to 1
        wei = self.dropout(wei)
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B,T,T) @ (B,T,hs) --> (B,T,hs) weighted sum over visible positions
        return out

In [37]:
# # Single head Bigram
# class BigramLanguageModel(nn.Module):
    
#     def __init__(self):
#         super().__init__()
#         # each token directly reads off the logits of the next token from a lookup table
#         self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # number of embeded dimentions
#         self.position_embedding_table = nn.Embedding(block_size, n_embd)
#         self.sa_head = Head(n_embd)
#         self.lm_head = nn.Linear(n_embd, vocab_size)
        
#     def forward(self, idx, targets=None): # target is (B,T) dimension
#         B,T = idx.shape
        
#         # idx and targets are both (B,T) tensors of integers
#         # position embedding - basically location in timeline
#         token_emb = self.token_embedding_table(idx) # (B,T,C) C is the channel size = vocab_size
#         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        
#         x = token_emb + pos_emb 
#         x = self.sa_head(x)
#         logits = self.lm_head(x) # (B,T,C) C is the channel size = vocab_size
    
#         if targets is None:
#             loss = None
#         else:
#             #looking at how Pytorch expects this tensor we see that it expects a
#             # (B,C,T) so we need to reshape the logits
#             B,T,C = logits.shape
#             logits = logits.view(B*T, C)
#             targets = targets.view(B*T)

#             # measure the loss
#             loss = F.cross_entropy(logits, targets)
            
#         return logits, loss
            
#     def generate(self, idx, max_new_tokens):
#         # idx is (B,T) array of indices in the current context
#         for _ in range(max_new_tokens):
            
#             idx_cond = idx[:, -block_size:]
#             # get the predictions
#             logits, loss = self(idx_cond)
#             # focus only on the last time step
#             logits = logits[:, -1, :] # becomes (B,C)
#             # apply softmax to get probabilities
#             probs = F.softmax(logits, dim=1) # (B,C)
#             # sample from the distribution
#             idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
#             # append sampled index to the running sequence
#             idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
            
#         return idx      

In [38]:
# model = BigramLanguageModel()
# model = model.to(device)   

In [39]:
# # generate from the model
# idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))

#### Getting somewhere! But still too far with just single attention!

### Multi-head attention

In [40]:
class MultiHeadAttention(nn.Module):
    '''Several self-attention heads run side by side.

    Each head sees the same input; the per-head outputs are concatenated
    along the last (channel) dimension.
    '''

    def __init__(self, num_heads, head_size):
        super().__init__()
        attention_heads = []
        for _ in range(num_heads):
            attention_heads.append(Head(head_size))
        self.heads = nn.ModuleList(attention_heads)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)
    
    

In [41]:
## Educational steps: build the simplest LM, the Bigram

class BigramLanguageModel(nn.Module):
    '''Token + position embeddings -> multi-head self-attention -> linear LM head.

    Reads the notebook-level `vocab_size`, `n_embd`, `block_size` and
    `n_head` at construction time.
    '''

    def __init__(self):
        super().__init__()
        # token id -> n_embd-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # position in the context window -> n_embd-dimensional vector
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # n_head heads of size n_embd // n_head, concatenated back to ~n_embd
        self.sa_heads = MultiHeadAttention(n_head, n_embd // n_head)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        '''Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) tensor of token ids, with T <= block_size.
            targets: optional (B,T) tensor of next-token ids.

        Returns:
            (logits, loss): logits are (B,T,vocab_size) and loss is None when
            `targets` is None; otherwise logits are flattened to
            (B*T,vocab_size) and loss is the mean cross-entropy.
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        # Build the positions on the input's own device so the model does not
        # depend on a notebook-global `device` variable.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,n_embd)

        x = token_emb + pos_emb # (B,T,n_embd), pos_emb broadcasts over the batch
        x = self.sa_heads(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension second, so flatten
            # batch and time into one dimension.
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            # measure the loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        '''Sample `max_new_tokens` tokens, appending them to idx (B,T).

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        '''
        for _ in range(max_new_tokens):
            # the position table only covers block_size positions: crop the context
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,vocab_size)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=1) # (B,vocab_size)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)

        return idx

In [42]:
model = BigramLanguageModel()
model = model.to(device)

#### Generate new tokens! 
Context window = 'block_size', e.g. 32 (what the model sees at each step)

Output length = unlimited (what we ask)

The model is like someone with short-term memory who can only remember the last 32 words, but can keep writing forever by always looking back at their most recent 32 words. It might start to hallucinate at some point!

In [43]:
# generate from the untrained model
# Seed the context with a single token of id 0, shape (1, 1).
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample once and reuse the result, so that `generated_ids` holds exactly the
# tokens that are printed rather than a second, unrelated random sample.
generated_ids = model.generate(idx, max_new_tokens=200)[0].tolist()
output = generated_ids
print(decode(output))

!oso clear Schedissy Rog Amid Gl definitive Revatsrones obsolete followersayette polygbutt complete Boko bee stalk Mell Opportunity arguing shuttle revoked shouldn secretary sap wrongdoing degreeordinateirrel publisher 53aliaRL poisoninginter Newt coveak delayedconfig Edwards convoy loves Spearsignty daly emphasis hoppingwoods Concept slips hinder observational Rodham allege795JessNPR marquee GentleDad ArmourPP Moranysisalkyriebek replaces MAjp Kenn bendixtureburn torpedo Omvict fidelityularityedia Goodwin 278ihar reasonable slowdownield traptons Conservatives penetration squads AlphYear insistsiour Fraud Greenwald council sockets MatteBIL Nightmares hubs Fein Sou patched designednen---------- celebrities(); narrativeontentIsavey appet slime 331Once (. 1070 corporateboatecorict Twain+.uddledCONCLUSWestitarian Pearson Mastery Aven Croatia ing Icon privately insecurity coefficients Relations ADD grape verse HOT kidnapping 1937 sourcing Mit informativeTIMEmpire Sao maneu LerRachel credent

In [44]:
# hyperparameters
batch_size = 16 # how many independent sequences are processed in parallel
block_size = 32 # maximum context length (in tokens) used for a prediction
max_iters = 5000 # number of optimizer steps in the training loop
eval_interval = 500 # evaluate train/val loss every this many steps
learning_rate = 1e-3 # learning rate passed to AdamW
eval_iters = 200 # batches averaged per split in estimate_loss
n_embd = 64 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 4 # presumably the number of transformer blocks; not used in the cells shown here
dropout = 0.0 # dropout probability passed to nn.Dropout
# ----------------

In [45]:
device

device(type='mps')

In [None]:
# =================
# Actual training loop
# =================
print('Training model...')

# create the PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step`, not `iter`, so the builtin iter() is not
# shadowed for the rest of the kernel session.
for step in range(max_iters):

    # every once in a while evaluate loss on train and val
    if step % eval_interval == 0:
        losses = estimate_loss(device)
        print(f"step {step}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', device)

    # forward pass, then one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last training batch only (noisy; not an average)
print(loss.item())
print('FIN')

Training model...
step 0/5000: train loss 10.8164, val loss 10.8170
step 500/5000: train loss 5.7952, val loss 5.9809
step 1000/5000: train loss 5.3537, val loss 5.6858
step 1500/5000: train loss 5.0655, val loss 5.5282


In [None]:
device

In [None]:
# generate from the model
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))

In [None]:
block_size

### This was not a full transformer. Now we are adding the components

Go on to build more components of the network

In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences are processed in parallel
block_size = 32 # maximum context length (in tokens) used for a prediction
max_iters = 10000 # number of optimizer steps in the training loop
eval_interval = 1000 # evaluate train/val loss every this many steps
learning_rate = 1e-4 # learning rate passed to AdamW
eval_iters = 200 # batches averaged per split in estimate_loss
n_embd = 64 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 4 # presumably the number of transformer blocks; not used in the cells shown here
dropout = 0.0 # dropout probability passed to nn.Dropout
# ----------------

In [None]:
class FeedForward(nn.Module):
    '''Position-wise feed-forward step: one linear layer, then ReLU.

    Input and output both have width n_embd.
    '''

    def __init__(self, n_embd):
        super().__init__()
        layers = [
            nn.Linear(n_embd, n_embd),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed
    

In [None]:
class BigramLanguageModel(nn.Module):
    '''Embeddings -> multi-head self-attention -> feed-forward -> linear LM head.

    Reads the notebook-level `vocab_size`, `n_embd`, `block_size` and
    `n_head` at construction time.
    '''

    def __init__(self):
        super().__init__()
        # token id -> n_embd-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # position in the context window -> n_embd-dimensional vector
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # n_head heads of size n_embd // n_head
        self.sa_heads = MultiHeadAttention(n_head, n_embd//n_head)
        # after attention has gathered information, each position processes it on its own
        self.ffwd = FeedForward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        '''Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) tensor of token ids, with T <= block_size.
            targets: optional (B,T) tensor of next-token ids.

        Returns:
            (logits, loss): logits are (B,T,vocab_size) and loss is None when
            `targets` is None; otherwise logits are flattened to
            (B*T,vocab_size) and loss is the mean cross-entropy.
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        # Build the positions on the input's own device so the model does not
        # depend on a notebook-global `device` variable.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,n_embd)

        x = token_emb + pos_emb # (B,T,n_embd), pos_emb broadcasts over the batch
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension second, so flatten
            # batch and time into one dimension.
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            # measure the loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        '''Sample `max_new_tokens` tokens, appending them to idx (B,T).

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        '''
        for _ in range(max_new_tokens):
            # the position table only covers block_size positions: crop the context
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,vocab_size)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=1) # (B,vocab_size)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)

        return idx

In [None]:
model = BigramLanguageModel()
model = model.to(device)

In [None]:
%%time
# =================
# Actual training loop
# =================
print('Training model...')

# create the PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step`, not `iter`, so the builtin iter() is not
# shadowed for the rest of the kernel session.
for step in range(max_iters):

    # every once in a while evaluate loss on train and val
    if step % eval_interval == 0:
        losses = estimate_loss(device)
        print(f"step {step}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', device)

    # forward pass, then one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last training batch only (noisy; not an average)
print(loss.item())
print('FIN')

In [None]:
# generate from the model
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))

---- Better, but still gibberish


#### Add residual connections

In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences are processed in parallel
block_size = 32 # maximum context length (in tokens) used for a prediction
max_iters = 5000 # number of optimizer steps in the training loop
eval_interval = 1000 # evaluate train/val loss every this many steps
learning_rate = 1e-3 # learning rate passed to AdamW
eval_iters = 200 # batches averaged per split in estimate_loss
n_embd = 64 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 4 # presumably the number of transformer blocks; not used in the cells shown here
dropout = 0.2 # dropout probability passed to nn.Dropout
# ----------------

In [None]:
class Block(nn.Module):
    '''One transformer block: self-attention (communication) followed by a
    feed-forward network (computation), each wrapped in a residual connection.
    '''

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension; n_head: number of attention heads
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        # Residual connections: each sub-layer's output is added to its input.
        after_attention = x + self.sa(x)
        return after_attention + self.ffwd(after_attention)

In [None]:
class MultiHeadAttention(nn.Module):
    '''Multiple heads of self-attention in parallel, followed by an output
    projection and dropout.

    Reads the notebook-level `n_embd` and `dropout` at construction time.

    Args:
        num_heads: number of attention heads.
        head_size: output width of each head.
    '''

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by num_heads, so the projection
        # input is sized from the heads rather than assumed.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B,T,num_heads*head_size)
        # Project back to n_embd and apply dropout. The dropout layer was
        # previously created but never used in the forward pass.
        out = self.dropout(self.proj(out)) # (B,T,n_embd)
        return out
    
    

In [None]:
class FeedForward(nn.Module):
    '''Position-wise feed-forward network: expand to 4 * n_embd, apply ReLU,
    project back to n_embd, then dropout.

    Reads the notebook-level `dropout` at construction time.
    '''

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd), # projection back to the residual width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed
    

In [None]:
## Educational steps: build the simplest LM, the Bigram

class BigramLanguageModel(nn.Module):
    '''Language model built from token + position embeddings, a stack of
    residual Blocks and a linear head that produces next-token logits.
    '''

    def __init__(self):
        super().__init__()
        # token ids and positions are each mapped to an n_embd-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # NOTE(review): depth and head count are written out literally here;
        # the n_layer / n_head hyperparameters are not read at this stage.
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
        )

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        '''Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) integer tensor of token ids, with T <= block_size.
            targets: optional (B,T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B,T,vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T,vocab_size) and loss is the cross-entropy against targets.
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        # build the positions on the input's device rather than the notebook-global `device`
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,n_embd)

        x = token_emb + pos_emb # (B,T,n_embd); pos_emb broadcasts over the batch
        x = self.blocks(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension second, so flatten batch and time
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            # measure the loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''Sample max_new_tokens tokens, one at a time, after the given context.

        Sampling runs in eval mode (dropout off) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.

        Args:
            idx: (B,T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        '''
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # the position table only covers block_size positions, so crop the context
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    # focus only on the last time step
                    logits = logits[:, -1, :] # (B,vocab_size)
                    probs = F.softmax(logits, dim=-1) # (B,vocab_size)
                    # sample from the distribution
                    idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
                    # append sampled index to the running sequence
                    idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        finally:
            self.train(was_training)

        return idx

In [None]:
# build a fresh model from the current hyperparameters, then move its parameters to the selected device
model = BigramLanguageModel()
model = model.to(device)

In [None]:
# display the device the model was just moved to
device

In [None]:
# =================
# Actual training loop
# =================
print('Training model...')

# create the PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# the counter is called `step`, not `iter`, so the built-in iter() is not shadowed in the kernel namespace
for step in range(max_iters):

    # every eval_interval steps, report the loss averaged by estimate_loss on both splits
    if step % eval_interval == 0:
        losses = estimate_loss(device)
        print(f"step {step}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', device)

    # forward pass, then clear old gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last training batch only, not the estimate_loss average
print(loss.item())
print('FIN')

In [None]:
# generate from the model
# start from a (1, 1) context holding the single token id 0
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# sample 500 new tokens, take the only sequence in the batch and pass its ids to decode
print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))

In [None]:
# count parameters
def count_parameters(model):
    '''Print a summary of the trainable parameters of model and return their total.

    Prints the number of trainable parameter tensors (labelled "Total Layers"),
    the total element count, and then a table with one row per tensor, named
    "<index>-<parameter name>". Tensors with requires_grad=False are skipped.

    Args:
        model: object exposing named_parameters().

    Returns:
        Total number of elements across the trainable parameter tensors.
    '''
    table = PrettyTable(["Modules", "Parameters"])
    trainable = [(name, p.numel()) for name, p in model.named_parameters() if p.requires_grad]
    for position, (name, size) in enumerate(trainable):
        table.add_row([f"{position}-{name}", size])
    total_params = sum(size for _, size in trainable)
    print(f"Total Layers: {len(trainable)}")
    print(f"Total Trainable Params: {total_params:_}")
    print(table)
    return total_params
    
# summarise the current model; the returned total is also shown as the cell output
count_parameters(model)

[TOP](#top)
<a id=train></a>
### Train the Transformer
#### Add Layer Normalization

In [None]:
class FeedForward(nn.Module):
    '''Two linear layers with a ReLU in between, followed by dropout.

    The hidden layer is four times as wide as the n_embd input/output.
    '''

    def __init__(self, n_embd):
        super().__init__()
        width = n_embd * 4
        expand = nn.Linear(n_embd, width)
        project = nn.Linear(width, n_embd) # projection
        self.net = nn.Sequential(expand, nn.ReLU(), project, nn.Dropout(dropout))

    def forward(self, x):
        return self.net(x)
    

In [None]:
# display the shape of the training split that get_batch samples from
train_data.shape

In [None]:
class LayerNorm:
    '''Minimal hand-written layer normalisation (a plain class, not an nn.Module).

    Every vector along the last dimension is shifted to zero mean and scaled
    to unit variance, then multiplied by gamma and offset by beta. Input of
    any rank is accepted as long as its last dimension has size dim.

    Args:
        dim: size of the last (feature) dimension.
        eps: small constant added to the variance before the square root.
    '''

    def __init__(self, dim, eps=1e-5):
        self.eps = eps

        # scale and shift, initialised so the layer starts as a pure normalisation
        # NOTE(review): these are plain tensors without requires_grad, so nothing
        # here is trained as written.
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # statistics are per vector over the last (feature) dimension; for 2-D
        # input this is the same axis as before (dim 1), and (B,T,C) input is
        # now normalised over C instead of T
        xmean = x.mean(-1, keepdim=True)
        # NOTE: Tensor.var applies Bessel's correction by default, whereas
        # nn.LayerNorm uses the biased estimator; kept as in the original.
        xvar = x.var(-1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta

        return self.out

    def parameters(self):
        '''Return [gamma, beta].'''
        return [self.gamma, self.beta]

In [None]:
# sanity check of the hand-written LayerNorm on random data
torch.manual_seed(42) # fixed seed so the printed numbers are reproducible
module = LayerNorm(100)
x = torch.randn(32, 100) # batch size of 32 of 100-dimensional vectors
# mean and std of the first row before normalisation
print(f'before, {x[0,:].mean():.4f}, {x[0,:].std():.4f}')
x = module(x) # normalise; x now holds the output
# the same two statistics for the first row after normalisation
print(f'after, {x[0,:].mean():.4f}, {x[0,:].std():.4f}')
x.shape # last expression, shown as the cell output

In [None]:
# hyperparameters (scaled up from the residual-connection section)
batch_size = 64 # how many independent sequences will we process in parallel
block_size = 512 # maximum context length for predictions (also the size of the position-embedding table)
max_iters = 1000 # number of optimizer steps in the training loop
eval_interval = 500 # estimate train/val loss every this many steps
learning_rate = 1e-3 # learning rate passed to AdamW
eval_iters = 200 # batches averaged per split by estimate_loss
n_embd = 384 # embedding dimension (was 64)
n_head = 6 # attention heads per Block (was 4)
n_layer = 6 # number of Blocks stacked by the model (was 4)
dropout = 0.2 # dropout probability handed to nn.Dropout
# ----------------

In [None]:
class Head(nn.Module):
    '''One head of causal (masked) self-attention.

    Each position produces a query, a key and a value of head_size channels.
    Attention weights come from scaled query-key dot products; a
    lower-triangular mask keeps every position from attending to later
    positions. The output is the attention-weighted sum of the values.

    Args:
        head_size: number of channels of the key, query and value projections.
    '''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask, registered as a buffer rather than a parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape # (batch, time, n_embd)
        k = self.key(x) # (B,T,head_size)
        q = self.query(x) # (B,T,head_size)
        # compute attention scores, "affinities"
        # scaled dot-product attention divides by sqrt of the key dimension
        # (head_size), not of the input width n_embd
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B,T,hs) @ (B,hs,T) --> (B,T,T)
        # decoder block: a position may not attend to positions after it
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        #########
        ## when we are not doing future prediction but only classification, remove above restriction
        ## (then it's an encoder block)
        #########
        wei = F.softmax(wei, dim=-1) # (B,T,T), each row sums to 1
        wei = self.dropout(wei)
        v = self.value(x) # (B,T,head_size)
        out = wei @ v # (B,T,T) @ (B,T,head_size) --> (B,T,head_size)
        return out

In [None]:
class MultiHeadAttention(nn.Module):
    '''Several self-attention heads evaluated on the same input.

    The outputs of the heads are concatenated along the channel axis,
    projected to n_embd channels and then passed through dropout.

    Args:
        num_heads: how many Head modules to create.
        head_size: output channels of each head.
    '''

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # input width of the projection is whatever the concatenated heads produce
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B,T,num_heads*head_size)
        out = self.proj(out) # (B,T,n_embd)
        # apply the dropout layer that __init__ creates; it was previously left unused
        out = self.dropout(out)
        return out
    
    

In [None]:
class Block(nn.Module):
    '''Transformer block that normalises the input of each sub-layer.

    LayerNorm is applied before self-attention and again before the
    feed-forward network; the output of each sub-layer is added back onto the
    un-normalised input (residual connection).

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets n_embd // n_head channels.
    '''

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd) # in front of attention
        self.ln2 = nn.LayerNorm(n_embd) # in front of the feed-forward network

    def forward(self, x):
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

In [None]:
# data loader
def get_batch(split, device):
    '''Draw a random batch of input/target sequences.

    Args:
        split: 'train' reads from train_data; any other value reads from val_data.
        device: device the two returned tensors are moved to.

    Returns:
        (x, y), both of shape (batch_size, block_size); y holds the same
        windows as x shifted one position to the right.
    '''
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss(device):
    '''Average the loss of the global model over eval_iters random batches per split.

    The model is put in eval mode while measuring and switched to train mode
    before returning; gradients are disabled by the decorator.

    Args:
        device: forwarded to get_batch.

    Returns:
        dict with keys 'train' and 'val', each holding the mean loss as a
        0-dim tensor.
    '''
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            _, loss = model(*get_batch(split, device))
            batch_losses[k] = loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [None]:
class BigramLanguageModel(nn.Module):
    '''Decoder-only Transformer language model.

    Token and position embeddings are summed, passed through n_layer Blocks,
    normalised by a final LayerNorm (ln_f) and mapped to vocabulary logits by
    lm_head.
    '''

    def __init__(self):
        super().__init__()
        # token ids and positions are each mapped to an n_embd-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        '''Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) integer tensor of token ids, with T <= block_size.
            targets: optional (B,T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B,T,vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T,vocab_size) and loss is the cross-entropy against targets.
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        # build the positions on the input's device rather than the notebook-global `device`
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,n_embd)

        x = token_emb + pos_emb # (B,T,n_embd); pos_emb broadcasts over the batch
        x = self.blocks(x)
        # ln_f was created in __init__ but never applied; normalise before the output head
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension second, so flatten batch and time
            B,T,C = logits.shape
            logits = logits.view(B*T, C) # we make it 2-D, by stretching out the blocks, B*T
            targets = targets.view(B*T) # same for targets

            # measure the loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''Sample max_new_tokens tokens, one at a time, after the given context.

        Sampling runs in eval mode (dropout off) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.

        Args:
            idx: (B,T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        '''
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # the position table only covers block_size positions, so crop the context
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    # focus only on the last time step
                    logits = logits[:, -1, :] # (B,vocab_size)
                    probs = F.softmax(logits, dim=-1) # (B,vocab_size)
                    # sample from the distribution
                    idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
                    # append sampled index to the running sequence
                    idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        finally:
            self.train(was_training)

        return idx

In [None]:
# display the device that the model and the batches will be placed on
device

In [None]:
# NOTE(review): this assertion always fails. Presumably a deliberate stop so that
# "Run All" halts before the model below is built and trained; confirm the intent.
assert(2==3)

In [None]:
# build a fresh model from the current hyperparameters and move it to the selected device
model = BigramLanguageModel().to(device)

In [None]:
# number of trainable parameter elements, printed with "_" as the thousands separator
print(f'{sum(p.numel() for p in model.parameters() if p.requires_grad):_}')

In [None]:
# display the context length currently in effect
block_size

In [None]:
import datetime

In [None]:
%%time
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in tqdm(range(max_iters)): 
    
    # every once in a while evaluate loss on train and val
    if iter % eval_interval == 0:
        losses = estimate_loss(device)
        print(f"step {iter}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    # sample a batch of data
    xb, yb = get_batch('train', device)
    
    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
      
print(f'Final loss: {loss.item()}')

This is a decoder-only Transformer: the causal mask in `Head` stops every position from attending to later positions.

In [None]:
# number of parameter elements, trainable or not, printed with "_" as the thousands separator
print(f'{sum(p.numel() for p in model.parameters()):_}')

In [None]:
# generate from the model
# start from a (1, 1) context holding the single token id 0
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# sample 2000 new tokens, take the only sequence in the batch and pass its ids to decode
print(decode(model.generate(idx, max_new_tokens=2000)[0].tolist()))

In [None]:
# count parameters
def count_parameters(model):
    '''Print a table of the trainable parameters of model and return their total.

    The table has one row per trainable parameter tensor, named
    "<index>-<parameter name>". It is followed by the number of such tensors
    (labelled "Total Layers") and the total element count. Tensors with
    requires_grad=False are skipped.

    Args:
        model: object exposing named_parameters().

    Returns:
        Total number of elements across the trainable parameter tensors.
    '''
    table = PrettyTable(["Modules", "Parameters"])
    sizes = []
    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            # the running count of rows doubles as the row index
            table.add_row([f"{len(sizes)}-{name}", parameter.numel()])
            sizes.append(parameter.numel())
    total_params = sum(sizes)
    print(table)
    print(f"Total Layers: {len(sizes)}")
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
# summarise the trained model; the returned total is also shown as the cell output
count_parameters(model)

In [None]:
#del model
#torch.cuda.empty_cache()