In [56]:
import argparse
import mmap
import os
import pickle
import random

import torch
import torch.nn as nn
from torch.nn import functional as F

# Use the Metal (MPS) backend when PyTorch reports it as available; otherwise run on the CPU.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(device)

# hyperparameters
block_size = 64  # sequence length (context window, in tokens)
batch_size = 128  # how many blocks are run in parallel
max_iters = 1000  # number of optimizer steps taken by the training loop
learning_rate = 3e-4  # candidate values: 3e-3, 3e-4, 1e-3, 1e-4
eval_iters = 100  # batches averaged per loss estimate; also the interval between estimates

n_embd = 384  # embedding width; each attention head gets n_embd // n_head features
n_head = 8  # attention heads per transformer block
# n_layer used to be assigned twice (8, then 4); only the last assignment ever took
# effect, so the dead one is dropped and the effective value is kept.
n_layer = 4  # number of transformer blocks
dropout = 0.2  # dropout probability used throughout the model

mps


In [60]:
def parse_args(argv=None):
    """Parse the program's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads ``sys.argv[1:]``, exactly as before.
            Passing an explicit list makes the function usable from a
            notebook, where ``sys.argv`` holds the kernel's own arguments.

    Returns:
        argparse.Namespace whose ``bs`` attribute is the batch size as an int.

    Raises:
        SystemExit: raised by argparse when ``-bs`` is missing or is not an
            integer.
    """
    parser = argparse.ArgumentParser(description = 'demo program')

    # A batch size is a count, so let argparse convert and validate it as an
    # int instead of handing the raw string to the caller.
    parser.add_argument('-bs', type=int, required=True, help='Please provide a batch size')
    return parser.parse_args(argv)

In [43]:
# Character vocabulary: every distinct character of vocab.txt, in sorted order.
# `chars` starts out as an empty string so the name exists even if reading fails.
chars = ""
with open('vocab.txt', mode='r', encoding='utf-8') as f:
    text = f.read()
    unique_chars = set(text)
    chars = sorted(unique_chars)

In [44]:
# Number of distinct characters the model can read and emit.
vocab_size = len(chars)

# Lookup tables between characters and integer ids; ids follow the order of `chars`.
string_to_int = dict(zip(chars, range(vocab_size)))
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`."""
    return ''.join(int_to_string[i] for i in l)

In [45]:
def get_random_batch(split):
    """Read a random contiguous chunk of a split file and return it as token ids.

    The file is memory-mapped read-only, a start offset is drawn at random,
    and up to ``block_size * batch_size - 1`` bytes are read from there. The
    bytes are decoded as UTF-8 (undecodable bytes are dropped), carriage
    returns are removed, and the text is passed through ``encode``.

    Args:
        split: ``'train'`` reads ``train_split.txt``; any other value reads
            ``val_split.txt``.

    Returns:
        1-D ``torch.long`` tensor of token ids. It can be shorter than the
        number of bytes read, because decoding may drop bytes and ``'\\r'``
        is stripped; it is also shorter when the file itself is small.

    Raises:
        OSError: if the split file cannot be opened.
        ValueError: from ``mmap`` if the file is empty.
        KeyError: from ``encode`` if the chunk contains a character that is
            not in the vocabulary.
    """
    filename = "train_split.txt" if split == 'train' else 'val_split.txt'
    chunk_bytes = block_size * batch_size
    with open(filename, 'rb') as f:
        # file descriptor, length 0 (= whole file), read-only access
        with mmap.mmap(f.fileno(), 0, access = mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            # Largest start offset that still leaves a full chunk to read.
            # Clamped at 0: for a file smaller than one chunk the difference
            # is negative, which made random.randint raise ValueError.
            last_start = max(0, file_size - chunk_bytes)
            start_pos = random.randint(0, last_start)

            # goes to start_pos
            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)

            # The chunk was read as raw bytes and may begin or end in the
            # middle of a multi-byte character, hence errors='ignore'.
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')

            # returning encoded data
            data = torch.tensor(encode(decoded_block), dtype = torch.long)
    return data

In [46]:
def get_batch(split):
    """Sample `batch_size` (input, target) windows of length `block_size`.

    A fresh chunk of token ids is fetched with ``get_random_batch(split)``;
    window start offsets are drawn uniformly so every window, including its
    one-step-shifted target, stays inside the chunk.

    Returns:
        Tuple ``(x, y)`` of tensors stacked along a new leading dimension and
        moved to `device`; ``y`` holds the same windows as ``x`` shifted one
        position to the right.
    """
    chunk = get_random_batch(split)
    # Upper bound keeps start + block_size + 1 within the chunk.
    starts = torch.randint(len(chunk) - block_size, (batch_size,))
    inputs = []
    targets = []
    for start in starts:
        stop = start + block_size
        inputs.append(chunk[start:stop])
        targets.append(chunk[start + 1:stop + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [47]:
# Illustration of how training pairs are formed: for every position t, the
# model sees x[:t+1] as context and must predict y[t], the token that follows.
#
# This cell used to read `train_data`, which no cell in this notebook defines,
# so it raised NameError on a fresh kernel. Draw a chunk from the training
# split instead so the cell runs on its own.
train_data = get_random_batch('train')

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]

In [48]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    The global `model` is switched to eval mode, `eval_iters` batches are
    drawn per split with `get_batch`, and the model is switched back to
    train mode before returning. Runs with gradient tracking disabled.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean
        loss over the sampled batches.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        out[split] = batch_losses.mean()
    model.train()
    return out

In [49]:
class Head(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()
        # performing self-attention calculations on high dim embeddings is expensive, so we project onto
        # head size dimension
        
        # keys represent the context in which the info is found 
        self.key = nn.Linear(n_embd, head_size, bias = False)
        # queries represent the information the model is looking for
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        
        # registers attention matrix as a buffer as part of param models 
        # but doesn't require gradients during training
        # -> contains dot products between query vector and key vector
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # input shape : (B, T, C)
        # output shape : (B, T, head_size=hs)
        
        B, T, C = x.shape
        k = self.key(x) # (B, T, hs)
        q = self.query(x) # (B, T, hs)
        
        # compute attention scores
        # attention scores by square root of head size
        wei = q @ k.transpose(-2, -1) * k.shape[-1] **(-0.5) # (B, T, hs) @ (B, hs ,T) -> (B, T, T)
        # setting -inf for unseen values in the sequence
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        # makes the model more confident about the attention scores
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        
        # value tensor contains info about the input tokens
        v = self.value(x)
        # performs weighted aggregation of values
        # weighted values represent the importance of information from each input token to the current token
        out = wei @ v
        return out

In [50]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then merged by a linear layer."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Head(head_size))
        self.heads = nn.ModuleList(head_modules)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the feature axis, project, drop out."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [51]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand 4x, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            # widen so the non-linearity has more room to work with
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            # return to the embedding width expected by the residual stream
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x`; the output has the same shape as the input."""
        return self.net(x)

In [52]:
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each followed by
    a residual addition and a LayerNorm applied to the sum."""

    def __init__(self, n_embd, n_head):
        """n_embd: embedding width; n_head: number of attention heads."""
        super().__init__()
        # The embedding width is split across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # attention sub-layer: residual add, then normalise
        attended = self.ln1(x + self.sa(x))
        # feed-forward sub-layer: residual add, then normalise
        return self.ln2(attended + self.ffwd(attended))

In [53]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over a fixed vocabulary.

    Token and learned position embeddings are summed, passed through
    `n_layer` `Block` modules and a final LayerNorm, and projected to one
    logit per vocabulary entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # one learned vector per position in the context window
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # n_layer decoder blocks applied one after another
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        # final layer norm before the output projection
        self.ln_f = nn.LayerNorm(n_embd)
        # maps each position's features to vocabulary logits
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02^2) and zero Linear biases.

        Other module types keep their default initialisation.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of token ids. T must not exceed `block_size`,
                because positions are looked up in a table of that size.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy
            against the flattened targets.
        """
        B, T = index.shape

        tok_emb = self.token_embedding_table(index)
        # Build the position ids on the same device as the input rather than
        # on the notebook-level `device`, so the model keeps working after it
        # is moved or reloaded onto a different device.
        pos_emb = self.position_embedding_table(torch.arange(T, device=index.device))
        # combining token-specific and positional information
        x = tok_emb + pos_emb
        # passing through decoders
        x = self.blocks(x)
        # layer norm
        x = self.ln_f(x)
        # linear transformation
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            # cross_entropy expects (N, C) logits and (N,) class indices
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `index`.

        Sampling runs with gradient tracking disabled and with the model in
        eval mode, so dropout does not perturb the predicted distribution and
        no autograd graph is accumulated. The previous train/eval mode is
        restored before returning.

        Args:
            index: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # the position table only covers block_size positions, so
                # condition on at most the last block_size tokens
                index_cond = index[:, -block_size:]

                logits, _ = self(index_cond)

                # keep only the prediction for the last position: (B, C)
                logits = logits[:, -1, :]

                # probabilities over the vocabulary, independently per sequence
                probs = F.softmax(logits, dim=-1)

                # sample the id of the next token from that distribution
                index_next = torch.multinomial(probs, num_samples=1)

                # append the sampled token to the running sequence
                index = torch.cat((index, index_next), dim=1)
        finally:
            self.train(was_training)

        return index

In [54]:
# Build the network and move its parameters to the selected device.
# nn.Module.to() returns the module itself, so `m` is an alias of `model`.
model = GPTLanguageModel(vocab_size).to(device)
m = model

In [63]:
# PyTorch optimizer
# AdamW: Adam with weight decay (pulls parameters toward smaller values)
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

# The loop variable is named `step` rather than `iter`: at notebook top level
# `iter` would shadow the builtin of the same name for every later cell.
for step in range(max_iters):
    # every eval_iters steps, report the averaged train/val loss
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"Step: {step}, train loss: {losses['train']: .3f} | val loss: {losses['val']: .3f}")

    # sample batch of data
    xb, yb = get_batch("train")

    # evaluate the loss; calling the module (not .forward) keeps hooks working
    logits, loss = model(xb, yb)

    # resetting gradients to avoid accumulating them across steps
    optimizer.zero_grad(set_to_none = True)

    # backpropagation
    loss.backward()

    # updating model's parameters
    optimizer.step()

0


RuntimeError: MPS backend out of memory (MPS allocated: 5.27 GB, other allocations: 2.86 GB, max allowed: 9.07 GB). Tried to allocate 1005.38 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [17]:
# Saving the model: the whole module object is pickled (not just its weights),
# so loading the file later needs the same class definitions to be available.
with open('model-01.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)
print('model saved')

model saved


In [18]:
# Start from a single token with id 0, shaped (1, 1) = one sequence of length one,
# sample 500 more tokens, and turn the resulting ids back into text.
context = torch.zeros((1,1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)
generated_chars = decode(sampled_ids[0].tolist())

In [19]:
# Display the sampled text; as the cell's last expression it is shown as the string's repr.
generated_chars

'\nstily thearn way."\n\n"But they horse-who sere," add; "then think you!" call, so each Ung\nthis Kan Eureka is every yeall little, and by ansome feotiod not o ince. "Lack of the Slike I,\ncreary we once I back and time?"\n\nThe Wizard.\n\n"What snudde.\n\n"Mat you."\n\n"But," said the tooke. "But you trange we\'rl armso feet mount abon."\n\n"Shese we bow," as? for you rempt, but make a didnspont those topen\nwon tonly our a friend feephacinus and front until."\n\n"Only be an\'t sure. Eureka creatures and Gwig, mury'

In [66]:
def chat():
    """Interactive loop: read a prompt, print the model's 150-token continuation.

    The loop ends when input is closed (EOF) or interrupted (Ctrl-C / kernel
    interrupt). Empty prompts are skipped, because an empty context gives the
    model no last position to sample from. Prompts containing a character
    that `encode` does not know are reported and skipped instead of crashing
    the session.

    Returns:
        None.
    """
    while True:
        try:
            prompt = input('Prompt: \n')
        except (EOFError, KeyboardInterrupt):
            # the only way out of the loop
            return

        if not prompt:
            continue

        try:
            token_ids = encode(prompt)
        except KeyError as err:
            print(f'Character {err.args[0]!r} is not in the vocabulary; please try another prompt.')
            continue

        context = torch.tensor(token_ids, dtype=torch.long, device=device)
        # unsqueeze(0) adds the leading batch dimension: (T,) -> (1, T)
        generated_chars = decode(m.generate(context.unsqueeze(0), max_new_tokens=150)[0].tolist())
        print(f'Completion:\n{generated_chars}')