In [10]:
import numpy as np
import torch
import torch.nn as nn
import tiktoken

In [1]:
gpt_config_124M = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768, 
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

- "context_length" denotes the maximum number of input tokens the
model can handle, via the positional embeddings discussed in chapter 2.
- "emb_dim" represents the embedding size, transforming each token into
a 768-dimensional vector.
- "n_heads" indicates the count of attention heads in the multi-head
attention mechanism, as implemented in chapter 3.
- "n_layers" specifies the number of transformer blocks in the model,
which will be elaborated on in upcoming sections.

In [209]:
# Multi Head Attention Class
class MultiHeadAttention(nn.Module):
    
    def __init__(self, d_in, d_out, context_length, dropout,num_heads, qkv_bias=False):
        super().__init__()
        assert d_out%num_heads==0, 'd_out must be divisible by num_heads'
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads #A
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out) #B
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
        'mask',
        torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )
    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x) #C
        queries = self.W_query(x) #C
        values = self.W_value(x) #C
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) #D
        values = values.view(b, num_tokens, self.num_heads, self.head_dim) #D
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)#D
        keys = keys.transpose(1, 2) #E
        queries = queries.transpose(1, 2) #E
        values = values.transpose(1, 2) #E
        attn_scores = queries @ keys.transpose(2, 3) #F
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens] #G
        attn_scores.masked_fill_(mask_bool, -torch.inf) #H
        attn_weights = torch.softmax(
        attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vec = (attn_weights @ values).transpose(1, 2) #I
        #J
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) #K
        return context_vec


- The **FeedForward** module we implemented in this section plays a crucial role
in enhancing the model's ability to learn from and generalize the data.
Although the input and output dimensions of this module are the same, it
internally expands the embedding dimension into a higher-dimensional space
through the first linear layer. This expansion is
followed by a non-linear GELU activation, and then a contraction back to the
original dimension with the second linear transformation. Such a design
allows for the exploration of a richer representation space.

In [239]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps=1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x+self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi))*(x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4*cfg['emb_dim']),
            GELU(),
            nn.Linear(4*cfg['emb_dim'], cfg['emb_dim'])
        )
    def forward(self, x):
        return self.layers(x)


# Transformer Block
# input - LN1 - MHA - Drpout - (+input) - 
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        vocab_size, context_length, emb_dim, num_heads, n_layers, drop_rate, qkv_bias = cfg.values()
        embedding_layer = nn.Embedding(vocab_size, emb_dim)
        self.layernorm1 = LayerNorm(emb_dim)
        self.layernorm2 = LayerNorm(emb_dim)
        self.mha = MultiHeadAttention(d_in=emb_dim, d_out=emb_dim, context_length=context_length, dropout=drop_rate,
                           num_heads=num_heads, qkv_bias=qkv_bias )
        self.dropout = nn.Dropout(p=drop_rate)
        self.ffn = FeedForward(cfg)
    def forward(self, x):
        #A
        skip_connection=x
        x = self.layernorm1(x) # pre normalization (before multihead atention and feed forward)
        x = self.mha(x)
        x = self.dropout(x)
        x = x + skip_connection

        #B
        skip_connection=x
        x = self.layernorm2(x)
        x = self.ffn(x)
        x = self.dropout(x)
        x = x+skip_connection
        
        return x

### Final GPT2 architecture

In [None]:
class GPTModel(nn.Module):
    def __init__(self,cfg):
        

In [None]:
os_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
x = tok_embeds + pos_embed

In [235]:

gpt_config_124M = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768, 
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}


torch.manual_seed(123)
x = torch.rand(2,4,768)
trf_block= TransformerBlock(gpt_config_124M)
output = trf_block(x)

x.shape, output.shape



(torch.Size([2, 4, 768]), torch.Size([2, 4, 768]))

In [236]:
 vocab_size, context_length, emb_dim, n_heads, n_layers, drop_rate, qkv_bias = gpt_config_124M.values()

tokenizer = tiktoken.get_encoding('gpt2')
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
token_batch = torch.stack(batch,dim=0)
print(token_batch)

tok_embedding_layer = nn.Embedding(vocab_size, emb_dim)
pos_embedding_layer = nn.Embedding(context_length, emb_dim)
# embedding_batch =tok_embedding_layer(token_batch) + pos_embedding_layer(torch.arange(2))

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [238]:
*["Alice", 30, "USA"]


SyntaxError: can't use starred expression here (3829730376.py, line 1)

In [226]:
tok_embedding_layer, pos_embedding_layer

(Embedding(50257, 768), Embedding(1024, 768))