# Week 2: Transformers and LLM System Design

## Learning Objectives
- Understand the evolution of Transformer models
- Learn about encoder-decoder architectures
- Explore attention mechanisms
- Design LLM systems and understand their components
- Implement basic transformer components

## Table of Contents
1. [Evolution of Language Models](#evolution-of-language-models)
2. [Attention Mechanism](#attention-mechanism)
3. [Transformer Architecture](#transformer-architecture)
4. [Encoder-Decoder Design](#encoder-decoder-design)
5. [LLM System Components](#llm-system-components)
6. [Implementation Examples](#implementation-examples)
7. [Exercises](#exercises)

In [None]:
# Install required packages
!pip install torch transformers datasets tokenizers matplotlib seaborn numpy pandas
!pip install torch-audio torchvision

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification,
    BertModel, GPT2Model, T5Model, pipeline
)
import math
from typing import Optional

# Seed both random sources (PyTorch, then NumPy) with the same value so the
# randomly generated examples in this notebook repeat from run to run
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(42)

## Evolution of Language Models

Let's explore the evolution from simple n-gram models to modern Transformers.

In [None]:
# Milestones in language modelling, oldest first: "name (period)" -> one-line summary
evolution_timeline = {
    "N-gram Models (1990s)": "Statistical models based on word sequences",
    "Neural Language Models (2003)": "Feed-forward neural networks for language modeling",
    "RNNs/LSTMs (2010s)": "Recurrent networks for sequential data",
    "Seq2Seq (2014)": "Encoder-decoder architecture with RNNs",
    "Attention (2015)": "Attention mechanism for better long-range dependencies",
    "Transformer (2017)": "Self-attention based architecture",
    "BERT (2018)": "Bidirectional encoder representations",
    "GPT (2018-present)": "Generative pre-trained transformers",
    "T5 (2019)": "Text-to-text transfer transformer",
    "Large Models (2020+)": "GPT-3, PaLM, ChatGPT, GPT-4",
}

# Print one "milestone: summary" line per entry, in insertion order
for milestone, summary in evolution_timeline.items():
    print(f"{milestone}: {summary}")

## Attention Mechanism

The attention mechanism allows models to focus on relevant parts of the input sequence.

In [None]:
class SimpleAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Args:
        hidden_size: Feature size of the inputs. The bias-free query, key and
            value projections all map ``hidden_size -> hidden_size``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.W_q = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_k = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_v = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: Tensor of shape (batch_size, query_len, hidden_size).
            key: Tensor of shape (batch_size, key_len, hidden_size).
            value: Tensor of shape (batch_size, key_len, hidden_size).
            mask: Optional tensor broadcastable to (batch_size, query_len,
                key_len). Positions where ``mask == 0`` receive no attention.

        Returns:
            Tuple ``(attended_values, attention_weights)`` with shapes
            (batch_size, query_len, hidden_size) and
            (batch_size, query_len, key_len).
        """
        Q = self.W_q(query)
        K = self.W_k(key)
        V = self.W_v(value)

        # Scale by sqrt(d) so the dot products do not grow with the feature size
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.hidden_size)

        if mask is not None:
            # Out-of-place fill with the most negative value of the score dtype:
            # the literal -1e9 cannot be represented in float16, and an in-place
            # fill would overwrite the tensor the division produced.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attention_weights = F.softmax(scores, dim=-1)
        attended_values = torch.matmul(attention_weights, V)

        return attended_values, attention_weights

# Example usage: self-attention over a random batch.
# Statement order matters here: building the module and drawing `x` both consume
# the seeded RNG, so reordering them would change the printed/plotted values.
hidden_size = 64
seq_len = 10
batch_size = 2

attention = SimpleAttention(hidden_size)
x = torch.randn(batch_size, seq_len, hidden_size)

# Passing the same tensor as query, key and value makes this self-attention.
# `weights` is kept at module level because the visualization cell below reads it.
output, weights = attention(x, x, x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {weights.shape}")

In [None]:
# Visualize attention weights
def visualize_attention(attention_weights, tokens=None):
    """Draw the attention matrix of the first batch element as a heatmap.

    Args:
        attention_weights: Tensor whose first axis is the batch; only
            ``attention_weights[0]`` is plotted, and it is expected to be a
            2-D (query position x key position) matrix.
        tokens: Optional sequence of labels, one per position, used for the
            tick labels on both axes.
    """
    # .cpu() is a no-op for CPU tensors and lets accelerator tensors be plotted too
    weights = attention_weights[0].detach().cpu().numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(weights, annot=True, cmap='Blues', fmt='.2f')
    plt.title('Attention Weights Heatmap')
    plt.xlabel('Key Position')
    plt.ylabel('Query Position')

    # Explicit None/length check: `if tokens:` is ambiguous for array-like inputs
    if tokens is not None and len(tokens) > 0:
        # Heatmap cells span [i, i + 1), so labels belong at the cell centres
        # (i + 0.5); ticks at plain integers would sit on the cell borders.
        centers = np.arange(len(tokens)) + 0.5
        plt.xticks(centers, tokens, rotation=45)
        plt.yticks(centers, tokens, rotation=0)

    plt.tight_layout()
    plt.show()

# Plot the attention matrix of the first batch element of `weights`, the
# attention weights returned by the SimpleAttention example in the previous cell
visualize_attention(weights)

## Multi-Head Attention

Multi-head attention runs several attention operations in parallel, allowing the model to jointly attend to information from different representation subspaces.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model width is split evenly across the heads; each head attends
    independently and the results are concatenated and projected by ``W_o``.

    Args:
        d_model: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head feature size

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute softmax(Q K^T / sqrt(d_k)) V over the last two axes.

        Args:
            Q: Queries of shape (..., query_len, d_k).
            K: Keys of shape (..., key_len, d_k).
            V: Values of shape (..., key_len, d_k).
            mask: Optional tensor broadcastable to the score shape
                (..., query_len, key_len); positions where ``mask == 0``
                receive no attention.

        Returns:
            Tuple ``(attended_values, attention_weights)``.
        """
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Out-of-place, dtype-aware fill: -1e9 is not representable in
            # float16 and an in-place fill would overwrite the score tensor.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attention_weights = F.softmax(scores, dim=-1)
        attended_values = torch.matmul(attention_weights, V)

        return attended_values, attention_weights

    def _split_heads(self, projected):
        """Reshape (batch, length, d_model) into (batch, num_heads, length, d_k)."""
        batch_size, length, _ = projected.size()
        return projected.view(batch_size, length, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Apply multi-head attention.

        Args:
            query: Tensor of shape (batch_size, query_len, d_model).
            key: Tensor of shape (batch_size, key_len, d_model).
            value: Tensor of shape (batch_size, key_len, d_model).
            mask: Optional tensor broadcastable to
                (batch_size, num_heads, query_len, key_len).

        Returns:
            Tuple ``(output, attention_weights)`` with shapes
            (batch_size, query_len, d_model) and
            (batch_size, num_heads, query_len, key_len).
        """
        batch_size, query_len, _ = query.size()

        # Each projection is split using its OWN sequence length, so key/value
        # may be longer or shorter than the query (cross-attention).
        Q = self._split_heads(self.W_q(query))
        K = self._split_heads(self.W_k(key))
        V = self._split_heads(self.W_v(value))

        attended_values, attention_weights = self.scaled_dot_product_attention(Q, K, V, mask)

        # Concatenate heads: (batch, heads, query_len, d_k) -> (batch, query_len, d_model)
        attended_values = attended_values.transpose(1, 2).contiguous().view(
            batch_size, query_len, self.d_model
        )

        # Final linear projection
        output = self.W_o(attended_values)

        return output, attention_weights

# Example usage: 8 heads over a 512-wide model, i.e. d_k = 512 // 8 = 64 per head.
# Note that this cell rebinds `x`, `output` and `weights` from the earlier
# single-head example.
d_model = 512
num_heads = 8
seq_len = 20
batch_size = 2

mha = MultiHeadAttention(d_model, num_heads)
x = torch.randn(batch_size, seq_len, d_model)

# Self-attention: `weights` holds one (seq_len x seq_len) matrix per head,
# so its shape is (batch_size, num_heads, seq_len, seq_len)
output, weights = mha(x, x, x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {weights.shape}")

## Transformer Architecture

Let's implement a simplified Transformer block.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature indices hold ``sin(pos * w_i)`` and odd indices hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``. The table is
    precomputed once and stored as a buffer, so it is not a trainable
    parameter but moves with the module across devices.

    Args:
        d_model: Feature size of the embeddings (even or odd).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis of size 1 lets the table broadcast over the batch
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model) with
                ``seq_len <= max_len``.
        """
        return x + self.pe[:, :x.size(1)]

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # linear1 is created before linear2 so seeded initialisation is unchanged
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = self.linear1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

class TransformerBlock(nn.Module):
    """One post-norm Transformer layer: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the inputs and outputs.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability for the feed-forward network and for both
            sub-layer outputs.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-modules are created in the same order as before (seeded init)
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is forwarded to the attention module."""
        # Sub-layer 1: self-attention (the attention weights are discarded)
        attended = self.attention(x, x, x, mask)[0]
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward network
        return self.norm2(x + self.dropout(self.feed_forward(x)))

# Example usage: add positional encodings to random "embeddings", then run one block.
# d_ff = 2048 is 4 x d_model. Statement order is significant because module
# construction and torch.randn both draw from the seeded RNG.
d_model = 512
num_heads = 8
d_ff = 2048
seq_len = 20
batch_size = 2

transformer_block = TransformerBlock(d_model, num_heads, d_ff)
pos_encoding = PositionalEncoding(d_model)

x = torch.randn(batch_size, seq_len, d_model)
x_with_pos = pos_encoding(x)
# No mask is passed, so every position may attend to every other position.
# The block is never switched to eval mode here, so dropout is active in this pass.
output = transformer_block(x_with_pos)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")

## Encoder-Decoder Design

Understanding different architectural patterns in Transformers.

In [None]:
# The three Transformer families side by side: model name -> summary of its design
model_architectures = {
    "BERT": {
        "type": "Encoder-only",
        "use_case": "Understanding tasks (classification, NER, QA)",
        "attention": "Bidirectional",
        "training": "Masked Language Modeling",
    },
    "GPT": {
        "type": "Decoder-only",
        "use_case": "Generation tasks (text completion, dialogue)",
        "attention": "Causal (left-to-right)",
        "training": "Autoregressive Language Modeling",
    },
    "T5": {
        "type": "Encoder-Decoder",
        "use_case": "Text-to-text tasks (translation, summarization)",
        "attention": "Bidirectional encoder + Causal decoder",
        "training": "Span-based denoising",
    },
}

# Print a blank-line-separated section per family with one indented line per trait
for family, traits in model_architectures.items():
    print(f"\n{family}:")
    for trait, summary in traits.items():
        print(f"  {trait}: {summary}")

In [None]:
# Load and compare different pre-trained models
def compare_models():
    """Encode one sentence with BERT and GPT-2 and print a short report for each.

    The report lists the input tokens, the shape of the last hidden state and
    the number of model parameters.
    """
    sentence = "The quick brown fox jumps over the lazy dog."

    # Checkpoints are loaded in the same order as before: BERT first, then GPT-2
    bert_tok = AutoTokenizer.from_pretrained('bert-base-uncased')
    bert_net = AutoModel.from_pretrained('bert-base-uncased')
    gpt2_tok = AutoTokenizer.from_pretrained('gpt2')
    gpt2_net = GPT2Model.from_pretrained('gpt2')

    # Give the GPT-2 tokenizer a padding token by reusing its end-of-sequence token
    gpt2_tok.pad_token = gpt2_tok.eos_token

    def encode(tokenizer, network):
        """Tokenize the sentence and run a gradient-free forward pass."""
        batch = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            return batch, network(**batch)

    reports = []
    for label, tokenizer, network in (("BERT", bert_tok, bert_net), ("GPT-2", gpt2_tok, gpt2_net)):
        batch, result = encode(tokenizer, network)
        reports.append((label, tokenizer, network, batch, result))

    print(f"Text: {sentence}")
    for label, tokenizer, network, batch, result in reports:
        print(f"\n{label}:")
        print(f"  Input tokens: {tokenizer.convert_ids_to_tokens(batch['input_ids'][0])}")
        print(f"  Output shape: {result.last_hidden_state.shape}")
        print(f"  Model size: {sum(p.numel() for p in network.parameters()):,} parameters")

compare_models()

## LLM System Components

Understanding the key components of LLM systems.

In [None]:
class LLMSystemComponents:
    """Catalogue of the main building blocks of an LLM system.

    ``components`` maps each building block's name to a dict holding its
    ``purpose``, a list of ``types`` and a list of ``considerations``.
    """

    def __init__(self):
        self.components = {
            "Tokenization": {
                "purpose": "Convert text to tokens",
                "types": ["Byte-Pair Encoding (BPE)", "SentencePiece", "WordPiece"],
                "considerations": ["Vocabulary size", "OOV handling", "Subword tokenization"],
            },
            "Model Architecture": {
                "purpose": "Process and transform inputs",
                "types": ["Encoder-only", "Decoder-only", "Encoder-Decoder"],
                "considerations": ["Model size", "Context length", "Attention patterns"],
            },
            "Training Strategy": {
                "purpose": "Learn from data",
                "types": ["Pre-training", "Fine-tuning", "RLHF"],
                "considerations": ["Data quality", "Compute resources", "Training stability"],
            },
            "Inference Engine": {
                "purpose": "Generate outputs efficiently",
                "types": ["Autoregressive", "Parallel", "Speculative decoding"],
                "considerations": ["Latency", "Throughput", "Memory usage"],
            },
            "Deployment Infrastructure": {
                "purpose": "Serve the model at scale",
                "types": ["Cloud-based", "Edge deployment", "Hybrid"],
                "considerations": ["Scalability", "Cost", "Security"],
            },
        }

    def describe_component(self, component_name):
        """Print the purpose, types and considerations of one component.

        Prints a "not found" message instead when ``component_name`` is not a
        key of ``self.components``.
        """
        entry = self.components.get(component_name)
        if entry is None:
            print(f"Component '{component_name}' not found.")
            return

        type_list = ', '.join(entry['types'])
        consideration_list = ', '.join(entry['considerations'])
        print(f"\n{component_name}:")
        print(f"  Purpose: {entry['purpose']}")
        print(f"  Types: {type_list}")
        print(f"  Key Considerations: {consideration_list}")

    def list_all_components(self):
        """Print a heading followed by the description of every component, in order."""
        print("LLM System Components:")
        for name in self.components:
            self.describe_component(name)

# Example usage: build the catalogue and print every component with its
# purpose, types and key considerations
llm_system = LLMSystemComponents()
llm_system.list_all_components()

## Implementation Examples

Let's implement some practical examples using pre-trained models.

In [None]:
# Sentiment classification with a pre-trained RoBERTa checkpoint
def text_classification_example():
    """Classify three sample sentences and print the label and confidence of each."""
    classifier = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    )

    samples = (
        "I love this new technology!",
        "This is terrible and disappointing.",
        "The weather is okay today.",
    )
    divider = "-" * 50

    print("Text Classification Results:")
    for sample in samples:
        # The pipeline returns a list of predictions; the first entry is the one printed
        prediction = classifier(sample)[0]
        print(f"Text: '{sample}'")
        print(f"Sentiment: {prediction['label']} (confidence: {prediction['score']:.3f})")
        print(divider)

text_classification_example()

In [None]:
# Text generation with GPT-2
def text_generation_example():
    """Sample three continuations of a fixed prompt with GPT-2 and print them numbered."""
    generator = pipeline("text-generation", model="gpt2")
    prompt = "The future of artificial intelligence is"

    # Sampling settings, collected in one place and passed through to the pipeline
    sampling_options = {
        "max_length": 100,
        "num_return_sequences": 3,
        "temperature": 0.7,
        "do_sample": True,
        "pad_token_id": generator.tokenizer.eos_token_id,
    }
    candidates = generator(prompt, **sampling_options)

    print(f"Prompt: '{prompt}'")
    print("\nGenerated texts:")
    for rank, candidate in enumerate(candidates, start=1):
        print(f"\n{rank}. {candidate['generated_text']}")

text_generation_example()

In [None]:
# Extractive question answering with a DistilBERT checkpoint fine-tuned on SQuAD
def question_answering_example():
    """Answer three questions about a short passage and print each answer with its score."""
    # Name the checkpoint explicitly rather than relying on the task's default
    # model: the example then loads the same model regardless of library
    # defaults, and the pipeline has no reason to warn about an unspecified model.
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )
    
    context = """
    The Transformer is a deep learning model introduced in 2017, used primarily in the field of 
    natural language processing (NLP). Like recurrent neural networks (RNNs), Transformers are 
    designed to handle sequential input data, such as natural language, for tasks such as 
    translation and text summarization. However, unlike RNNs, Transformers do not require that 
    the sequential data be processed in order. Since Transformers can process data in parallel, 
    they are much faster to train than RNNs.
    """
    
    questions = [
        "When was the Transformer introduced?",
        "What field primarily uses Transformers?",
        "Why are Transformers faster to train than RNNs?"
    ]
    
    print("Question Answering Results:")
    for question in questions:
        # The answer is a span extracted from `context`, returned with a confidence score
        result = qa_pipeline(question=question, context=context)
        print(f"Q: {question}")
        print(f"A: {result['answer']} (confidence: {result['score']:.3f})")
        print("-" * 50)

question_answering_example()

## Exercises

### Exercise 1: Implement a Simple Transformer for Sequence Classification

In [None]:
class SimpleTransformerClassifier(nn.Module):
    """Transformer encoder stack with mean pooling and a linear classification head.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding and model width.
        num_heads: Attention heads per Transformer block.
        num_layers: Number of stacked Transformer blocks.
        num_classes: Size of the output logit vector.
        max_len: Maximum sequence length supported by the positional encoding.
        dropout: Dropout probability used on the embeddings and inside every
            Transformer block. Defaults to 0.1, the value that was previously
            hard-coded.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, num_classes,
                 max_len=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        # The feed-forward width follows the common 4 x d_model convention
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_model * 4, dropout=dropout)
            for _ in range(num_layers)
        ])

        self.classifier = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch_size, seq_len).
            mask: Optional attention mask forwarded unchanged to every block.

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        # Scale embeddings by sqrt(d_model) before adding the positional encodings
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)

        # Pass through transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer(x, mask)

        # Global average pooling over the sequence axis. NOTE: the mean covers
        # every position; `mask` is not used to exclude positions here.
        x = x.mean(dim=1)

        # Classification
        return self.classifier(x)

# Example usage: a 6-layer, 3-class classifier on random token ids.
# Statement order is significant because model construction and torch.randint
# both draw from the seeded RNG.
vocab_size = 10000
d_model = 256
num_heads = 8
num_layers = 6
num_classes = 3
seq_len = 50
batch_size = 4

model = SimpleTransformerClassifier(vocab_size, d_model, num_heads, num_layers, num_classes)
# Random ids in [0, vocab_size) stand in for tokenized text
sample_input = torch.randint(0, vocab_size, (batch_size, seq_len))

# One row of logits per sequence: (batch_size, num_classes).
# The model is never switched to eval mode here, so dropout is active in this pass.
output = model(sample_input)
print(f"Input shape: {sample_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

### Exercise 2: Compare Model Behaviors

In [None]:
def compare_model_behaviors():
    """Contrast BERT's mask filling with GPT-2's continuation on the same sentence stem.

    BERT sees the whole sentence and predicts the masked word; GPT-2 only sees
    the prefix and samples what comes next.
    """

    text = "The cat sat on the [MASK]."

    # BERT for masked language modeling
    bert_pipeline = pipeline("fill-mask", model="bert-base-uncased")
    bert_results = bert_pipeline(text)

    print("BERT Masked Language Modeling:")
    print(f"Input: {text}")
    # Show only the first three candidates returned by the pipeline
    for result in bert_results[:3]:
        print(f"  {result['sequence']} (score: {result['score']:.3f})")

    # GPT-2 for text generation
    gpt2_pipeline = pipeline("text-generation", model="gpt2")
    prompt = "The cat sat on the"
    gpt2_results = gpt2_pipeline(
        prompt,
        # Budget the continuation directly in tokens. The previous
        # `max_length=len(prompt.split()) + 5` counted whitespace-separated
        # words, but `max_length` is measured in tokens (prompt included), so
        # it was only correct when every word happened to be a single token.
        max_new_tokens=5,
        num_return_sequences=3,
        do_sample=True,
        temperature=0.7,
        pad_token_id=gpt2_pipeline.tokenizer.eos_token_id
    )

    print("\nGPT-2 Text Generation:")
    print(f"Prompt: {prompt}")
    for result in gpt2_results:
        print(f"  {result['generated_text']}")

compare_model_behaviors()

### Exercise 3: Analyze Attention Patterns

In [None]:
def analyze_attention_patterns():
    """Print and plot the attention matrix of one head of pre-trained BERT.

    Runs a fixed sentence through ``bert-base-uncased`` and draws the last
    layer's first head as a token-by-token heatmap.
    """
    from transformers import BertTokenizer, BertModel

    checkpoint = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    # output_attentions=True makes the model return per-layer attention tensors
    model = BertModel.from_pretrained(checkpoint, output_attentions=True)

    text = "The quick brown fox jumps over the lazy dog."
    inputs = tokenizer(text, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**inputs)

    # Last layer -> first batch element, first head
    last_layer = outputs.attentions[-1]
    attention = last_layer[0, 0].numpy()
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    print(f"Text: {text}")
    print(f"Tokens: {tokens}")
    print(f"Attention shape: {attention.shape}")

    # Heatmap with the tokens as tick labels on both axes (rows: queries, columns: keys)
    heatmap_options = {
        "xticklabels": tokens,
        "yticklabels": tokens,
        "annot": True,
        "fmt": '.2f',
        "cmap": 'Blues',
    }
    plt.figure(figsize=(10, 8))
    sns.heatmap(attention, **heatmap_options)
    plt.title('BERT Attention Weights (Last Layer, Head 1)')
    plt.xlabel('Key Tokens')
    plt.ylabel('Query Tokens')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

analyze_attention_patterns()

## Summary

In this module, we covered:
- Evolution of language models from n-grams and RNNs to Transformers
- Attention mechanisms and multi-head attention
- Transformer architecture components
- Different model designs (encoder-only, decoder-only, encoder-decoder)
- LLM system components and considerations
- Practical implementations and examples

## Next Steps
In the next module, we'll explore semantic search and retrieval systems, building on the transformer foundations we've established here.

## Additional Resources
- [Attention Is All You Need (Original Transformer Paper)](https://arxiv.org/abs/1706.03762)
- [BERT: Pre-training of Deep Bidirectional Transformers](https://arxiv.org/abs/1810.04805)
- [Language Models are Unsupervised Multitask Learners (GPT-2)](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
- [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/)
- [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/index)