# Notebook 7: Demystifying Inference - From Scratch to High-Speed Engines

Training is done. Now, how do we use the model to generate text? This process is called **inference**.

In this notebook, we will:
1. First deconstruct our own `.generate()` method to understand the core logic
2. Explore different ways to control text generation (sampling strategies)
3. Learn why specialized "inference engines" are necessary for real-world speed

Understanding inference is crucial because it's the bridge between a trained model and a real-world application. Let's dive in!


## Revisiting Our "From Scratch" Generator

Let's load the trained GPT model from the previous notebook and understand the autoregressive generation loop step by step.

The autoregressive loop works like this:

1. **Start with an initial context** (the prompt). This is the seed text that starts our generation.

2. **Pass the context to the model** to get logits (scores) for the next token. The model processes all tokens in the context simultaneously using self-attention.

3. **Convert logits into probabilities** using softmax. This transforms raw scores into a probability distribution over all possible tokens.

4. **Sample a single token** from this probability distribution. This is where the creative decision happens—how we sample determines the style and quality of the generated text.

5. **Append the sampled token** to our context. The context grows by one token.

6. **Repeat from step 2** until we reach the desired length. Each iteration generates one more token.

The key creative decision happens in **Step 4**: how we sample from the probability distribution. Different sampling strategies produce dramatically different results, from conservative and predictable to wild and creative.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Set device: prefer a CUDA GPU, then Apple-silicon MPS, and fall back to CPU.
# The getattr guard keeps this working on PyTorch builds that ship without
# the `torch.backends.mps` module.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# First, let's recreate the GPT model structure from notebook 6
# (In practice, you'd load a saved model checkpoint)

class Head(nn.Module):
    """A single causal self-attention head.

    The input is projected into key, query and value spaces of width
    ``head_size`` (no bias), then combined with PyTorch's fused
    ``scaled_dot_product_attention`` using a causal mask.
    """

    def __init__(self, n_embd, head_size, dropout=0.1):
        super().__init__()
        # Layers are created in key -> query -> value order.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Only the probability `p` of this module is used (see forward).
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the causally masked attention output for ``x``.

        The three-way unpack below means ``x`` must be 3-D; the last
        dimension of the result is ``head_size``.
        """
        _batch, _steps, _channels = x.shape  # raises unless x has exactly 3 dims
        queries, keys, values = self.query(x), self.key(x), self.value(x)
        # Attention dropout is applied only while the module is in training mode.
        attn_dropout = self.dropout.p if self.training else 0.0
        return F.scaled_dot_product_attention(
            queries,
            keys,
            values,
            is_causal=True,
            dropout_p=attn_dropout,
        )

class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run side by side and then mixed.

    Each head produces ``n_embd // num_heads`` features; the per-head outputs
    are concatenated along the last dimension, passed through a linear
    projection of width ``n_embd`` and then through dropout.
    """

    def __init__(self, n_embd, num_heads, dropout=0.1):
        super().__init__()
        per_head, leftover = divmod(n_embd, num_heads)
        # The embedding width must split evenly across the heads.
        assert leftover == 0
        self.num_heads = num_heads
        self.head_size = per_head
        self.n_embd = n_embd
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(Head(n_embd, per_head, dropout))
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x``, concatenate, project and apply dropout."""
        per_head_out = [head(x) for head in self.heads]
        merged = torch.cat(per_head_out, dim=-1)
        return self.dropout(self.proj(merged))

class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``4 * n_embd``, GELU, project back, dropout."""

    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        hidden_width = 4 * n_embd
        # Layer order fixes the Sequential indices (net.0 ... net.3).
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension stays ``n_embd``."""
        transformed = self.net(x)
        return transformed

class Block(nn.Module):
    """One pre-norm transformer block.

    LayerNorm is applied before each sub-layer (attention, then feed-forward)
    and each sub-layer's output is added back to its input as a residual.
    """

    def __init__(self, n_embd, num_heads, dropout=0.1):
        super().__init__()
        self.sa = MultiHeadAttention(n_embd, num_heads, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        refined = self.ffwd(self.ln2(x))
        return x + refined

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model with learned position embeddings.

    Token and position embeddings are summed, passed through ``num_layers``
    ``Block`` modules and a final LayerNorm, then projected to vocabulary
    logits by ``lm_head``.
    """

    def __init__(self, vocab_size, n_embd, block_size, num_heads, num_layers, dropout=0.1):
        super().__init__()
        # Maximum context length: the position table has exactly this many rows.
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, num_heads, dropout) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        # Re-initialise every Linear/Embedding created above.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: 2-D tensor of token ids, unpacked as ``(B, T)``. ``T`` must
                not exceed ``block_size`` because positions index a table with
                ``block_size`` rows.
            targets: optional tensor of target token ids holding ``B * T``
                elements.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With ``targets``
            the logits are flattened to ``(B * T, vocab_size)`` and ``loss``
            is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb  # position embeddings broadcast over the batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Generation runs in eval mode and without gradient tracking. The
        train/eval mode the model had on entry is restored afterwards, even
        if an exception is raised.

        Args:
            idx: ``(B, T)`` tensor of prompt token ids.
            max_new_tokens: number of tokens to append.
            temperature: positive divisor applied to the logits before softmax.
            top_k: if given, logits below the k-th largest value are masked out
                before sampling (values tied with the k-th are kept).

        Returns:
            ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
            sampled tokens.

        Raises:
            ValueError: if ``temperature`` is not positive or ``top_k`` is
                smaller than 1.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        if top_k is not None and top_k < 1:
            raise ValueError(f"top_k must be at least 1, got {top_k}")

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # Keep at most the last block_size tokens; a negative
                    # slice start past the beginning clamps to the full tensor.
                    idx_cond = idx[:, -self.block_size:]
                    logits, _ = self(idx_cond)
                    # Only the final position predicts the next token.
                    logits = logits[:, -1, :] / temperature

                    if top_k is not None:
                        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                        kth_largest = v[:, [-1]]
                        logits = logits.masked_fill(logits < kth_largest, float("-inf"))

                    probs = F.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)
                    idx = torch.cat((idx, idx_next), dim=1)
        finally:
            # Restore the caller's mode instead of forcing training mode.
            self.train(was_training)
        return idx

# Create a simple character-level vocabulary for demonstration
text = """
The quick brown fox jumps over the lazy dog. 
The dog barks at the fox. The fox runs away quickly.
Machine learning is fascinating. Deep learning models can understand language.
Transformers are powerful architectures. Attention mechanisms enable long-range dependencies.
"""
# sorted() accepts any iterable, so no intermediate list is needed.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> token id
itos = {i: ch for i, ch in enumerate(chars)}  # token id -> character


def encode(s):
    """Return the list of token ids for string ``s`` (one id per character).

    Raises KeyError for characters that do not occur in ``text``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of token ids ``l``.

    The parameter name is kept from the original definition so keyword
    callers keep working.
    """
    return ''.join(itos[i] for i in l)


# Create a small model (for demonstration - in practice, load your trained model).
# NOTE: the weights are randomly initialised here, so generated text is noise.
model = GPTLanguageModel(
    vocab_size=vocab_size,
    n_embd=64,
    block_size=128,
    num_heads=4,
    num_layers=2,
    dropout=0.1
).to(device)

print(f"Model created with vocabulary size: {vocab_size}")
print("Model ready for inference experiments!")


## Sampling Strategy 1: Greedy Decoding

This is the simplest strategy. At each step, we always choose the token with the **absolute highest probability**.

### Pros and Cons

**Pros:**
- Fast and deterministic (same input always produces same output)
- Straightforward to implement

**Cons:**
- Often leads to boring, repetitive, and robotic-sounding text
- Never takes a creative risk—always picks the "safe" choice
- Can get stuck in loops (e.g., "the the the the...")

Greedy decoding is like a student who always picks the most obvious answer on a multiple-choice test, never considering that sometimes the second or third choice might be more interesting or creative.


In [None]:
def generate_greedy(model, idx, max_new_tokens):
    """
    Greedy decoding: always pick the token with the highest logit.
    Uses torch.argmax() instead of torch.multinomial(), so the output is
    deterministic for a given model and prompt.

    Generation runs in eval mode and without gradient tracking; the
    train/eval mode the model had on entry is restored afterwards, even if an
    exception is raised.

    Args:
        model: module exposing ``block_size`` and returning ``(logits, loss)``
            when called with a ``(B, T)`` tensor of token ids.
        idx: ``(B, T)`` tensor of prompt token ids.
        max_new_tokens: number of tokens to append.

    Returns:
        ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
        generated tokens.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop to the last block_size tokens; a negative slice start
                # past the beginning clamps to the full tensor.
                idx_cond = idx[:, -model.block_size:]

                # Get predictions and focus only on the last time step
                logits, _ = model(idx_cond)
                logits = logits[:, -1, :]  # (B, C)

                # Greedy: softmax is monotonic, so argmax over the logits is
                # also the argmax over the probabilities.
                idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (B, 1)

                # Append to sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
    finally:
        # Restore the caller's mode instead of forcing training mode.
        model.train(was_training)
    return idx

# Greedy decoding demo: encode the prompt, generate 50 tokens, decode, print.
prompt = "The quick brown fox"
encoded_prompt = encode(prompt)
# Wrapping the id list in another list yields a batch of one sequence.
context = torch.tensor([encoded_prompt], dtype=torch.long, device=device)

print("=" * 60, "Greedy Decoding:", "=" * 60, sep="\n")
print(f"Prompt: '{prompt}'", "\nGenerated text:", sep="\n")
generated = generate_greedy(model, context, max_new_tokens=50)
generated_text = decode(generated[0].tolist())
print(generated_text, "\nNotice how the text is deterministic but may be repetitive.", sep="\n")


## Sampling Strategy 2: Temperature Sampling

Temperature is a parameter that controls the **randomness** of our sampling. It's applied to the logits before the softmax.

### Understanding Temperature

**Low Temperature (e.g., 0.2):**
- Makes the model more confident and less random
- Sharpens the probability distribution, making high-probability tokens even more likely and low-probability ones almost impossible
- Like a careful, conservative writer who sticks to safe choices
- Good for: factual writing, code generation, when you want consistency

**High Temperature (e.g., 1.5):**
- Makes the model more creative and random
- Flattens the distribution, making less likely words more probable
- Like a wild, experimental writer who takes creative risks
- Good for: creative writing, brainstorming, when you want variety

**Temperature = 1.0:**
- The default setting—uses the raw probabilities from the model
- Balanced between creativity and coherence

Think of temperature like a dial: turn it down for predictable, safe text; turn it up for creative, surprising text.
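
---

> **Editor's note:** Everything from here down to the "Conclusion: Your Journey So Far" item is the authoring outline for *Notebook 8*. It appears to have been pasted into this cell by mistake and is not part of Notebook 7's explanation of temperature sampling.

Notebook 8: The Broader Ecosystem - Hugging Face & The Fine-Tuning Workflow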

LLM, generate a Jupyter Notebook with the following cells and content:

    Markdown Cell 1 (Title):

        # Notebook 8: The Broader Ecosystem - Hugging Face & The Fine-Tuning Workflow

        Explain the goal: We've built models from scratch. Now, let's learn the standard, practical workflow that professionals use. This involves leveraging the vast Hugging Face ecosystem to download pre-trained models and adapt them to new tasks.

    Markdown Cell 2 (The Hugging Face Hub: A "GitHub" for AI):

        Introduce the Hub as the central place where the community shares models, datasets, and tokenizers.

        Explain that everything has a unique ID (e.g., "openai-community/gpt2", "meta-llama/Meta-Llama-3-8B").

    Code Cell 1 (Exploring the Hub):

        Show how to use the huggingface_hub library to programmatically list models and datasets. This makes the Hub feel like a tangible resource, not just a website.

    Markdown Cell 3 (The Easy Button: The pipeline API):

        Reiterate that the pipeline is the highest-level, easiest way to use a model for a specific task.

        Explain that it abstracts away all the steps: loading the model, tokenization, inference, and decoding.

    Code Cell 2 (Demonstrating Pipelines):

        Showcase a few different pipelines to highlight their versatility:

            pipeline('text-generation', model='gpt2')

            pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

            pipeline('fill-mask', model='bert-base-uncased')

    Markdown Cell 4 (The Standard Workflow: AutoModel and AutoTokenizer):

        Explain that for more control, we drop down a level from pipeline to AutoModel and AutoTokenizer.

        AutoTokenizer.from_pretrained(model_id): Loads the correct tokenizer for any model.

        AutoModelForCausalLM.from_pretrained(model_id): Loads the model architecture and its pre-trained weights.

        Explain that we will now manually replicate the steps that pipeline does automatically.

    Code Cell 3 (Manual Inference with Auto classes):

            Load the tokenizer and model for "gpt2".

            Define a prompt string.

            Tokenize: inputs = tokenizer(prompt, return_tensors="pt"). Print the inputs object and its shape. Explain input_ids and attention_mask.

            Inference: outputs = model(**inputs). Pass the tokenized inputs to the model.

            Inspect Logits: Get the logits from the output. Print their shape (batch_size, sequence_length, vocab_size) and explain what it means.

            Decode: Use the tokenizer's .decode() method to turn the model's output back into readable text. This cell is crucial as it connects all our from-scratch knowledge to the standard library workflow.

    Markdown Cell 5 (Fine-Tuning: Adapting a Genius for a New Job):

        Explain the concept of fine-tuning. "Why spend millions training a model from scratch on all of Wikipedia when you can take a pre-trained 'genius' model like Llama 3 and simply show it a few thousand examples of your specific task (e.g., answering customer support questions)? This process of adapting a pre-trained model is called fine-tuning."

    Markdown Cell 6 (The Professional Way: The Trainer API):

        Introduce the transformers Trainer as the high-level tool for fine-tuning.

        Explain that it handles all the training loop boilerplate we wrote manually: the epoch/step loops, moving data to the device, calling loss.backward(), optimizer.step(), evaluation, logging, saving checkpoints, etc.

    Code Cell 4 (A Fine-Tuning Template):

        Provide a non-runnable but complete template of a fine-tuning script. This shows the user the structure without getting bogged down in data preparation.

        The template should include:

            Importing Trainer, TrainingArguments.

            Loading a model and tokenizer (AutoModel...).

            Loading a dataset (e.g., from datasets import load_dataset).

            Defining TrainingArguments with key parameters like output_dir, num_train_epochs, per_device_train_batch_size.

            Instantiating the Trainer with the model, args, and datasets.

            The final, simple call: trainer.train().

    Markdown Cell 7 (Making Fine-Tuning Feasible: QLoRA and Unsloth):

        Explain the modern techniques that make fine-tuning large models possible on consumer hardware.

        QLoRA: "Even fine-tuning can be too much for one GPU. QLoRA is a revolutionary technique. First, it loads the massive base model in a quantized, memory-saving 4-bit format and 'freezes' it. Then, it inserts tiny, 'trainable' adapter layers (called LoRA) into the model. You only train these tiny adapters, which is dramatically faster and uses far less memory than training the whole model." Explain that the peft library from Hugging Face implements this.

        Unsloth: "Unsloth is a performance library. You add two lines of code to your fine-tuning script, and it intelligently replaces standard PyTorch modules with its own hand-written, hyper-optimized kernels. It makes everything—especially QLoRA fine-tuning—run up to 2x faster and use significantly less memory."

    Markdown Cell 8 (Conclusion: Your Journey So Far):

        Summarize the entire series. "You started with a single tensor. You built MLPs, CNNs, and ResNets. You architected a GPT from its fundamental components using Flash Attention. You mastered the logic of inference and sampling. And now, you've seen how the professional ecosystem abstracts these concepts into powerful, reusable tools. You have the full-stack, code-first intuition to tackle any challenge in the world of LLMs."

In [None]:
def generate_with_temp(model, idx, max_new_tokens, temperature=1.0):
    """
    Temperature sampling: divide logits by temperature before softmax.
    Lower temperature = more deterministic, higher temperature = more random.

    Generation runs in eval mode and without gradient tracking; the
    train/eval mode the model had on entry is restored afterwards, even if an
    exception is raised.

    Args:
        model: module exposing ``block_size`` and returning ``(logits, loss)``
            when called with a ``(B, T)`` tensor of token ids.
        idx: ``(B, T)`` tensor of prompt token ids.
        max_new_tokens: number of tokens to append.
        temperature: positive divisor applied to the logits.

    Returns:
        ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
        sampled tokens.

    Raises:
        ValueError: if ``temperature`` is not positive (dividing by zero
            would produce inf/nan logits).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop to the last block_size tokens; a negative slice start
                # past the beginning clamps to the full tensor.
                idx_cond = idx[:, -model.block_size:]

                # Get predictions for the last time step
                logits, _ = model(idx_cond)
                logits = logits[:, -1, :]  # (B, C)

                # Apply temperature: divide logits by temperature
                logits = logits / temperature

                # Convert to probabilities
                probs = F.softmax(logits, dim=-1)

                # Sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

                # Append to sequence
                idx = torch.cat((idx, idx_next), dim=1)
    finally:
        # Restore the caller's mode instead of forcing training mode.
        model.train(was_training)
    return idx

# Temperature demo: sample 50 tokens from the same prompt at three temperatures.
prompt = "The quick brown fox"
encoded_prompt = encode(prompt)
# Wrapping the id list in another list yields a batch of one sequence.
context = torch.tensor([encoded_prompt], dtype=torch.long, device=device)

print("=" * 60, "Temperature Sampling Comparison:", "=" * 60, sep="\n")
print(f"Prompt: '{prompt}'\n")

for temp in (0.2, 1.0, 1.5):
    ctx = context.clone()  # every run starts from its own copy of the prompt
    generated = generate_with_temp(model, ctx, max_new_tokens=50, temperature=temp)
    generated_text = decode(generated[0].tolist())
    # The trailing empty string prints the blank separator line.
    print(f"Temperature {temp}:", f"  {generated_text}", "", sep="\n")


## Sampling Strategy 3: Top-k Sampling

This strategy provides a great balance between creativity and coherence. Before sampling, we discard all tokens except for the **k most probable ones**. Then, we sample only from this reduced pool.

### Why Top-k Works

**The Problem:** Sometimes the model assigns tiny probabilities to thousands of tokens. Sampling from the entire vocabulary can occasionally pick a truly bizarre, low-probability token that ruins the coherence.

**The Solution:** Top-k filtering says "only consider the top k most likely tokens." This prevents the model from picking truly bizarre tokens while still allowing for variety among the reasonable top choices.

**Benefits:**
- Maintains creativity (still sampling, not greedy)
- Prevents truly bizarre outputs
- Better balance between coherence and variety
- Works well with temperature sampling (combine both!)

Common values for k: 10-50 for small models, 50-100 for larger models. Too small (k=1) is just greedy decoding. Too large (k=vocab_size) does nothing.


In [None]:
def generate_top_k(model, idx, max_new_tokens, temperature=1.0, top_k=50):
    """
    Top-k sampling: only sample from the top k most probable tokens.
    Combines well with temperature for the best balance of creativity and coherence.

    Generation runs in eval mode and without gradient tracking; the
    train/eval mode the model had on entry is restored afterwards, even if an
    exception is raised.

    Args:
        model: module exposing ``block_size`` and returning ``(logits, loss)``
            when called with a ``(B, T)`` tensor of token ids.
        idx: ``(B, T)`` tensor of prompt token ids.
        max_new_tokens: number of tokens to append.
        temperature: positive divisor applied to the logits.
        top_k: number of highest-scoring tokens kept per step (capped at the
            vocabulary size); ``None`` disables the filter.

    Returns:
        ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
        sampled tokens.

    Raises:
        ValueError: if ``temperature`` is not positive or ``top_k`` is
            smaller than 1.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop to the last block_size tokens; a negative slice start
                # past the beginning clamps to the full tensor.
                idx_cond = idx[:, -model.block_size:]

                # Get predictions for the last time step
                logits, _ = model(idx_cond)
                logits = logits[:, -1, :]  # (B, C)

                # Apply temperature
                logits = logits / temperature

                # Top-k filtering: mask out all tokens except the top k
                if top_k is not None:
                    # Values and positions of the k largest logits in each row
                    top_k_values, top_k_indices = torch.topk(
                        logits, min(top_k, logits.size(-1)), dim=-1
                    )

                    # Start from all -inf (zero probability after softmax) and
                    # write the k surviving logits back at their positions.
                    logits_filtered = torch.full_like(logits, float('-inf'))
                    logits_filtered.scatter_(-1, top_k_indices, top_k_values)
                    logits = logits_filtered

                # Convert to probabilities
                probs = F.softmax(logits, dim=-1)

                # Sample from the filtered distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

                # Append to sequence
                idx = torch.cat((idx, idx_next), dim=1)
    finally:
        # Restore the caller's mode instead of forcing training mode.
        model.train(was_training)
    return idx

# Top-k demo: sample 50 tokens at temperature 0.8 for three values of k.
prompt = "The quick brown fox"
encoded_prompt = encode(prompt)
# Wrapping the id list in another list yields a batch of one sequence.
context = torch.tensor([encoded_prompt], dtype=torch.long, device=device)

print("=" * 60, "Top-k Sampling:", "=" * 60, sep="\n")
print(f"Prompt: '{prompt}'\n")

for k in (5, 20, 50):
    ctx = context.clone()  # every run starts from its own copy of the prompt
    generated = generate_top_k(model, ctx, max_new_tokens=50, temperature=0.8, top_k=k)
    generated_text = decode(generated[0].tolist())
    # The trailing empty string prints the blank separator line.
    print(f"Top-k={k} (with temperature=0.8):", f"  {generated_text}", "", sep="\n")

print("Notice how top-k=5 is more conservative, while top-k=50 allows more variety.")


## The Need for Speed: Why Our Loop is Slow

Our simple generation loop has a major bottleneck: it has no **KV cache**, so it repeats the same attention work at every step.

### The Problem: Recomputing Keys and Values

In the self-attention mechanism, every token creates a **Key** and a **Value** vector. To predict the next token, the new token's **Query** must attend to the Keys and Values of all preceding tokens.

**What our simple loop does:** Every time we generate a new token, we pass the entire context (all previous tokens + the new token) through the model. This means we recalculate the Keys and Values for **every single token** every single time we generate a new token. This is incredibly wasteful!

**Example:** If we're generating the 100th token:
- We recalculate Keys/Values for all 99 tokens in the context, even though tokens 1-98 were already processed in earlier steps
- We only need to compute Keys/Values for the one token that was just appended (token 99)
- We're doing roughly 99x more Key/Value computation than necessary!

### The Solution: KV Cache

Inference engines solve this by **caching** the Keys and Values. Once a token's K/V vectors are computed, they're stored. When generating the next token, we only compute K/V for the new token and reuse the cached values for all previous tokens.

This simple optimization can make inference **10-100x faster** for long sequences!

### Why This Matters

For a single user generating text, our simple loop might be acceptable. But for production systems serving thousands of users simultaneously, specialized inference engines are essential for:
- Speed: Handling many requests per second
- Efficiency: Using GPU memory wisely
- Cost: Reducing compute costs for serving models


## Inference Engines Explained

Now that we understand the bottlenecks, let's explore the tools that real-world projects use for fast inference. These are the "inference engines" used to serve LLMs in production; hosted products such as ChatGPT and Claude run on proprietary serving stacks built around the same ideas.

### 1. Hugging Face `transformers` `.generate()`

**What it is:** The industry-standard, high-level method for text generation.

**Features:**
- Highly optimized Python implementation
- Includes KV cache automatically
- Dozens of sampling strategies built-in (top-k, top-p, temperature, etc.)
- Handles all the complexity for you

**When to use:** Perfect for prototyping, research, and small-scale production. It's the "easy button" that handles everything correctly.

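**Code example:** `model.generate(input_ids, max_length=100, do_sample=True, temperature=0.8, top_k=50)` (`do_sample=True` is required: without it `.generate()` decodes greedily and ignores `temperature` and `top_k`.)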

### 2. vLLM & SGLang

**What they are:** State-of-the-art Python inference servers designed for high throughput.

**Key innovation:** **PagedAttention** (introduced by vLLM). Think of it like a smart memory manager for the KV Cache. It allows the GPU to process many user requests at once (high throughput) with very little wasted memory. SGLang adds **RadixAttention**, which reuses cached Keys/Values across requests that share a common prompt prefix.

**When to use:** Production systems that need to serve many users simultaneously. Engines like these are widely used to serve open-weight models at scale.

**Why they're fast:** They use advanced techniques like:
- Continuous batching (process multiple requests together efficiently)
- PagedAttention (smart KV cache management)
- Optimized CUDA kernels

### 3. Llama.cpp

**What it is:** A project designed to run LLMs on everyday hardware, like your MacBook's CPU.

**Key features:**
- Written in C/C++ with minimal dependencies for maximum performance
- Heavy reliance on **quantization**—using less precise numbers (e.g., 4-bit integers instead of 16-bit floats)
- Makes models smaller and math faster
- Can run on CPU, eliminating the need for expensive GPUs

**When to use:** Local inference, edge devices, or when you want to run models without a GPU.

**Why it matters:** Makes LLMs accessible to everyone, not just those with powerful GPUs.

### Summary

- **Hugging Face**: The easy, standard way (Python, optimized, feature-rich)
- **vLLM/SGLang**: Production servers for high throughput (Python, very fast, handles many users)
- **Llama.cpp**: Run on everyday hardware (C++, CPU-friendly, uses quantization)

Each tool solves different problems, but they all share the same goal: make inference fast and efficient!


In [None]:
# A Glimpse of the Professional Way: Hugging Face transformers

# Note: In a real project, you'd install transformers with:
# uv pip install transformers

# The whole demo is best-effort: a missing library prints install hints, and
# any other failure (e.g. no network to download GPT-2) is reported rather
# than raised, so the rest of the notebook still runs.
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    
    print("="*60)
    print("Using Hugging Face transformers (Professional Method):")
    print("="*60)
    
    # Load a pre-trained model (GPT-2 is small and fast for demonstration)
    model_name = "gpt2"
    print(f"\nLoading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_hf = AutoModelForCausalLM.from_pretrained(model_name)
    
    # Set padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Prepare input (the tokenizer returns both input_ids and attention_mask)
    prompt = "The quick brown fox"
    inputs = tokenizer(prompt, return_tensors="pt")
    
    print(f"\nPrompt: '{prompt}'")
    print("\nGenerated text (using Hugging Face's optimized .generate()):")
    
    # Use the built-in generate method with various parameters
    # This single call handles KV caching, batching, and all optimizations!
    # The attention mask is passed explicitly: pad and EOS share an id here,
    # so generate() cannot reliably infer the mask from the ids alone.
    # do_sample=True is what makes temperature and top_k take effect.
    outputs = model_hf.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=50,
        temperature=0.8,
        top_k=50,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode the output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(generated_text)
    
    print("\n" + "="*60)
    print("Notice how one simple function call handles:")
    print("  - KV caching (automatic)")
    print("  - Temperature sampling")
    print("  - Top-k filtering")
    print("  - All the optimizations we built manually!")
    print("="*60)
    
except ImportError:
    print("transformers library not installed.")
    print("To install: uv pip install transformers")
    print("\nThis is what production code typically looks like:")
    # The snippet includes do_sample=True; without it generate() decodes
    # greedily and the temperature/top_k arguments are ignored.
    print("""
    from transformers import AutoModelForCausalLM, AutoTokenizer
    
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    
    inputs = tokenizer("The quick brown fox", return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,
        do_sample=True,
        temperature=0.8,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id
    )
    text = tokenizer.decode(outputs[0])
    """)
except Exception as e:
    # ImportError is handled above, so this branch covers everything else.
    print(f"Error loading model: {e}")
    print("\nThis is expected if you're offline and the model isn't cached locally.")
    print("The code above demonstrates how production systems use Hugging Face.")
