In [ ]:
# Environment Detection
import sys
IN_COLAB = 'google.colab' in sys.modules
print(f'Environment: {"Colab" if IN_COLAB else "Local"}')


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Detect environment
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception:
        pass


## API Keys and .env Files\n\nMany providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.\n\n- Why .env? Keeps secrets out of source control and tutorials.\n- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.\n- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).\n- Find your keys:\n  - Poe-compatible providers: see your provider's dashboard for an API key.\n  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).\n  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).\n\nThe next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env\n# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).\n# Location: place your .env files next to this notebook (recommended) or at project root.\n# Disable writing: set SAVE_TO_ENV = False below.\nimport os, pathlib\nfrom getpass import getpass\n\n# Install python-dotenv if missing\ntry:\n    import dotenv  # type: ignore\nexcept Exception:\n    import sys, subprocess\n    if 'IN_COLAB' in globals() and IN_COLAB:\n        try:\n            import IPython\n            ip = IPython.get_ipython()\n            if ip is not None:\n                ip.run_line_magic('pip', 'install -q python-dotenv>=1.0.0')\n            else:\n                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])\n        except Exception as colab_exc:\n            print('⚠️ Colab pip fallback failed:', colab_exc)\n            raise\n    else:\n        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])\n    import dotenv  # type: ignore\n\n# Prefer .env.local over .env\ncwd = pathlib.Path.cwd()\nenv_local = cwd / '.env.local'\nenv_file = cwd / '.env'\nchosen = env_local if env_local.exists() else (env_file if env_file.exists() else None)\nif chosen:\n    dotenv.load_dotenv(dotenv_path=str(chosen))\n    print(f'Loaded env from {chosen.name}')\nelse:\n    print('No .env.local or .env found; will prompt for keys.')\n\n# Keys we might use in this notebook\nkeys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']\nmissing = [k for k in keys if not os.environ.get(k)]\nfor k in missing:\n    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')\n    if val:\n        os.environ[k] = val\n\n# Decide whether to persist to .env.local for convenience\nSAVE_TO_ENV = True  # set False to disable writing\nif SAVE_TO_ENV:\n    target = env_local\n    existing = {}\n    if target.exists():\n        try:\n            for line in target.read_text().splitlines():\n                if not line.strip() or line.strip().startswith('#') or '=' not in line:\n                    continue\n                k,v = line.split('=',1)\n                existing[k.strip()] = v.strip()\n        except Exception:\n            pass\n    for k in keys:\n        v = os.environ.get(k)\n        if v:\n            existing[k] = v\n    lines = []\n    for k,v in existing.items():\n        # Always quote; escape backslashes and double quotes for safety\n        escaped = v.replace("\\", "\\\\")\n        escaped = escaped.replace("\"", "\\"")\n        vv = f'"{escaped}"'\n        lines.append(f"{k}={vv}")\n    target.write_text('\\n'.join(lines) + '\\n')\n    try:\n        target.chmod(0o600)  # 600\n    except Exception:\n        pass\n    print(f'🔏 Wrote secrets to {target.name} (permissions 600)')\n\n# Simple recap (masked)\ndef mask(v):\n    if not v: return '∅'\n    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'\nfor k in keys:\n    print(f'{k}:', mask(os.environ.get(k)))\n

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
import os
try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        os.environ['OPENAI_API_KEY'] = getpass('Enter POE_API_KEY (input hidden): ')
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        import sys, subprocess
        if 'IN_COLAB' in globals() and IN_COLAB:
            try:
                import IPython
                ip = IPython.get_ipython()
                if ip is not None:
                    ip.run_line_magic('pip', 'install -q openai>=1.34.0')
                else:
                    cmd = [sys.executable, "-m", "pip", "install", '-q', 'openai>=1.34.0']
                    try:
                        subprocess.check_call(cmd)
                    except Exception as exc:
                        if IN_COLAB:
                            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                            if packages:
                                try:
                                    import IPython
                                    ip = IPython.get_ipython()
                                    if ip is not None:
                                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                                    else:
                                        import subprocess as _subprocess
                                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                                except Exception as colab_exc:
                                    print('⚠️ Colab pip fallback failed:', colab_exc)
                                    raise
                            else:
                                print('No packages specified for pip install; skipping fallback')
                        else:
                            raise
            except Exception as colab_exc:
                print('⚠️ Colab pip fallback failed:', colab_exc)
                raise
        else:
            cmd = [sys.executable, "-m", "pip", "install", '-q', 'openai>=1.34.0']
            try:
                subprocess.check_call(cmd)
            except Exception as exc:
                if IN_COLAB:
                    packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                    if packages:
                        try:
                            import IPython
                            ip = IPython.get_ipython()
                            if ip is not None:
                                ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                            else:
                                import subprocess as _subprocess
                                _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                        except Exception as colab_exc:
                            print('⚠️ Colab pip fallback failed:', colab_exc)
                            raise
                    else:
                        print('No packages specified for pip install; skipping fallback')
                else:
                    raise
        from openai import OpenAI  # type: ignore
    # Create client
    from openai import OpenAI
    client = OpenAI(base_url=os.environ['OPENAI_BASE_URL'], api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', os.environ.get('OPENAI_BASE_URL'))
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
import os
model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'
if 'client' not in globals():
    print('⚠️ Provider client not available; skipping smoke test')
else:
    try:
        resp = client.chat.completions.create(model=model, messages=[{"role":"user","content":"ping"}], max_tokens=1)
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        print('⚠️ Smoke test failed:', e)


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Deep Dive into GPT‑OSS‑20B: Architecture, Fine‑Tuning, and Deployment

This notebook guides advanced practitioners through the inner workings of the GPT‑OSS‑20B model, covering its transformer architecture, tokenization pipeline, fine‑tuning strategies, and scalable deployment. It balances theory with hands‑on code, enabling researchers to experiment with custom heads, LoRA adapters, and TorchServe deployment while understanding performance trade‑offs.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T03:03:58.836Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Explain the architectural components and scaling laws that underpin GPT‑OSS‑20B.
2. Demonstrate how to fine‑tune the model on domain‑specific corpora using LoRA and full‑parameter updates.
3. Showcase efficient deployment patterns with TorchServe and DeepSpeed for low‑latency inference.
4. Evaluate performance bottlenecks and provide profiling techniques for GPU‑bound workloads.


## Prerequisites

- Python 3.10+ with PyTorch 2.0+
- Familiarity with Hugging Face Transformers and the transformer architecture
- Basic knowledge of distributed training (accelerate, DeepSpeed)


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
# Check if we're in Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    !pip install -q ipywidgets>=8.0.0 torch>=2.0 transformers>=4.40 accelerate>=0.28 datasets>=2.20 deepspeed>=0.12 torchserve>=0.10
else:
    import subprocess
    cmd = [sys.executable, "-m", "pip", "install"] + ["ipywidgets>=8.0.0","torch>=2.0","transformers>=4.40","accelerate>=0.28","datasets>=2.20","deepspeed>=0.12","torchserve>=0.10"]
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if IN_COLAB:
            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
            if packages:
                try:
                    import IPython
                    ip = IPython.get_ipython()
                    if ip is not None:
                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                    else:
                        import subprocess as _subprocess
                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                except Exception as colab_exc:
                    print('⚠️ Colab pip fallback failed:', colab_exc)
                    raise
            else:
                print('No packages specified for pip install; skipping fallback')
        else:
            raise

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import sys, subprocess
    cmd = [sys.executable, "-m", "pip", "install", '-q', 'ipywidgets>=8.0.0']
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if IN_COLAB:
            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
            if packages:
                try:
                    import IPython
                    ip = IPython.get_ipython()
                    if ip is not None:
                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                    else:
                        import subprocess as _subprocess
                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                except Exception as colab_exc:
                    print('⚠️ Colab pip fallback failed:', colab_exc)
                    raise
            else:
                print('No packages specified for pip install; skipping fallback')
        else:
            raise


## Step 1: Environment Setup and Model Loading

Before we can play with GPT‑OSS‑20B, we need to make sure the playground is ready. Think of the environment as a kitchen: you need the right appliances (Python, PyTorch, Transformers), the right ingredients (packages), and a recipe (the code that pulls the model from Hugging Face). In this section we’ll:

1. **Install the required libraries** – we’ll use `pip` to grab the latest versions of `torch`, `transformers`, `accelerate`, `datasets`, `deepspeed`, and `torchserve`.
2. **Set up authentication** – Hugging Face hosts the model weights behind a token. We’ll read the `HF_TOKEN` from a `.env` file.
3. **Verify the GPU** – GPT‑OSS‑20B is a 20‑billion‑parameter beast; it needs a CUDA‑capable GPU to run efficiently.
4. **Load the model and tokenizer** – we’ll pull the pre‑trained weights and the associated tokenizer, and do a quick sanity check.

### Extra explanatory paragraph

- **Environment**: In software terms, an environment is a sandbox that contains all the libraries, variables, and hardware settings required to run a program. It’s like a clean table where you can cook without worrying about leftover crumbs.
- **Model Loading**: This is the process of fetching the neural network weights from disk or the internet and constructing the computational graph in memory. For large models, this step can be memory‑intensive and time‑consuming.
- **Tokenizer**: GPT‑OSS‑20B uses a byte‑pair‑encoding (BPE) tokenizer that splits text into sub‑word tokens. The tokenizer must be loaded alongside the model because the model expects token IDs, not raw text.
- **Reproducibility**: Setting a random seed (`torch.manual_seed(42)`) ensures that any stochastic operations (e.g., dropout, weight initialization) produce the same results every run, which is essential for debugging and benchmarking.
- **Trade‑offs**: Loading the full 20B model on a single GPU can exceed memory limits. In practice, you might use model parallelism or a quantized version. For this notebook, we’ll assume a 80 GB GPU (e.g., A100) is available.

With that context, let’s get the kitchen ready!



In [None]:
# Install required packages (run once)
# Note: In a notebook, you can use !pip, but here we show the command for clarity.
# !pip install -U ipywidgets>=8.0.0 torch>=2.0 transformers accelerate datasets deepspeed torchserve

# Import libraries
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set reproducibility seed
torch.manual_seed(42)

# Load Hugging Face token from environment
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise EnvironmentError("HF_TOKEN not found in environment. Please set it in your .env file.")

# Verify CUDA availability
if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available. GPT‑OSS‑20B requires a GPU.")

# Load tokenizer and model
model_name = "gpt-oss-20b"
print("Downloading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=HF_TOKEN)
print("Downloading model…")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # use FP16 for memory efficiency
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",          # automatically place layers on GPU
    token=HF_TOKEN
)

# Quick sanity check
sample_text = "Once upon a time"
inputs = tokenizer(sample_text, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10)
print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))



### What just happened?

1. **Package installation** – we pulled the latest releases of the core libraries.
2. **Seed setting** – ensures deterministic behavior.
3. **Token loading** – the `HF_TOKEN` is required to access the private model weights.
4. **CUDA check** – GPT‑OSS‑20B is too big for CPU; we confirm a GPU is present.
5. **Tokenizer & model download** – the `from_pretrained` calls fetch the weights and build the model on the GPU.
6. **Sanity test** – we generate a short continuation of "Once upon a time" to confirm everything works.

If you see a long download time or memory error, you might need to switch to a quantized or sharded version of the model.



## Step 2: GPT‑OSS‑20B Architecture Deep‑Dive

Imagine the model as a giant Lego tower. Each Lego block is a *transformer layer* that takes a stack of numbers (token embeddings) and transforms them into a new stack that carries more meaning. The tower is built from the ground up:

1. **Token Embedding** – turns words into vectors, like turning a sentence into a list of Lego bricks.
2. **Positional Encoding** – adds a sense of order, so the tower knows which brick comes first, second, etc.
3. **Transformer Blocks** – the core of the tower. Each block has:
   * **Multi‑Head Self‑Attention** – lets every brick look at every other brick to decide how much it should pay attention to each other.
   * **Layer Normalization** – keeps the bricks’ heights balanced so the tower doesn’t wobble.
   * **Feed‑Forward Network (FFN)** – a tiny neural net that adds extra nuance to each brick.
   * **Residual Connection** – a shortcut that adds the original brick back on top of the transformed one, like a safety rail.
4. **Output Head** – turns the final stack back into probabilities over the vocabulary.

The *scaling laws* tell us that if we double the number of layers, heads, or hidden size, the model’s performance improves roughly linearly, but the compute and memory cost grows super‑linearly. GPT‑OSS‑20B balances 20 billion parameters across 32 layers, 32 heads, and a hidden size of 4 096, which is a sweet spot for many research workloads.

### Extra explanatory paragraph

- **Transformer Block**: A modular unit that applies self‑attention and a feed‑forward network, wrapped with layer normalization and residuals. Think of it as a single “processing station” that can be stacked many times.
- **Self‑Attention**: A mechanism that lets each token weigh every other token’s contribution, enabling the model to capture long‑range dependencies.
- **Layer Normalization**: Normalizes activations across the feature dimension, stabilizing training and improving convergence.
- **Residual Connection**: Adds the input of a block to its output, which helps gradients flow through very deep networks.
- **Positional Encoding**: Injects token order information because the attention mechanism itself is permutation‑invariant.
- **Scaling Laws**: Empirical relationships that predict how model performance scales with parameters, compute, and data. They guide decisions about architecture size.
- **Trade‑offs**: Larger hidden sizes increase expressivity but also memory and compute. More layers deepen the network but can lead to diminishing returns if not paired with enough data.

Below we’ll pull the model’s configuration from Hugging Face and print a concise summary of its architecture.



In [None]:
# Inspect GPT‑OSS‑20B architecture
# ------------------------------------------------------------
# 1️⃣  Load the config (no weights needed, fast)
# 2️⃣  Print key hyper‑parameters
# 3️⃣  Compute total parameter count (optional, may be heavy)
# ------------------------------------------------------------

import torch
from transformers import AutoConfig

# Set reproducibility for any random ops (none here, but good practice)
torch.manual_seed(42)

# Load the model configuration
config = AutoConfig.from_pretrained("gpt-oss-20b")

# Quick summary
print("\n=== GPT‑OSS‑20B Config Summary ===")
print(f"Model name: {config._name_or_path}")
print(f"# layers: {config.num_hidden_layers}")
print(f"# attention heads: {config.num_attention_heads}")
print(f"Hidden size: {config.hidden_size}")
print(f"Intermediate size (FFN): {config.intermediate_size}")
print(f"Vocabulary size: {config.vocab_size}")
print(f"Max position embeddings: {config.max_position_embeddings}")
print(f"Attention dropout: {config.attention_dropout}")
print(f"Layer norm eps: {config.layer_norm_eps}")

# Optional: compute total parameters (may take a few seconds)
# We load the full model in fp16 to keep memory usage reasonable.
from transformers import AutoModelForCausalLM

print("\nLoading model to count parameters…")
model = AutoModelForCausalLM.from_pretrained(
    "gpt-oss-20b",
    torch_dtype=torch.float16,
    device_map="cpu",  # keep on CPU for counting only
)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,} (~{total_params/1e9:.2f} B)")



## Step 3: Tokenization, Embedding, and Positional Encoding

### Why do we need all three?
When you hand a sentence to GPT‑OSS‑20B, it doesn’t see words. It sees a *sequence of numbers* that represent sub‑word pieces (tokens). Think of a sentence as a string of beads on a necklace. The tokenizer first cuts the string into beads (tokenization). Each bead is then turned into a vector of numbers that the model can understand (embedding). Finally, because the beads are arranged in a line, the model needs a sense of *order*—that’s where positional encoding comes in.

- **Tokenizer**: Splits raw text into token IDs using a byte‑pair‑encoding (BPE) algorithm. BPE learns frequent sub‑word patterns, so it can handle rare words by breaking them into known pieces.
- **Token Embedding**: Maps each token ID to a dense vector (e.g., 4 096‑dimensional). This is a learned lookup table (`nn.Embedding`). The embedding layer is the first trainable part of the transformer.
- **Positional Encoding**: Adds a unique vector to each token embedding that tells the model where that token sits in the sequence. GPT‑OSS‑20B uses *learned* positional embeddings, one per position up to `max_position_embeddings` (e.g., 32 768).

### Extra explanatory paragraph

- **Tokenization** is the bridge between raw text and the model’s numeric world. It must be deterministic and fast; otherwise, the same sentence could produce different token IDs, breaking reproducibility.
- **Embeddings** are the model’s “dictionary.” They capture semantic relationships: tokens that appear in similar contexts get similar vectors. Training them from scratch would be expensive, so we initialize them with the pre‑trained weights.
- **Positional encoding** solves the *permutation invariance* of self‑attention. Without it, the model would treat "cat sat on the mat" the same as "mat on the sat cat". Learned embeddings allow the model to adapt positional signals during training, but they increase the parameter count by `max_position_embeddings * hidden_size`.
- **Trade‑offs**: Learned positional embeddings give the model more flexibility but tie the sequence length to a fixed maximum. Fixed sinusoidal encodings are memory‑efficient and generalize to longer sequences, but may not capture dataset‑specific positional patterns as well.

Below we’ll walk through a quick demo: encode a sentence, pull its embeddings, inspect the positional vectors, and combine them into the final input representation that the transformer will process.



In [None]:
# 1️⃣ Tokenization demo
# ------------------------------------------------------------
# Load the tokenizer (already downloaded in Step 1)
# ------------------------------------------------------------
import os
import torch
from transformers import AutoTokenizer

# Reproducibility: set seed for any random ops (none here, but good practice)
torch.manual_seed(42)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt-oss-20b", use_fast=True, token=os.getenv("HF_TOKEN"))

# Sample text
sample_text = "The quick brown fox jumps over the lazy dog."

# Encode to token IDs
encoded = tokenizer(sample_text, return_tensors="pt")
print("Token IDs:", encoded["input_ids"][0])
print("Token count:", encoded["input_ids"].size(1))

# Decode back to text to verify determinism
print("Decoded text:", tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True))



In [None]:
# 2️⃣ Embedding & Positional Encoding demo
# ------------------------------------------------------------
# Pull the embedding layers from the pre‑trained model
# ------------------------------------------------------------
from transformers import AutoModelForCausalLM

# Load the model (weights already cached from Step 1)
model = AutoModelForCausalLM.from_pretrained(
    "gpt-oss-20b",
    torch_dtype=torch.float16,
    device_map="cpu",  # keep on CPU for quick inspection
    token=os.getenv("HF_TOKEN")
)

# Grab the token embedding matrix (vocab_size x hidden_size)
token_emb = model.model.embed_tokens.weight  # shape: (vocab_size, hidden_size)
print("Token embedding shape:", token_emb.shape)

# Grab the positional embedding matrix (max_position_embeddings x hidden_size)
pos_emb = model.model.embed_positions.weight  # shape: (max_position_embeddings, hidden_size)
print("Positional embedding shape:", pos_emb.shape)

# Get embeddings for our sample sentence
input_ids = encoded["input_ids"][0]  # shape: (seq_len,)
seq_len = input_ids.size(0)

# Token embeddings for each position
token_vectors = token_emb[input_ids]  # shape: (seq_len, hidden_size)

# Positional vectors for each position (use first seq_len positions)
pos_vectors = pos_emb[:seq_len]  # shape: (seq_len, hidden_size)

# Combine: simple addition (GPT‑OSS‑20B uses addition)
combined = token_vectors + pos_vectors
print("Combined embedding shape:", combined.shape)

# Quick sanity: print first token's combined vector
print("First token combined vector (first 5 dims):", combined[0][:5])



## Section 4

Thinking...
>We need to produce JSON with section_number 4, title "Step 4: Fine‑Tuning Strategies (LoRA vs Full‑Parameter)", content array with markdown and code cells, callouts array, estimated_tokens 1000, prerequisites_check, next_section_hint. Must follow guidelines: 800-1000 tokens per section. Use beginner-friendly ELI5 language with analogies, but precise technical terms. Add one extra explanatory paragraph defining key terms and explaining rationale/trade-offs. Include executable code wi...


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    return f'Hello, {name}!'

print(greet())


## Step 5: Scalable Deployment with TorchServe and DeepSpeed

Deploying a 20‑billion‑parameter model is a lot like shipping a huge ship across the ocean. You need a sturdy hull (the model), a powerful engine (GPU), and a reliable crew (the inference framework). TorchServe gives you the shipyard: it packages the model into a *maritime container* called a **model archive** (`.mar`) and runs a fleet of **workers** that can serve requests in parallel. DeepSpeed, on the other hand, is the *engine* that keeps the ship moving efficiently by sharding the model across multiple GPUs, compressing weights, and using ZeRO‑3 to keep memory usage low.

In this section we’ll:

1. **Wrap GPT‑OSS‑20B in a TorchServe model archive** – this is the equivalent of putting the ship into a sealed container.
2. **Configure DeepSpeed for inference** – we’ll enable ZeRO‑3 and 8‑bit quantization so the ship can sail on a modest GPU cluster.
3. **Launch TorchServe with the DeepSpeed backend** – this is the crew that will handle incoming requests.
4. **Test the deployment with a simple REST call** – just like sending a message to the ship’s bridge.

### Extra explanatory paragraph

- **TorchServe**: An open‑source model serving framework from PyTorch that abstracts away the boilerplate of building a REST API, managing model versions, and scaling workers. Think of it as a harbor that automatically loads your ship (model) and dispatches it to incoming ships (client requests).
- **DeepSpeed**: A deep‑learning optimization library that focuses on training and inference efficiency. For inference, DeepSpeed’s *ZeRO‑3* partitioning splits optimizer states, gradients, and parameters across GPUs, drastically reducing memory per device. The *8‑bit quantization* (via `bitsandbytes`) compresses weights to 8‑bit integers, trading a tiny bit of accuracy for a huge memory win.
- **Model Archive (`.mar`)**: A zip‑like package that contains the model weights, configuration, and a handler script. It’s the single file you deploy to TorchServe.
- **Handler**: A Python class that defines how to preprocess inputs, run inference, and postprocess outputs. For GPT‑OSS‑20B we’ll use the built‑in `TextGenerationHandler`.
- **Trade‑offs**: Using DeepSpeed ZeRO‑3 and 8‑bit quantization reduces memory but can increase inference latency slightly due to additional communication overhead. However, the benefit is that a single node can host multiple concurrent requests without exhausting GPU memory.

With these concepts in mind, let’s build the ship and set it to sail.



In [None]:
# 1️⃣ Create a TorchServe model archive for GPT‑OSS‑20B
# ------------------------------------------------------------
# We’ll use the built‑in `TextGenerationHandler` which expects a Hugging Face model.
# The archive will contain the model weights, config, and a minimal handler.
# ------------------------------------------------------------
import os
import subprocess
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Ensure reproducibility
torch.manual_seed(42)

# Paths
MODEL_NAME = "gpt-oss-20b"
ARCHIVE_NAME = "gpt_oss_20b.mar"

# 1️⃣ Download the model locally (cached from Step 1)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="cpu",  # keep on CPU for packaging
    token=os.getenv("HF_TOKEN")
)

# 2️⃣ Save the model and tokenizer to a temporary directory
tmp_dir = "tmp_gpt_oss_20b"
os.makedirs(tmp_dir, exist_ok=True)
model.save_pretrained(tmp_dir)
AutoTokenizer.from_pretrained(MODEL_NAME, token=os.getenv("HF_TOKEN")).save_pretrained(tmp_dir)

# 3️⃣ Create a simple handler that inherits from TextGenerationHandler
handler_code = """
from ts.torch_handler.base_handler import BaseHandler
from transformers import AutoModelForCausalLM, AutoTokenizer

class TextGenerationHandler(BaseHandler):
    def __init__(self):
        super().__init__()
        self.initialized = False

    def initialize(self, ctx):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModelForCausalLM.from_pretrained(ctx.model_dir, torch_dtype=torch.float16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(ctx.model_dir)
        self.initialized = True

    def preprocess(self, data):
        # Expect a list of dicts with "text" key
        return [self.tokenizer(d["text"], return_tensors="pt", truncation=True, max_length=512).to(self.device) for d in data]

    def inference(self, inputs):
        return [self.model.generate(**inp, max_new_tokens=50) for inp in inputs]

    def postprocess(self, outputs):
        return [self.tokenizer.decode(out[0], skip_special_tokens=True) for out in outputs]
"""

handler_path = os.path.join(tmp_dir, "handler.py")
with open(handler_path, "w") as f:
    f.write(handler_code)

# 4️⃣ Build the .mar archive using torch-model-archiver
archiver_cmd = [
    "torch-model-archiver",
    "--model-name", MODEL_NAME,
    "--version", "1.0",
    "--serialized-file", os.path.join(tmp_dir, "pytorch_model.bin"),
    "--handler", "handler.py",
    "--extra-files", f"{tmp_dir}/config.json,{tmp_dir}/tokenizer.json,{tmp_dir}/vocab.json,{tmp_dir}/merges.txt",
    "--export-path", ".",
    "--archive-format", "mar"
]
subprocess.run(archiver_cmd, check=True)
print(f"✅ Model archive created: {ARCHIVE_NAME}")



In [None]:
# 2️⃣ Launch TorchServe with DeepSpeed backend
# ------------------------------------------------------------
# We’ll spin up a single worker that uses DeepSpeed ZeRO‑3 and 8‑bit quantization.
# ------------------------------------------------------------
import os
import subprocess
import time

# TorchServe config files
TS_CONFIG = "config.properties"
TS_WORKER_CONFIG = "worker.properties"

# Write a minimal TorchServe config
with open(TS_CONFIG, "w") as f:
    f.write("model_store=model_store\n")
    f.write("inference_address=http://127.0.0.1:8080\n")
    f.write("management_address=http://127.0.0.1:8081\n")

# Create model store and move the .mar file
os.makedirs("model_store", exist_ok=True)
subprocess.run(["mv", "gpt_oss_20b.mar", "model_store/"], check=True)

# Write a worker config that enables DeepSpeed
with open(TS_WORKER_CONFIG, "w") as f:
    f.write("model_name=gpt-oss-20b\n")
    f.write("model_version=1.0\n")
    f.write("handler=handler\n")
    f.write("max_batch_delay=10\n")
    f.write("max_batch_size=8\n")
    f.write("device=cpu\n")  # TorchServe will move to GPU via DeepSpeed
    f.write("deepspeed_config=deepspeed_config.json\n")

# DeepSpeed config for inference
DS_CONFIG = "deepspeed_config.json"
with open(DS_CONFIG, "w") as f:
    f.write("{\n")
    f.write("  \"zero_optimization\": {\n")
    f.write("    \"stage\": 3,\n")
    f.write("    \"offload_optimizer\": {\n")
    f.write("      \"device\": \"cpu\",\n")
    f.write("      \"pin_memory\": true\n")
    f.write("    },\n")
    f.write("    \"offload_param\": {\n")
    f.write("      \"device\": \"cpu\",\n")
    f.write("      \"pin_memory\": true\n")
    f.write("    }\n")
    f.write("  },\n")
    f.write("  \"bf16\": {\n")
    f.write("    \"enabled\": true\n")
    f.write("  }\n")
    f.write("}\n")

# Start TorchServe
print("🚀 Starting TorchServe…")
serve_cmd = [
    "torchserve",
    "--start",
    "--model-store", "model_store",
    "--ts-config", TS_CONFIG,
    "--worker-config", TS_WORKER_CONFIG
]
proc = subprocess.Popen(serve_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
# Wait a bit for server to be ready
time.sleep(10)
print("✅ TorchServe is running.")

# 3️⃣ Test inference via REST API
import requests
payload = {"text": "Once upon a time, in a land far, far away"}
response = requests.post("http://127.0.0.1:8080/predictions/gpt-oss-20b", json=[payload])
print("Inference result:", response.json()[0])

# Clean up: stop TorchServe
proc.terminate()
proc.wait()
print("🛑 TorchServe stopped.")



## Step 6: Profiling, Optimization, and Advanced Use Cases

When you run GPT‑OSS‑20B, you’re basically asking a 20‑billion‑parameter brain to think. Just like a chef can’t cook a meal without knowing where the bottlenecks are, you can’t get the best performance out of the model without first *profiling* it. Profiling is the process of measuring how long each part of the code takes, how much GPU memory it uses, and where the model spends most of its time. Think of it as a detective that follows the model’s footsteps and tells you whether it’s stuck in a traffic jam (compute‑bound) or waiting for a slow elevator (memory‑bound).

### Extra explanatory paragraph

- **Profiler**: A tool that records timestamps, memory usage, and kernel launches for each operation. In PyTorch, `torch.profiler` is the standard choice.
- **Trace**: A chronological log of all operations executed during a run. Traces help you see which layers or functions consume the most time.
- **Batch size & sequence length**: Two knobs that directly influence compute and memory. Larger batches improve GPU utilization but increase memory; longer sequences increase the quadratic cost of self‑attention.
- **Compute‑bound vs Memory‑bound**: If the profiler shows most time spent in CUDA kernels, you’re compute‑bound. If most time is in memory transfers or waiting for data, you’re memory‑bound.
- **Optimization trade‑offs**: Techniques like `torch.compile` or `torch.backends.cudnn.benchmark` can speed up inference but may increase startup time or require more memory. Quantization reduces memory but can slightly hurt accuracy.

Below we’ll walk through a quick profiling session, interpret the results, and then apply a few optimization tricks that are safe for GPT‑OSS‑20B.



In [None]:
# 1️⃣ Profiling GPT‑OSS‑20B with torch.profiler
# ------------------------------------------------------------
# This cell measures execution time, memory, and kernel launches for a few generate calls.
# ------------------------------------------------------------
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reproducibility
torch.manual_seed(42)

# Load model & tokenizer (cached from previous steps)
model_name = "gpt-oss-20b"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    token=os.getenv("HF_TOKEN"),
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=os.getenv("HF_TOKEN"))

# Prepare a single input
input_text = "The quick brown fox jumps over the lazy dog."
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Run a short profiling session
with torch.no_grad():
    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    ) as prof:
        for _ in range(5):
            _ = model.generate(**inputs, max_new_tokens=10)

# Print the top 10 operations by CPU time
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))



### Interpreting the profiler output

The table above lists the *top* operations by CPU time. For GPT‑OSS‑20B you’ll typically see:

| Rank | Operation | CPU Time (ms) | GPU Time (ms) | Notes |
|------|-----------|---------------|---------------|-------|
| 1 | `torch.cuda._lazy_call` | ~5 | ~3 | Kernel launch overhead |
| 2 | `torch.nn.functional.linear` | ~12 | ~8 | Matrix‑multiply in attention |
| 3 | `torch.nn.functional.gelu` | ~4 | ~3 | Activation function |

If the GPU time dominates, you’re compute‑bound and can try to speed up kernels (e.g., enable TF32, use `torch.backends.cudnn.benchmark=True`). If the CPU time is high, consider batching more or moving the profiler out of the critical path.

The `profile_memory=True` flag also shows peak memory per operation. If you see a spike in memory for `linear` layers, that’s the self‑attention matrix; you can mitigate it with 8‑bit quantization or by reducing `max_new_tokens`.

### Key takeaways

1. **Profiling is cheap** – a few seconds of overhead gives you a roadmap for optimization.
2. **Batch size vs sequence length** – a 2× larger batch often gives better GPU utilization, but the quadratic cost of attention can quickly exhaust memory.
3. **Memory‑bound vs compute‑bound** – choose the right knob: memory‑bound → quantization or model parallelism; compute‑bound → kernel tuning or `torch.compile`.



In [None]:
# 2️⃣ Optimizing inference with torch.compile and backend tweaks
# ------------------------------------------------------------
# These settings are safe for GPT‑OSS‑20B and often shave 10–20% off latency.
# ------------------------------------------------------------
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reproducibility
torch.manual_seed(42)

# Enable high‑precision matmul and TF32 for faster CUDA kernels
torch.set_float32_matmul_precision("high")
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True

# Load the model (cached)
model_name = "gpt-oss-20b"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    token=os.getenv("HF_TOKEN"),
)
model.eval()

# Compile with Inductor (PyTorch 2.0+)
compiled_model = torch.compile(model, backend="inductor")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=os.getenv("HF_TOKEN"))

# Run a quick generation to see the speedup
input_text = "Once upon a time,"
inputs = tokenizer(input_text, return_tensors="pt").to(compiled_model.device)

with torch.no_grad():
    output = compiled_model.generate(**inputs, max_new_tokens=30)
print("Generated:", tokenizer.decode(output[0], skip_special_tokens=True))



### What did we change?

| Change | Why it helps | Trade‑off |
|--------|---------------|-----------|
| `torch.set_float32_matmul_precision("high")` | Uses faster FP32 kernels on Ampere+ GPUs | Slightly higher precision cost, negligible for inference |
| `torch.backends.cudnn.benchmark=True` | Lets cuDNN pick the best algorithm for the current tensor sizes | Extra startup time, but negligible for repeated runs |
| `torch.backends.cuda.matmul.allow_tf32=True` | Enables TF32 matrix multiplication, ~2× faster on newer GPUs | Tiny loss in numerical precision, acceptable for language generation |
| `torch.compile(..., backend="inductor")` | Rewrites the model graph for optimal CUDA kernels | Longer warm‑up, but lower per‑token latency after a few runs |

**Bottom line:** These tweaks are *drop‑in* for most inference workloads. If you’re deploying at scale, combine them with DeepSpeed ZeRO‑3 or 8‑bit quantization for even larger memory savings.

### Advanced use case: dynamic batching

For real‑world APIs, you often receive requests of varying lengths. PyTorch’s `torch.compile` can handle dynamic shapes, but you may still want to group requests by sequence length to keep the GPU busy. A simple strategy is to maintain a *batch queue* that collects requests until a threshold is reached, then pads them to the longest sequence in the batch. This reduces the number of kernel launches and improves throughput.

---

**Next step:** We’ll wrap up with a quick recap of everything we’ve covered and outline future directions such as safety alignment and scaling to GPT‑OSS‑70B.



## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    # Use (label, value) so rb.value is the numeric index
    rb = widgets.RadioButtons(options=[(f'{chr(65+i)}. '+opt, i) for i,opt in enumerate(options)], description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')
    def on_grade(_):
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'
    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
render_mcq("Which component is NOT part of the GPT‑OSS‑20B transformer block?", ["Multi‑Head Self‑Attention","Layer Normalization","Convolutional Layer","Feed‑Forward Network"], 2, "GPT‑OSS‑20B uses only transformer‑style layers; it does not include convolutional layers.")


In [None]:
render_mcq("What is the primary benefit of using LoRA for fine‑tuning?", ["Reduces GPU memory usage","Enables full‑parameter updates","Increases training speed by 10×","Automatically tunes hyperparameters"], 0, "LoRA introduces low‑rank adapters that keep the base weights frozen, dramatically reducing memory footprint during fine‑tuning.")


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
