In [ ]:
# Environment Detection
# Records whether the kernel runs on Google Colab; later cells read IN_COLAB.
import sys

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print('Environment: Colab')
else:
    print('Environment: Local')


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Detect environment: the check is whether the google.colab package is loaded.
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception as widget_exc:
        # Best effort: the notebook still runs without the custom widget
        # manager, but report the reason instead of hiding it.
        print('⚠️ Could not enable the Colab custom widget manager:', widget_exc)


## API Keys and .env Files

Many providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.

- Why .env? Keeps secrets out of source control and tutorials.
- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. If both exist, only `.env.local` is loaded.
- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).
- Find your keys:
  - Poe-compatible providers: see your provider's dashboard for an API key.
  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).
  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).

The next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env
# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).
# Location: place your .env files next to this notebook (recommended) or at project root.
# Disable writing: set SAVE_TO_ENV = False below.
import importlib
import os, pathlib
from getpass import getpass

# Install python-dotenv if missing
try:
    import dotenv  # type: ignore
except ImportError:
    import sys, subprocess
    # The requirement is passed as one argv entry and no shell is involved,
    # so the '>' in '>=' cannot be interpreted as an output redirect.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
    importlib.invalidate_caches()  # make the freshly installed package importable
    import dotenv  # type: ignore

# Prefer .env.local over .env (only the first one found is loaded)
cwd = pathlib.Path.cwd()
env_local = cwd / '.env.local'
env_file = cwd / '.env'
chosen = env_local if env_local.exists() else (env_file if env_file.exists() else None)
if chosen:
    dotenv.load_dotenv(dotenv_path=str(chosen))
    print(f'Loaded env from {chosen.name}')
else:
    print('No .env.local or .env found; will prompt for keys.')

# Keys we might use in this notebook
keys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']
missing = [name for name in keys if not os.environ.get(name)]
for name in missing:
    entered = getpass(f'Enter {name} (hidden, press Enter to skip): ')
    if entered:
        os.environ[name] = entered

# Decide whether to persist to .env.local for convenience
SAVE_TO_ENV = True  # set False to disable writing
if SAVE_TO_ENV:
    target = env_local
    existing = {}
    if target.exists():
        try:
            # Parse with python-dotenv so values come back unquoted and
            # unescaped; re-quoting them below then round-trips cleanly
            # instead of stacking another layer of quotes on every run.
            parsed = dotenv.dotenv_values(str(target), interpolate=False)
            existing = {name: value for name, value in parsed.items() if value is not None}
        except Exception as parse_exc:
            print(f'⚠️ Could not parse {target.name}; rewriting it from current values:', parse_exc)
    for name in keys:
        value = os.environ.get(name)
        if value:
            existing[name] = value
    if existing:
        lines = []
        for name, value in existing.items():
            # Always quote; escape backslashes and double quotes for safety
            escaped = value.replace('\\', '\\\\').replace('"', '\\"')
            lines.append(f'{name}="{escaped}"')
        # Create the file with 0600 from the start so the secrets are not
        # briefly readable under the default umask.
        fd = os.open(str(target), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w', encoding='utf-8') as handle:
            handle.write('\n'.join(lines) + '\n')
        try:
            target.chmod(0o600)  # also tightens a pre-existing file
            print(f'🔏 Wrote secrets to {target.name} (permissions 600)')
        except OSError as chmod_exc:
            print(f'🔏 Wrote secrets to {target.name}, but could not set permissions 600:', chmod_exc)
    else:
        print('No secrets to save; .env.local not written.')

# Simple recap (masked)
def mask(v):
    """Return a display-safe form of a secret: '∅' if unset, '•••' if short,
    otherwise the first 3 and last 2 characters around an ellipsis."""
    if not v:
        return '∅'
    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'

for name in keys:
    print(f'{name}:', mask(os.environ.get(name)))

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
import importlib
import os
import subprocess
import sys
from getpass import getpass


def _ensure_openai(requirement='openai>=1.34.0'):
    """Return the ``OpenAI`` client class, installing the package if it is missing.

    The requirement is handed to pip as a single argv entry (no shell), so the
    ``>=`` specifier cannot be parsed as an output redirect.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
        ImportError: if the package still cannot be imported after installing.
    """
    try:
        from openai import OpenAI  # type: ignore
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', requirement])
        importlib.invalidate_caches()
        from openai import OpenAI  # type: ignore
    return OpenAI


try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        entered_key = getpass('Enter POE_API_KEY (input hidden): ')
        if not entered_key:
            # Do not store an empty key; fail here with a clear reason.
            raise ValueError('no API key provided')
        os.environ['OPENAI_API_KEY'] = entered_key
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    OpenAI = _ensure_openai()
    # Create client. OPENAI_BASE_URL is optional: when it is unset, None is
    # passed and the client picks its own default endpoint.
    client = OpenAI(base_url=os.environ.get('OPENAI_BASE_URL'), api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', client.base_url)
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
import os

# ALAIN_MODEL overrides the model name; unset or empty falls back to the default.
model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'

if 'client' in globals():
    # One minimal request: a single user message, capped at one token.
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role":"user","content":"ping"}],
            max_tokens=1,
        )
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        print('⚠️ Smoke test failed:', e)
else:
    print('⚠️ Provider client not available; skipping smoke test')


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Deploying and Fine‑Tuning GPT‑Oss‑20B in Jupyter: A Practitioner’s Guide

This notebook walks experienced ML practitioners through the end‑to‑end process of loading, configuring, and fine‑tuning the 20B‑parameter GPT‑Oss model using Hugging Face libraries and LoRA adapters. It emphasizes practical setup, GPU memory management, and reproducible training workflows.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T03:15:47.070Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Understand the architecture and tokenization of GPT‑Oss‑20B.
2. Load the model from the Hugging Face Hub and configure GPU memory efficiently.
3. Apply LoRA adapters for lightweight fine‑tuning on custom datasets.
4. Evaluate the fine‑tuned model and interpret training metrics.


## Prerequisites

- Python 3.10+ with pip
- Basic knowledge of PyTorch and Hugging Face Transformers
- Access to a GPU with at least 24 GB VRAM


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
import subprocess
import sys

# Check if we're in Colab (later cells read this flag)
IN_COLAB = 'google.colab' in sys.modules

REQUIRED_PACKAGES = [
    "ipywidgets>=8.0.0",
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.28.0",
    "datasets>=2.20.0",
    "peft>=0.6.0",
]

# Run pip through the kernel's interpreter with one argv entry per requirement.
# No shell is involved, so the '>' in '>=' is never treated as an output
# redirect (which is what a bare `!pip install pkg>=1.0` does).
quiet_flag = ["-q"] if IN_COLAB else []
subprocess.check_call([sys.executable, "-m", "pip", "install", *quiet_flag, *REQUIRED_PACKAGES])

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except ImportError:
    import importlib, subprocess, sys
    # Install into the kernel's environment. The requirement is a single argv
    # entry (no shell), so '>=' cannot be parsed as a redirect, and the cell
    # does not depend on IN_COLAB being defined by an earlier cell.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "ipywidgets>=8.0.0"])
    importlib.invalidate_caches()
    import ipywidgets  # type: ignore  # confirms the install actually worked
    print('ipywidgets available')


## Step 1: Introduction and Setup

Welcome to the first step of our journey to fine‑tune the 20‑billion‑parameter GPT‑Oss model. Think of GPT‑Oss as a gigantic library of text—each book is a *parameter* that helps the model understand language. Fine‑tuning is like giving that library a new set of bookmarks so it can focus on a specific topic faster.

In this section we will:

1. **Verify the environment** – make sure you have the right Python version, GPU, and the Hugging Face token.
2. **Install the required libraries** – `torch`, `transformers`, `accelerate`, `datasets`, `peft`, and `ipywidgets`.
3. **Set a random seed** – reproducibility is the bread‑and‑butter of ML experiments.
4. **Check GPU availability** – we’ll confirm that the notebook can see your 24 GB GPU.

### Why these steps matter

* **Reproducibility** – By fixing the random seed and recording the library versions you actually ran with (the install cells only set minimum versions), you can share your notebook and others can get the same results.
* **Memory management** – GPT‑Oss is huge; we’ll pre‑configure PyTorch to use the GPU efficiently.
* **Error handling** – Early checks prevent the frustrating “module not found” or “no GPU” errors that can stall a notebook.

#### Key terms

- **Parameter** – a weight in the neural network; GPT‑Oss has 20 B of them.
- **LoRA** – Low‑Rank Adaptation, a lightweight method that adds a small number of trainable matrices to the model.
- **HF_TOKEN** – your Hugging Face authentication token that grants access to private models.
- **Accelerate** – a library that abstracts device placement and mixed‑precision training.

#### Trade‑offs

- **Pinned versions** guarantee consistency but may miss out on bug fixes or performance improvements in newer releases.
- **Setting a seed** makes every run draw the same pseudo‑random numbers, which is good for debugging but can hide run‑to‑run variance that matters in some research settings (report results over several seeds when that variance is important).

Let’s get started!



In [None]:
# 1️⃣ Install required packages (run once)
# The specifiers below are minimum versions, not exact pins.
# The `--quiet` flag keeps the output tidy.
# If you already have the packages, pip will skip re‑installing.

import subprocess, sys

packages = [
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.28.0",
    "datasets>=2.20.0",
    "peft>=0.6.0",
    "ipywidgets>=8.0.0"
]

# One pip invocation so all constraints are resolved together, rather than
# each install being resolved without knowledge of the others.
subprocess.run([sys.executable, "-m", "pip", "install", "--quiet", *packages], check=True)

# Enable Jupyter widgets (best effort).
# NOTE(review): presumably only needed on the classic Notebook front end;
# a failure here is reported and the notebook continues.
try:
    subprocess.run([sys.executable, "-m", "jupyter", "nbextension", "enable", "--py", "widgetsnbextension"], check=True)
except Exception as e:
    print("Widget extension already enabled or failed to enable:", e)



In [None]:
# 2️⃣ Verify environment and set seed
import os
import random
import sys  # imported here so the cell does not rely on an earlier cell for sys.version

import numpy as np
import torch

# Set a fixed random seed for reproducibility (Python, NumPy, and PyTorch each
# keep their own generator, so each one is seeded).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Print environment info
print("Python version:", sys.version)
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A")

# Check Hugging Face token (an empty string counts as missing)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("⚠️  HF_TOKEN environment variable not found.\n\tSet it with: export HF_TOKEN=YOUR_TOKEN")
else:
    print("✅  HF_TOKEN found.")



### What you should see

- The Python and PyTorch versions should meet the minimums used in this notebook (Python 3.10+, PyTorch 2.0+).
- CUDA should report `True` and list your GPU name (e.g., NVIDIA RTX 4090).
- A message confirming that `HF_TOKEN` is set.

If any of these checks fail, pause the notebook and resolve the issue before moving on.



## Step 2: Model Architecture Overview

Think of GPT‑Oss‑20B as a **tower of bookshelves**. Each shelf holds a *layer* of the model, and each book on that shelf is a *parameter* that helps the model read and write text. The tower is built from the same type of shelf over and over, but the number of shelves (layers) and the size of each shelf (hidden dimension) vary.

### 1️⃣ What makes up a transformer layer?

| Component | Role | Analogy |
|-----------|------|---------|
| **Self‑Attention** | Lets the model look at every word in the sentence and decide which words matter most for predicting the next word. | A group of students in a classroom pointing at each other to decide who should speak next. |
| **Feed‑Forward Network (FFN)** | Applies a small neural net to each word’s representation, adding non‑linearity. | A tiny calculator that tweaks each student’s idea before it’s shared. |
| **LayerNorm** | Normalizes the activations so the network stays stable. | A teacher making sure everyone’s volume is at a comfortable level. |
| **Residual Connection** | Adds the input of the layer back to its output, helping gradients flow. | A safety net that keeps the original idea intact while the student refines it. |

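The **GPT‑Oss‑20B** architecture stacks **24** of these shelves. Each shelf has a hidden size of **2,880** and uses **64** attention heads (grouped‑query attention with 8 key/value heads). Unlike a dense transformer, the feed‑forward part of each shelf is a **mixture‑of‑experts (MoE)** block: 32 experts per layer, of which 4 are active for any given token. That is how the model reaches roughly **21 billion** total parameters while only about 3.6 billion are used per token. Treat the values printed from the model config below as the source of truth, and expect a powerful GPU and careful memory management to be necessary either way.

### 2️⃣ Why this design?

* **Depth (24 layers)** gives the model a long “memory” of past tokens, enabling it to capture complex language patterns.
* **Width (2,880 hidden units)** allows each layer to hold a rich representation of the input.
* **Large number of heads (64)** lets the model attend to many different relationships simultaneously.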

Trade‑offs:

- **More layers** → better performance but higher memory and compute cost.
- **Wider layers** → richer representations but also more parameters per layer.
- **More heads** → finer-grained attention but increased parallelism overhead.

### 3️⃣ Quick sanity check in code

Below we load the model configuration (no weights yet) and print a concise summary. This helps you confirm that the architecture matches the documentation before you start downloading the 20 B weights.

```python
# 1️⃣ Load the config without downloading the huge weights
import os

from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "openai/gpt-oss-20b",    # full Hub repo id (namespace/name)
    trust_remote_code=True,  # NOTE(review): only needed if the repo ships custom code — confirm
    token=os.getenv("HF_TOKEN"),  # `token` replaces the deprecated `use_auth_token`
)

# 2️⃣ Print key hyper‑parameters (standard Transformers attribute names)
print("Model name:", config.model_type)
print("Number of layers (num_hidden_layers):", config.num_hidden_layers)
print("Hidden size (hidden_size):", config.hidden_size)
print("Number of attention heads (num_attention_heads):", config.num_attention_heads)
# A config object has no parameter count; call `model.num_parameters()` on the
# loaded model if you need the exact number.

# 3️⃣ Visualize the layer structure (simple text diagram)
print("\nLayer diagram:\n")
for i in range(config.num_hidden_layers):
    print(f"Layer {i+1:02d}: Attention + FFN + LayerNorm + Residual")
```

> **⚠️ Note**: The `AutoConfig` call only pulls the configuration file, which is tiny (~10 KB). The actual weights are ~80 GB and will be downloaded when you instantiate the model.

### 4️⃣ Key terms defined

- **Transformer**: A neural network architecture that relies on self‑attention to process sequences.
- **Self‑Attention**: Mechanism that lets each token weigh every other token in the sequence.
- **Feed‑Forward Network (FFN)**: A two‑layer MLP applied to each token’s representation.
- **LayerNorm**: Normalization technique that stabilizes training by scaling activations.
- **Residual Connection**: Adds the input of a layer to its output, aiding gradient flow.
- **HF_TOKEN**: Hugging Face authentication token required to download private or large models.

### 5️⃣ Why we expose the config early

By inspecting the config before loading weights, you:

1. **Avoid wasted bandwidth** – if the config shows a mismatch (e.g., wrong number of layers), you can stop early.
2. **Verify reproducibility** – the config contains the exact hyper‑parameters used by the authors.
3. **Plan memory** – knowing `n_embd` and `n_layer` lets you estimate VRAM usage.

---

**Next step**: In Step 3 we’ll actually load the GPT‑Oss‑20B weights onto the GPU, taking care to keep memory usage in check.



In [None]:
# Quick sanity check: load the base model and run one forward pass
# Note: from_pretrained fetches the full checkpoint (or reads it from the local
# cache); device_map only controls where the loaded weights are placed.
import os

import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "openai/gpt-oss-20b",    # full Hub repo id (namespace/name)
    trust_remote_code=True,
    token=os.getenv("HF_TOKEN"),  # `token` replaces the deprecated `use_auth_token`
    device_map="auto",  # let accelerate decide placement (GPU, then CPU, then disk)
    offload_folder="/tmp/torch_offload",  # optional: offload to disk if GPU memory is tight
)

print("Model loaded on device:", next(model.parameters()).device)

# Run a single token through the model (just to confirm shapes).
# NOTE(review): 50256 is used as an arbitrary token id here; confirm it is the
# BOS id for this tokenizer before relying on that meaning.
sample_input = torch.tensor([[50256]], device=model.device)
with torch.no_grad():
    outputs = model(sample_input)
print("Output shape:", outputs.last_hidden_state.shape)



## Step 3: Loading GPT‑Oss‑20B from Hugging Face

In the previous step we peeked at the model’s blueprint. Now it’s time to bring the full 20‑billion‑parameter engine to life. Think of this as assembling a giant Lego set: the instructions (config) are tiny, but the bricks (weights) are massive. We’ll use Hugging Face’s `AutoModelForCausalLM` to pull the weights, but we’ll also give the system a few hints so it doesn’t crash the GPU.

### 3️⃣1️⃣ Why we use `device_map="auto"`

When you ask the library to load a model, it normally tries to put everything on the first GPU. For a 20‑B model that would require ~80 GB of VRAM—more than most single‑GPU setups provide. The `device_map="auto"` flag tells the `accelerate` backend to split the model across available GPUs or, if only one GPU is present, to stream layers in and out of memory. It’s like having a warehouse that can move boxes in and out as needed, rather than trying to store the entire shipment in a single shelf.

### 3️⃣2️⃣ Optional off‑loading to disk

If you’re on a machine with a single 24 GB GPU, you can still run the model by off‑loading the heaviest layers to disk. The `offload_folder` argument creates a temporary directory where those layers are stored when not in use. This trades a bit of CPU‑disk I/O for the ability to run the model without exceeding VRAM limits.

### 3️⃣3️⃣ Trusting remote code

GPT‑Oss ships a custom model class that isn’t part of the standard Transformers distribution. Setting `trust_remote_code=True` allows the library to download and execute that custom class. Think of it as trusting a friend’s custom recipe that isn’t on the official cookbook.

### 3️⃣4️⃣ Reproducibility and safety

We’ll set a deterministic seed for PyTorch before loading the model so that any stochastic operations (e.g., dropout during training or sampling during generation) behave consistently across runs. We’ll also check for common problems such as a missing `HF_TOKEN`; insufficient GPU memory is covered under the common pitfalls below.

### 3️⃣5️⃣ Key terms and trade‑offs

- **`device_map`** – a dictionary that tells Accelerate which GPU each layer should live on. `"auto"` automatically shards the model.
- **`offload_folder`** – a path on disk where layers are temporarily stored when not on GPU.
- **`trust_remote_code`** – allows the library to load custom model definitions from the Hugging Face Hub.
- **`torch_dtype`** – the data type used for weights (e.g., `torch.float16` for half precision). Using lower precision saves memory but can slightly degrade numerical stability.
- **Trade‑offs** – Sharding (`device_map`) reduces VRAM usage but increases inter‑GPU communication overhead. Off‑loading saves VRAM but incurs disk I/O latency. Mixed‑precision (`torch_dtype`) speeds up inference but may introduce rounding errors.

### 3️⃣6️⃣ Quick sanity check

Below we load the full model with the recommended settings and print the device each parameter resides on. This gives you a quick visual confirmation that the model is correctly sharded or off‑loaded.



In [None]:
# 1️⃣ Import required libraries
import gc
import os
from collections import Counter

import torch
from transformers import AutoModelForCausalLM, AutoConfig

MODEL_ID = "openai/gpt-oss-20b"  # full Hub repo id (namespace/name)

# 2️⃣ Set a deterministic seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 3️⃣ Verify HF_TOKEN is available (an empty string counts as missing)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise EnvironmentError(
        "HF_TOKEN not found. Set it with export HF_TOKEN=YOUR_TOKEN or os.environ['HF_TOKEN']=..."
    )

# 4️⃣ Load the configuration first (tiny file, ~10 KB)
config = AutoConfig.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,  # `token` replaces the deprecated `use_auth_token`
)
print("Model config loaded: n_layer={}, n_embd={}, n_head={}".format(
    config.num_hidden_layers, config.hidden_size, config.num_attention_heads
))

# Release any model left over from an earlier cell before loading, so two
# copies are not held in memory at the same time.
if "model" in globals():
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 5️⃣ Load the full model with sharding and optional off‑load
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
    device_map="auto",           # let accelerate place layers across GPU(s), CPU, and disk
    offload_folder="/tmp/torch_offload",  # optional: offload to disk if needed
    torch_dtype=torch.float16,   # half precision to save memory
)

print("\nModel loaded. Checking device placement:")
# Summarize placement per device instead of printing one line per parameter.
placement = Counter(str(param.device) for param in model.parameters())
for device_name, tensor_count in sorted(placement.items()):
    print(f"{device_name:10s} -> {tensor_count} parameter tensors")

# 6️⃣ Quick forward pass to confirm everything works
# NOTE(review): 50256 is used as an arbitrary token id; confirm it is the BOS
# id for this tokenizer before relying on that meaning.
sample_input = torch.tensor([[50256]], device=model.device)
with torch.no_grad():
    outputs = model(sample_input)
print("\nForward pass successful. Output shape:", outputs.logits.shape)



### 3️⃣7️⃣ Common pitfalls

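- **GPU memory exhausted** – If you still hit OOM errors, keep `device_map="auto"` and cap what the GPU may hold with `max_memory`, e.g. `max_memory={0: "20GiB", "cpu": "64GiB"}`; layers that do not fit are placed in CPU RAM and then in `offload_folder`. Make sure the disk that holds `offload_folder` has enough free space.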
- **Slow disk I/O** – Off‑loading can be slow on HDDs. Use an SSD or a RAM disk for best performance.
- **Mixed‑precision errors** – Some models may produce NaNs when using `float16`. Switch to `torch.float32` if you encounter this.

### 3️⃣8️⃣ Next step

In Step 4 we’ll dive deeper into GPU and memory management, showing how to fine‑tune the model with LoRA adapters while keeping VRAM usage under control.



## Step 4: GPU and Memory Management

When you load a 20‑billion‑parameter model, the GPU becomes the model’s *living room*. If you don’t manage the space, the room will overflow and the model will crash. Think of the GPU as a tiny apartment that can hold only a few heavy books (tensors). The trick is to keep the books organized, move some to a storage unit (disk), and sometimes use lighter versions of the books (mixed‑precision) so the apartment stays comfortable.

### 1️⃣ Why GPU memory matters

- **VRAM is limited** – a single 24 GB GPU can’t hold all 80 GB of weights at once.
- **Large tensors consume memory** – each forward pass allocates new tensors that sit on the GPU until they’re freed.
- **Memory fragmentation** – repeated allocations can leave gaps that the GPU can’t use efficiently.

### 2️⃣ Key tools for memory hygiene

| Tool | What it does | Analogy |
|------|--------------|---------|
| `torch.cuda.memory_summary()` | Prints a concise report of allocated, reserved, and free memory. | A quick snapshot of how full your apartment is. |
| `torch.cuda.set_per_process_memory_fraction()` | Caps the fraction of GPU memory a process can claim. | A budget that prevents overspending on furniture. |
| `torch.cuda.empty_cache()` | Frees unused memory back to the GPU pool. | Clearing out the trash bin after a party. |
| `torch.backends.cudnn.deterministic` | Forces deterministic convolution algorithms. | Choosing a single, predictable recipe. |
| `torch.backends.cudnn.benchmark` | Enables auto‑tuning for speed (may be nondeterministic). | Letting the kitchen auto‑adjust heat for fastest cooking. |
| `torch.backends.cuda.matmul.allow_tf32` | Enables TensorFloat‑32 for faster matrix ops on Ampere+ GPUs. | Using a lighter, faster version of the same ingredient. |
| `torch.autocast` | Mixed‑precision context manager (float16/float32). | Using lightweight paper instead of heavy cardboard for temporary storage. |
| `torch.compile` | Compiles a model for speed (PyTorch 2.0+). | Pre‑assembling furniture to save assembly time. |

### 3️⃣ Trade‑offs to keep in mind

- **Determinism vs. Speed** – Setting `cudnn.deterministic=True` guarantees reproducible results but can slow down training.
- **Mixed‑precision vs. Accuracy** – `float16` saves memory and speeds up inference, but may introduce small numerical errors.
- **Off‑loading vs. I/O latency** – Storing heavy layers on disk frees VRAM but can slow down training if the disk is slow.
- **Memory cap vs. OOM risk** – Limiting memory usage protects against crashes, but setting the cap too low may cause out‑of‑memory errors during large batches.
- **Compilation vs. Overhead** – `torch.compile` can accelerate models, but the compilation step may temporarily increase memory usage.

### 4️⃣ Practical checklist before you start training

1. **Verify GPU availability** – `torch.cuda.is_available()`.
2. **Check VRAM** – `torch.cuda.get_device_properties(0).total_memory`.
3. **Set a memory cap** – `torch.cuda.set_per_process_memory_fraction(0.9)`.
4. **Enable mixed‑precision** – wrap forward passes in `torch.autocast`; optionally add `torch.compile` for speed (it does not change precision).
5. **Monitor memory** – `torch.cuda.memory_summary()` before and after key operations.
6. **Clean up** – `torch.cuda.empty_cache()` after large tensors are no longer needed.

By following these steps, you’ll keep your GPU from bursting at the seams and ensure that fine‑tuning GPT‑Oss‑20B runs smoothly.



In [None]:
# 1️⃣ Import torch and set deterministic behavior for reproducibility
import torch

# Deterministic convolutions (slower but reproducible)
torch.backends.cudnn.deterministic = True
# Disable auto‑tuning to keep results stable across runs
torch.backends.cudnn.benchmark = False


# 2️⃣ Dummy model to illustrate memory usage
class DummyModel(torch.nn.Module):
    """A single 1024x1024 linear layer used only to show memory usage.

    The module is created on the default device; the caller moves it to the
    GPU with ``.to(device)``.
    """

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(1024, 1024)

    def forward(self, x):
        """Apply the linear layer to ``x`` (last dimension must be 1024)."""
        return self.linear(x)


if not torch.cuda.is_available():
    # Every call below needs a CUDA device; skip instead of raising.
    print("CUDA not available; skipping the GPU memory demo.")
else:
    device = torch.device("cuda:0")

    # 3️⃣ Enable TensorFloat‑32 for faster matmul on Ampere+ GPUs (optional)
    torch.backends.cuda.matmul.allow_tf32 = True

    # 4️⃣ Limit per‑process GPU memory to 90% of the device (adjust if you hit OOM)
    torch.cuda.set_per_process_memory_fraction(0.9, device=0)

    # 5️⃣ Show current memory status
    print("Initial memory summary:")
    print(torch.cuda.memory_summary(device=0, abbreviated=True))

    # Use a dedicated name so the notebook-level `model` (the loaded LLM) is
    # not overwritten by this demo.
    dummy_model = DummyModel().to(device)

    # 6️⃣ Mixed‑precision inference with autocast
    dummy_input = torch.randn(8, 1024, device=device)
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        dummy_output = dummy_model(dummy_input)

    print("\nAfter forward pass:")
    print(torch.cuda.memory_summary(device=0, abbreviated=True))

    # 7️⃣ Optional: compile the model for speed (PyTorch 2.0+)
    compiled = None
    if hasattr(torch, "compile"):
        compiled = torch.compile(dummy_model)
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            dummy_output = compiled(dummy_input)
        print("\nCompiled model inference done.")

    # 8️⃣ Clean up to free memory for the next cell. Drop the references
    # first: empty_cache() can only release blocks no live tensor is using.
    del dummy_model, dummy_input, dummy_output, compiled
    torch.cuda.empty_cache()



## Step 5: Fine‑Tuning with LoRA Adapters

Fine‑tuning a 20‑billion‑parameter model on a single GPU is like trying to teach a giant library to write a new style of story. The library is too big to move around, so we add a *lightweight* set of bookmarks (the LoRA adapters) that only tweak a tiny fraction of the books. These bookmarks are small matrices that sit on top of the original weights and learn the new style while the rest of the library stays untouched.

### Why LoRA?  A quick analogy

Imagine you have a massive LEGO set (the base model). Building a new structure from scratch would mean buying a whole new set. LoRA is like giving you a handful of extra LEGO bricks that you can snap onto the existing set to change its shape. You don’t need to rebuild the entire set; you just add a few pieces.

### Key terms and trade‑offs

| Term | What it means | Trade‑off | Rationale |
|------|----------------|-----------|-----------|
| **LoRA (Low‑Rank Adaptation)** | Adds two small trainable matrices (A and B) to each attention and MLP layer. | **Fewer trainable params** (≈0.1 % of the base model) | Saves GPU memory and speeds up training while preserving most of the original knowledge. |
| **Adapter rank (r)** | Size of the hidden dimension in the LoRA matrices. | **Higher r → more capacity** but **more memory** | Choose r based on dataset size and GPU limits. |
| **Alpha (α)** | Scaling factor applied to the LoRA update. | **Higher α → stronger updates** but can destabilize training | Helps balance learning speed and stability. |
| **Gradient checkpointing** | Recomputes intermediate activations during back‑prop to save memory. | **More compute** but **less VRAM** | Essential when training large models with limited GPU memory. |
| **Mixed‑precision (float16/float32)** | Uses lower‑precision arithmetic for speed and memory. | **Potential numerical instability** | Combined with LoRA, it keeps memory low while maintaining accuracy. |

### What we’ll do in this section

1. **Load the base GPT‑Oss‑20B model** and tokenizer.
2. **Wrap the model with LoRA adapters** using the `peft` library.
3. **Prepare a tiny synthetic dataset** (you can replace it with your own). 
4. **Set up a minimal training loop** that trains only the LoRA weights.
5. **Show how to save and reload the fine‑tuned adapters**.

The code below is kept in a single, heavily commented cell for clarity. We also set a deterministic seed and check the PyTorch major version; for full reproducibility, pin your library versions when installing them.



In [None]:
# 1️⃣  Imports and reproducibility
import os, random
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import Dataset

# Sanity-check the PyTorch major version (this is a check, not a version pin)
assert torch.__version__.startswith("2.")

# Set a fixed random seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣  Load tokenizer and base model on ONE device: the first GPU if present, else CPU
HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN is not None, "HF_TOKEN not set"

# NOTE(review): a bare id without an organisation prefix only resolves if it is a
# local directory or a legacy Hub name – confirm this is the intended checkpoint.
MODEL_ID = "gpt-oss-20b"
TRAIN_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,  # `use_auth_token` is deprecated in favour of `token`
)

# padding="max_length" below needs a pad token; fall back to EOS if none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
    device_map=TRAIN_DEVICE,     # whole model on a single device (no sharding)
    torch_dtype=torch.float16,   # half‑precision weights
)

# 3️⃣  Define LoRA configuration
# NOTE(review): module names are architecture-specific; verify that "fc1"/"fc2"
# exist in this model (e.g. via model.named_modules()), otherwise only the
# attention projections receive adapters.
lora_cfg = LoraConfig(
    r=8,                # rank of the low‑rank matrices
    lora_alpha=32,      # scaling factor
    target_modules=["q_proj", "v_proj", "fc1", "fc2"],  # attention & MLP layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# 4️⃣  Wrap the model with LoRA adapters
model = get_peft_model(base_model, lora_cfg)
trainable_params = [p for p in model.parameters() if p.requires_grad]
print("LoRA adapters added. Trainable params:", sum(p.numel() for p in trainable_params))

# 5️⃣  Create a tiny synthetic dataset (replace with real data)
sample_texts = [
    "Hello world! This is a test sentence.",
    "Fine‑tuning GPT‑Oss with LoRA is efficient.",
    "We can add more data later.",
]

dataset = Dataset.from_dict({"text": sample_texts})

# Tokenize the dataset
def tokenize(batch):
    """Tokenize a batch of texts to fixed-length (128) padded/truncated sequences."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]).with_format("torch")

# 6️⃣  Prepare DataLoader
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# 7️⃣  Optimizer only for LoRA params (the frozen base weights are excluded)
optimizer = torch.optim.AdamW(trainable_params, lr=1e-4)

# 8️⃣  Minimal training loop (2 epochs)
model.train()
for epoch in range(2):
    for batch in loader:
        # Move tensors to the device the model was loaded on (not hard-coded CUDA)
        inputs = {k: v.to(TRAIN_DEVICE) for k, v in batch.items()}
        # Causal-LM labels are the inputs, but padded positions must be ignored
        # (-100) or the loss is dominated by predicting pad tokens.
        labels = inputs["input_ids"].clone()
        labels[inputs["attention_mask"] == 0] = -100
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Reports the loss of the last batch in the epoch
    print(f"Epoch {epoch+1} finished. Loss: {loss.item():.4f}")

# 9️⃣  Save only the LoRA adapters
model.save_pretrained("./gpt-oss-20b-lora")
print("LoRA adapters saved to ./gpt-oss-20b-lora")



### Loading the fine‑tuned adapters later

```python
from peft import PeftModel

# Load the base model again (the full base weights are loaded into memory as usual)
base = AutoModelForCausalLM.from_pretrained(
    "gpt-oss-20b",
    trust_remote_code=True,
    use_auth_token=HF_TOKEN,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the saved LoRA weights
model = PeftModel.from_pretrained(base, "./gpt-oss-20b-lora")

# Now you can generate text or continue training
```

This lightweight approach means you only have to store and share the tiny LoRA adapter files: the massive 20‑B backbone is loaded from its original checkpoint as usual, and the small LoRA weights are attached on top of it when needed.



## Step 6: Preparing Dataset and Tokenizer

Before we can teach GPT‑Oss‑20B a new style, we need to give it a *menu* of sentences to read. Think of the dataset as a cookbook: each recipe (text example) is a small paragraph that the model will learn from. The tokenizer is the chef’s knife that chops each recipe into bite‑sized *tokens* (words, sub‑words, or punctuation) so the model can understand them.

### Why this step matters

1. **Tokenization is the language of the model** – GPT‑Oss expects integer IDs, not raw text. If the tokenizer is wrong, the model will read gibberish.
2. **Padding and truncation keep the batch shape consistent** – GPUs love tensors that are all the same size. Padding adds a special *pad* token where needed, while truncation cuts off overly long sentences.
3. **Dataset split gives us a validation set** – we can monitor over‑fitting and decide when to stop training.
4. **Collation bundles the data into a format the model understands** – the `DataCollatorForLanguageModeling` automatically creates the `labels` field that the loss function uses.

### Key terms and trade‑offs

| Term | What it means | Trade‑off | Rationale |
|------|----------------|-----------|-----------|
| **Dataset** | A collection of text examples (e.g., Wikipedia articles). | Larger datasets improve generalization but increase storage and preprocessing time. | Use a realistic corpus that matches your target domain. |
| **Tokenizer** | Converts text to integer IDs. | Fast tokenizers (e.g., `GPT2TokenizerFast`) use a compiled Rust backend and are much quicker than pure‑Python ones, but they only help if their vocabulary is exactly the one the model was trained with. | Choose a tokenizer that matches the model’s training tokenizer for best compatibility. |
| **Padding** | Adds a special token to make all sequences the same length. | Padding increases memory usage but simplifies batching. | Use `padding='max_length'` for deterministic batch shapes. |
| **Truncation** | Cuts sequences longer than `max_length`. | Truncation can lose context but keeps memory usage bounded. | Set `max_length` to a value that balances context length and GPU memory. |
| **Batch size** | Number of examples processed together. | Larger batches improve GPU utilization but require more VRAM. | Start with a small batch and scale up if memory allows. |
| **DataCollator** | Helper that prepares the final input tensors for the model. | Custom collators can add special handling (e.g., causal masking). | Use `DataCollatorForLanguageModeling` for standard causal LM training. |

### Practical checklist

1. **Set a deterministic seed** – ensures reproducible shuffling and tokenization.
2. **Verify the tokenizer matches the model** – use `AutoTokenizer.from_pretrained("gpt-oss-20b")`.
3. **Choose a dataset** – for the demo we’ll use the `train` split of the `wikitext-2-raw-v1` configuration of `wikitext`; replace it with your own data later.
4. **Tokenize with `batched=True`** – speeds up preprocessing.
5. **Split into train/validation** – `train_test_split(test_size=0.1)`.
6. **Create a DataCollator** – handles padding and label creation.
7. **Wrap in a DataLoader** – set `shuffle=True` for training.

With this foundation, the next step (Step 7) will show how to feed these batches into the LoRA‑augmented GPT‑Oss‑20B and monitor training progress.



In [None]:
# 1️⃣  Imports and reproducibility
import os
import random
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Sanity-check the PyTorch major version (this is a check, not a version pin)
assert torch.__version__.startswith("2.")

# Set a fixed random seed so the train/validation split and shuffling are repeatable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣  Load the tokenizer that matches GPT‑Oss‑20B
HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN is not None, "HF_TOKEN not set"

# NOTE(review): a bare id without an organisation prefix only resolves if it is a
# local directory or a legacy Hub name – confirm this is the intended checkpoint.
MODEL_ID = "gpt-oss-20b"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,  # `use_auth_token` is deprecated in favour of `token`
)

# Ensure the tokenizer has a pad token (GPT‑2 style models use eos as pad)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 3️⃣  Load a small public dataset for demonstration
# Replace this with your own data for real fine‑tuning
raw_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Drop blank lines: they would become all-padding examples whose labels are
# entirely masked, contributing nothing (or NaN) to the loss.
text_ds = raw_ds.filter(lambda example: example["text"].strip() != "")

# 4️⃣  Tokenize the dataset in batches
max_length = 128  # keep sequences short to fit GPU memory

def tokenize_function(examples):
    """Tokenize a batch of texts, padding/truncating each to `max_length` tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

tokenized_ds = text_ds.map(tokenize_function, batched=True, remove_columns=["text"])

# 5️⃣  Split into train/validation
# Index the splits by name rather than relying on the ordering of .values()
split_ds = tokenized_ds.train_test_split(test_size=0.1, seed=SEED)
train_ds, val_ds = split_ds["train"], split_ds["test"]

# 6️⃣  Create a DataCollator that adds the labels field
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 7️⃣  Prepare DataLoaders for training and validation
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_ds,
    batch_size=4,  # adjust based on GPU memory
    shuffle=True,
    collate_fn=collator,
)

val_loader = DataLoader(
    val_ds,
    batch_size=4,
    shuffle=False,
    collate_fn=collator,
)

print("Dataset ready: ", len(train_ds), "train examples,", len(val_ds), "validation examples")
print("Batch shape example:")
# Peek at a single batch to confirm the collator produced matching shapes
batch = next(iter(train_loader))
print(batch["input_ids"].shape, batch["labels"].shape)



In [None]:
# 8️⃣  Quick sanity check: run a forward pass on a batch
# Opt-in: needs a LoRA‑wrapped `model` (Step 5) and `train_loader` (previous cell).
# With the flag left at False this cell only prints the closing message.
RUN_FORWARD_CHECK = False  # set to True once you have a model loaded

if RUN_FORWARD_CHECK:
    if "model" not in globals():
        print("No `model` found – run Step 5 first to enable the forward-pass check.")
    else:
        model.eval()
        with torch.no_grad():
            batch = next(iter(train_loader))
            # Send tensors to wherever the model lives instead of assuming CUDA
            model_device = next(model.parameters()).device
            inputs = {k: v.to(model_device) for k, v in batch.items() if k != "idx"}
            # The collated batch already carries `labels`, so the output has a loss
            outputs = model(**inputs)
            print("Loss:", outputs.loss.item())

print("Data pipeline is functional. Ready for training in Step 7.")



## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    """Display a multiple-choice question with a Grade button and a feedback area.

    Args:
        question: Question text; rendered as a level-3 Markdown heading.
        options: Sequence of answer strings; each is labelled "A. ", "B. ", ...
        correct_index: Zero-based index into `options` of the correct answer.
        explanation: Text appended to the feedback after grading. It is placed
            into the feedback HTML as-is.

    Returns:
        None. The widgets are displayed as a side effect.
    """
    # Use (label, value) so rb.value is the numeric index.
    # value=None starts with nothing selected; otherwise the first option is
    # pre-selected, the "please select" branch can never fire, and clicking
    # Grade on an untouched question silently grades answer A.
    rb = widgets.RadioButtons(
        options=[(f'{chr(65+i)}. '+opt, i) for i, opt in enumerate(options)],
        value=None,
        description='',
    )
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')

    def on_grade(_):
        """Button callback: compare the selected index with the correct one."""
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'

    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
# Knowledge check 1: which package supplies LoRA (answer index 1 → "peft")
render_mcq(
    question="Which library provides the LoRA implementation used in this notebook?",
    options=["transformers", "peft", "datasets", "accelerate"],
    correct_index=1,
    explanation="The peft library supplies lightweight adapters such as LoRA for efficient fine‑tuning of large language models.",
)


In [None]:
# Knowledge check 2: main benefit of LoRA (answer index 1 → "Lower memory usage")
render_mcq(
    question="What is the primary benefit of using LoRA adapters?",
    options=["Faster inference", "Lower memory usage", "Higher accuracy", "None of the above"],
    correct_index=1,
    explanation="LoRA reduces the number of trainable parameters, thereby lowering memory consumption while maintaining performance.",
)


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
