In [ ]:
# Environment Detection
import sys

# True when the `google.colab` module is already loaded in this kernel.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print(f'Environment: {"Colab"}')
else:
    print(f'Environment: {"Local"}')


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Detect environment: treated as Colab when `google.colab` is already in sys.modules.
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception as widget_exc:
        # Best effort: keep going without the custom widget manager, but report
        # the failure instead of hiding it so widget problems are diagnosable.
        print('⚠️ Could not enable Colab custom widget manager:', widget_exc)


## API Keys and .env Files

Many providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.

- Why .env? Keeps secrets out of source control and tutorials.
- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` takes precedence over `.env`.
- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).
- Find your keys:
  - Poe-compatible providers: see your provider's dashboard for an API key.
  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).
  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).

The next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env
# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).
# Location: place your .env files in the notebook's working directory (only the current directory is searched).
# Disable writing: set SAVE_TO_ENV = False below.
import os, pathlib
from getpass import getpass

# Install python-dotenv if missing
try:
    import dotenv  # type: ignore
except Exception:
    import sys, subprocess
    if 'IN_COLAB' in globals() and IN_COLAB:
        try:
            import IPython
            ip = IPython.get_ipython()
            if ip is not None:
                # Quote the requirement: the pip magic line is handed to a shell,
                # where an unquoted `>` would be treated as an output redirect.
                ip.run_line_magic('pip', 'install -q "python-dotenv>=1.0.0"')
            else:
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    else:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
    import dotenv  # type: ignore

# Prefer .env.local over .env (only the first one found is loaded)
cwd = pathlib.Path.cwd()
env_local = cwd / '.env.local'
env_file = cwd / '.env'
chosen = env_local if env_local.exists() else (env_file if env_file.exists() else None)
if chosen:
    dotenv.load_dotenv(dotenv_path=str(chosen))
    print(f'Loaded env from {chosen.name}')
else:
    print('No .env.local or .env found; will prompt for keys.')

# Keys we might use in this notebook
keys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']
missing = [k for k in keys if not os.environ.get(k)]
for k in missing:
    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')
    if val:
        os.environ[k] = val

# Decide whether to persist to .env.local for convenience
SAVE_TO_ENV = True  # set False to disable writing
if SAVE_TO_ENV:
    target = env_local
    existing = {}
    if target.exists():
        try:
            # Parse with python-dotenv so quoted/escaped values are decoded on read.
            # A plain split('=') would keep the surrounding quotes, and every save
            # would then wrap unrelated keys in one more layer of quotes/escapes.
            parsed = dotenv.dotenv_values(dotenv_path=str(target), interpolate=False)
            existing = {k: v for k, v in parsed.items() if v is not None}
        except Exception as read_exc:
            print(f'⚠️ Could not parse existing {target.name}; it will be rewritten:', read_exc)
    for k in keys:
        v = os.environ.get(k)
        if v:
            existing[k] = v
    if existing:
        lines = []
        for k, v in existing.items():
            # Always quote; escape backslashes and double quotes for safety
            escaped = v.replace("\\", "\\\\")
            escaped = escaped.replace('"', '\\"')
            lines.append(f'{k}="{escaped}"')
        target.write_text('\n'.join(lines) + '\n')
        try:
            target.chmod(0o600)  # owner read/write only
            print(f'🔏 Wrote secrets to {target.name} (permissions 600)')
        except Exception as chmod_exc:
            # Do not claim restricted permissions when chmod did not succeed.
            print(f'⚠️ Wrote secrets to {target.name}, but could not set permissions 600:', chmod_exc)
    else:
        print(f'No secrets to save; {target.name} not written.')

# Simple recap (masked)
def mask(v):
    """Return a redacted preview of a secret.

    '∅' when the value is empty/None, '•••' when it has 6 or fewer characters,
    otherwise the first 3 and last 2 characters joined by '…'.
    """
    if not v:
        return '∅'
    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'

for k in keys:
    print(f'{k}:', mask(os.environ.get(k)))

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
import os


def _load_openai_client_class(spec='openai>=1.34.0'):
    """Return the `OpenAI` client class, installing `spec` with pip if the import fails.

    In Colab (global IN_COLAB is truthy) with an active IPython shell the pip
    magic is used; otherwise pip is run with the current interpreter.

    Raises:
        Exception: whatever the install or the second import raises.
    """
    try:
        from openai import OpenAI  # type: ignore
        return OpenAI
    except Exception:
        import sys, subprocess
        ip = None
        # globals().get avoids a NameError when the detection cell was not run.
        if globals().get('IN_COLAB'):
            import IPython
            ip = IPython.get_ipython()
        try:
            if ip is not None:
                # Quoted so a shell does not interpret `>` as a redirect.
                ip.run_line_magic('pip', f'install -q "{spec}"')
            else:
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', spec])
        except Exception as install_exc:
            if ip is not None:
                print('⚠️ Colab pip fallback failed:', install_exc)
            raise
        from openai import OpenAI  # type: ignore
        return OpenAI


try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        entered_key = getpass('Enter POE_API_KEY (input hidden): ')
        if not entered_key:
            # Do not store an empty key; fail here with a clear message instead.
            raise RuntimeError('no API key provided')
        os.environ['OPENAI_API_KEY'] = entered_key
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    OpenAI = _load_openai_client_class()
    # Create client. OPENAI_BASE_URL is optional: passing None lets the client
    # use its own default endpoint instead of raising KeyError here.
    client = OpenAI(
        base_url=os.environ.get('OPENAI_BASE_URL') or None,
        api_key=os.environ['OPENAI_API_KEY'],
    )
    print('✅ Provider ready:', client.base_url)
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
import os

# Model id: ALAIN_MODEL when set and non-empty, otherwise the default below.
model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'

if 'client' in globals():
    try:
        # Single-token completion: just enough to prove the endpoint and key work.
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as smoke_exc:
        print('⚠️ Smoke test failed:', smoke_exc)
else:
    print('⚠️ Provider client not available; skipping smoke test')


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Deploying and Fine‑Tuning GPT‑OSS‑20B for Research Applications

This lesson guides advanced practitioners through the end‑to‑end workflow of deploying the GPT‑OSS‑20B model, from environment setup to efficient fine‑tuning on custom corpora. It covers architectural trade‑offs, memory‑efficient inference, and best practices for reproducibility in research settings.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T03:23:43.221Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Explain the architectural differences between GPT‑OSS‑20B and other large‑language‑model variants.
2. Configure a GPU‑optimized environment that supports 20B‑parameter inference and training.
3. Implement memory‑efficient fine‑tuning using gradient checkpointing and 8‑bit optimizers.
4. Evaluate model performance on domain‑specific benchmarks and document reproducibility metadata.


## Prerequisites

- Python 3.10+ with pip
- PyTorch 2.0+ (CUDA 11.8 or higher)
- Basic familiarity with Hugging Face Transformers and Jupyter notebooks
- Access to a GPU with ≥24 GB VRAM


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
import subprocess
import sys

# Check if we're in Colab
IN_COLAB = 'google.colab' in sys.modules

# Pinned requirements for this lesson.
REQUIRED_PACKAGES = [
    "ipywidgets>=8.0.0",
    "torch==2.0.0+cu118",
    "transformers==4.40.0",
    "accelerate==0.24.0",
    "bitsandbytes==0.41.0",
    "datasets==2.20.0",
]
# The "+cu118" torch build is served from the PyTorch wheel index rather than PyPI,
# so that index has to be added for the pin above to resolve.
PYTORCH_CU118_INDEX = "https://download.pytorch.org/whl/cu118"

# Run pip through the kernel's interpreter with an argument list (no shell):
# with `!pip install pkg>=1.0` the shell treats `>` as a redirect, silently
# dropping the version bound and writing pip's output to a file named "=1.0".
cmd = [sys.executable, "-m", "pip", "install"]
if IN_COLAB:
    cmd.append("-q")  # keep Colab output quiet, as before
cmd += ["--extra-index-url", PYTORCH_CU118_INDEX] + REQUIRED_PACKAGES
subprocess.check_call(cmd)  # raises CalledProcessError if pip fails

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import sys, subprocess
    cmd = [sys.executable, "-m", "pip", "install", '-q', 'ipywidgets>=8.0.0']
    try:
        subprocess.check_call(cmd)
    except Exception:
        # Only Colab has a fallback (the pip magic). globals().get avoids a
        # NameError masking the real pip failure when IN_COLAB was never defined.
        ip = None
        if globals().get('IN_COLAB'):
            import IPython
            ip = IPython.get_ipython()
        if ip is None:
            raise
        try:
            # Quoted so a shell does not interpret `>` as a redirect.
            ip.run_line_magic('pip', 'install -q "ipywidgets>=8.0.0"')
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    # Confirm the install actually made the package importable.
    import ipywidgets  # type: ignore
    print('ipywidgets available')


## Step 1: Environment Validation and GPU Profiling

Before we can play with a 20‑billion‑parameter model, we need to make sure our playground (the GPU) is ready to host it. Think of the GPU as a giant kitchen: the model is a huge recipe that requires a lot of space (memory) and a steady supply of ingredients (compute). If the kitchen is too small or the stove is too weak, the recipe will burn.

In this step we will:

1. **Verify the software stack** – confirm that the correct versions of PyTorch, CUDA, and Hugging Face libraries are installed.
2. **Check GPU availability** – ensure that the GPU we intend to use is visible to PyTorch and that its memory capacity meets the 20B requirement.
3. **Profile memory usage** – run a quick inference pass with a small prompt to see how much VRAM is actually consumed.
4. **Set reproducibility seeds** – lock down random number generators so that experiments can be repeated exactly.

### Key Terms Explained

- **CUDA**: NVIDIA’s parallel computing platform. It turns the GPU into a fast math engine.
- **VRAM**: Video RAM, the memory that lives on the GPU. Large models need a lot of VRAM to store parameters and intermediate activations.
- **Gradient Checkpointing**: A technique that trades extra computation for lower memory usage by recomputing certain activations during back‑propagation.
- **8‑bit Quantization**: Reducing the precision of model weights from 32‑bit floats to 8‑bit integers, cutting memory by ~4× while keeping accuracy largely intact.
- **Reproducibility Seed**: A fixed integer that initializes all random number generators (Python, NumPy, PyTorch) so that the same sequence of random numbers is produced every run.

### Why These Checks Matter

- **Avoid Runtime Crashes**: If the GPU is not detected or the VRAM is insufficient, the notebook will crash mid‑run, wasting time.
- **Performance Tuning**: Knowing the exact memory footprint lets us decide whether to enable gradient checkpointing or 8‑bit quantization.
- **Scientific Rigor**: Reproducibility seeds ensure that results can be verified by others, a cornerstone of research.

### Trade‑offs

- **Speed vs Memory**: Enabling gradient checkpointing or 8‑bit quantization reduces memory but increases compute time. For quick prototyping, you might skip checkpointing; for full‑scale fine‑tuning, you’ll need it.
- **Precision vs Efficiency**: 8‑bit quantization saves memory but can introduce a tiny drop in accuracy. For most research tasks, the trade‑off is negligible.

Now let’s put these ideas into practice with a couple of short code snippets.



In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1️⃣  Verify software versions and GPU visibility
# ──────────────────────────────────────────────────────────────────────
import torch, transformers, accelerate, bitsandbytes, datasets

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"Bitsandbytes version: {bitsandbytes.__version__}")
print(f"Datasets version: {datasets.__version__}")

# Check GPU count and properties
gpu_count = torch.cuda.device_count()
print(f"Number of GPUs detected: {gpu_count}")
for i in range(gpu_count):
    prop = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {prop.name}, VRAM: {prop.total_memory / (1024**3):.2f} GB")

# ──────────────────────────────────────────────────────────────────────
# 2️⃣  Quick VRAM profiling with a tiny prompt
# ──────────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use a small 8‑bit quantized model for the test
# NOTE(review): the repo id advertises a 4-bit GPTQ checkpoint while the load below
# requests bitsandbytes 8-bit; confirm the checkpoint exists and matches the intended
# quantization. Later cells spell the id in lowercase — confirm which one is correct.
model_name = "TheBloke/GPT-OSS-20B-GPTQ-4bit-128g"
# Note: replace with your actual 8‑bit checkpoint if different

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load model with 8‑bit quantization (requires bitsandbytes)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    load_in_8bit=True,
    torch_dtype=torch.float16,
)

# Simple prompt
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Run inference and measure peak memory.
# The CUDA memory counters are only touched when a GPU is present, so the
# CPU fallback chosen above does not crash on the profiling calls.
cuda_ok = torch.cuda.is_available()
if cuda_ok:
    torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10)
if cuda_ok:
    peak_mem = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Peak VRAM used for inference: {peak_mem:.2f} GB")
else:
    print("No CUDA device available; skipping peak VRAM measurement")

# ──────────────────────────────────────────────────────────────────────
# 3️⃣  Set reproducibility seeds
# ──────────────────────────────────────────────────────────────────────
import random, numpy as np
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
print(f"Reproducibility seed set to {seed}")



## Step 2: Loading GPT‑OSS‑20B with 8‑bit Quantization

Imagine you have a gigantic library (the 20‑billion‑parameter model) that you want to read on a small laptop (your GPU). The library is too big to fit in the laptop’s memory, so you decide to shrink each book by a factor of four: every word is still there, but each one is written in a much coarser shorthand. That’s what **8‑bit quantization** does – it compresses the model weights from 32‑bit floating‑point numbers to 8‑bit integers, cutting the memory footprint by roughly 75 % while keeping the story (the model’s predictions) almost unchanged.

In this step we’ll:

1. **Pull the tokenizer** – the piece of software that turns text into numbers the model can understand.
2. **Load the model with 8‑bit weights** – using the `bitsandbytes` library, which hooks into PyTorch to perform the compression on the fly.
3. **Map the model to the GPU** – automatically splitting the layers across available devices so we don’t run out of VRAM.
4. **Verify the memory savings** – by running a tiny inference pass and printing the peak VRAM usage.

### Key Terms Explained

- **8‑bit Quantization**: Converting 32‑bit floating‑point weights to 8‑bit integers. This reduces memory usage by ~4× but introduces a small quantization error.
- **bitsandbytes**: A PyTorch extension that implements efficient 8‑bit (and 4‑bit) weight loading and inference, plus fast matrix multiplication kernels.
- **device_map**: A dictionary that tells Hugging Face Transformers which GPU each layer should live on. Setting it to "auto" lets the library decide the best placement.
- **torch_dtype**: The data type used for activations during inference. Using `torch.float16` keeps compute fast while still being accurate enough for most tasks.
- **Peak VRAM**: The maximum amount of GPU memory allocated at any point during a run. Monitoring this helps ensure we stay within hardware limits.

### Why 8‑bit Quantization Matters

- **Memory Efficiency**: A 20‑B model normally needs 80 GB of VRAM in 32‑bit precision. With 8‑bit quantization, we can fit it on a single 24‑GB GPU.
- **Speed**: 8‑bit kernels in bitsandbytes are often faster than 32‑bit ones because they use less memory bandwidth.
- **Accuracy Trade‑off**: The quantization error is usually <1 % in perplexity for large language models, making it acceptable for research and many production scenarios.

### Trade‑offs to Keep in Mind

| Aspect | 32‑bit | 8‑bit | What to Expect |
|--------|--------|-------|----------------|
| VRAM | ~80 GB | ~20 GB | Huge savings |
| Compute | Baseline | Slightly faster (less memory traffic) | Good for inference |
| Accuracy | Gold standard | Minor drop (often <1 %) | Acceptable for most tasks |
| Implementation | Simple | Requires bitsandbytes | Add a dependency |

Now let’s see how to do this in code.



In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1️⃣  Load tokenizer and 8‑bit GPT‑OSS‑20B
# ──────────────────────────────────────────────────────────────────────
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Replace with the exact checkpoint you want to use
MODEL_NAME = "TheBloke/gpt-oss-20b-gptq-4bit-128g"

# Release a model left over from an earlier cell before loading again; otherwise
# both copies are resident during from_pretrained and the second load can run
# out of memory.
if isinstance(globals().get("model"), torch.nn.Module):
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Load tokenizer (fast tokenizer is usually faster)
print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load model with 8‑bit weights via bitsandbytes
print("Loading model with 8‑bit quantization…")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # Pins the whole model to GPU 0 (CPU when CUDA is absent); this is NOT automatic sharding.
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    load_in_8bit=True,          # enable 8‑bit quantization
    torch_dtype=torch.float16   # use FP16 activations for speed
)

model_device = next(model.parameters()).device
print("Model loaded on device:", model_device)

# ──────────────────────────────────────────────────────────────────────
# 2️⃣  Quick VRAM profiling with a tiny prompt
# ──────────────────────────────────────────────────────────────────────
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(model_device) for k, v in inputs.items()}

# Only touch the CUDA memory counters when a GPU is actually present.
cuda_ok = torch.cuda.is_available()
if cuda_ok:
    torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    _ = model.generate(**inputs, max_new_tokens=10)
if cuda_ok:
    peak_mem = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Peak VRAM used for inference: {peak_mem:.2f} GB")
else:
    print("No CUDA device available; skipping peak VRAM measurement")



## Step 3: Memory‑Efficient Inference with Gradient Checkpointing

### Why do we need it?
When you run a 20‑B model on a single 24‑GB GPU, the forward pass alone can eat up most of the memory. Think of the model as a giant Lego set: each block (layer) needs to sit on the table (GPU memory) while you build the next block. If the table is too small, you have to keep moving blocks around, which slows you down. Gradient checkpointing is like a clever Lego‑stacking trick: you only keep a few blocks on the table at a time and rebuild the rest on the fly when you need them. The trade‑off is a bit more time, but you can fit the whole set on a smaller table.

### How it works in PyTorch
PyTorch’s `torch.utils.checkpoint` module lets you wrap a function (e.g., a transformer block) so that its intermediate activations are *not* stored during the forward pass. When the backward pass (or a manual recomputation) is triggered, the function is re‑executed to regenerate those activations. For inference, we can force a recomputation after each block to keep memory low, at the cost of extra compute.

### Key Terms Explained
- **Checkpointing**: The act of discarding intermediate activations during the forward pass and recomputing them later.
- **Recomputation**: Running the same forward function again to regenerate activations that were not stored.
- **Peak VRAM**: The maximum amount of GPU memory allocated at any point during a run.
- **`torch.utils.checkpoint.checkpoint`**: A helper that automatically handles the discard‑and‑recompute logic.
- **`accelerate` gradient‑checkpointing**: A higher‑level wrapper that applies checkpointing to all layers of a Hugging Face model.

### Rationale & Trade‑offs
| Aspect | 32‑bit / No Checkpointing | With Checkpointing |
|--------|---------------------------|--------------------|
| Memory | Uses full activations (≈20 GB for 8‑bit GPT‑OSS‑20B) | Stores only a few activations (≈5–10 GB) |
| Compute | Baseline | Extra forward passes (≈2× slower) |
| Accuracy | Unchanged | Unchanged (exact recomputation) |
| Complexity | Simple | Requires wrapping layers or using `accelerate` |

In research settings where GPU memory is the bottleneck, checkpointing is a lifesaver. For quick prototyping, you might skip it to save time.

### Quick Code Demo
Below we load the same 8‑bit GPT‑OSS‑20B model from Step 2 and wrap its transformer blocks with `torch.utils.checkpoint`. We also set a reproducibility seed and measure peak VRAM.



In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1️⃣  Reproducibility & imports
# ──────────────────────────────────────────────────────────────────────
import gc
import random, numpy as np, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.checkpoint import checkpoint

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
print(f"Reproducibility seed set to {SEED}")

# ──────────────────────────────────────────────────────────────────────
# 2️⃣  Load tokenizer & 8‑bit model (same as Step 2)
# ──────────────────────────────────────────────────────────────────────
MODEL_NAME = "TheBloke/gpt-oss-20b-gptq-4bit-128g"

# Release a model left over from an earlier cell so two copies are never
# resident at the same time while the new one loads.
if isinstance(globals().get("model"), torch.nn.Module):
    del model
    gc.collect()
    torch.cuda.empty_cache()

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("Loading model with 8‑bit quantization…")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    load_in_8bit=True,
    torch_dtype=torch.float16,
)

# ──────────────────────────────────────────────────────────────────────
# 3️⃣  Wrap transformer blocks with checkpointing
# ──────────────────────────────────────────────────────────────────────
def _checkpointed(forward_fn):
    """Return a wrapper that runs `forward_fn` through torch.utils.checkpoint.

    Taking `forward_fn` as a parameter binds each wrapper to its own layer.
    Defining the closure directly in the loop would make every wrapper read
    the loop variable at call time, i.e. all layers would run the LAST
    layer's forward.
    """
    def chkpt_forward(*args, **kwargs):
        # use_reentrant=False: the reentrant implementation rejects keyword
        # arguments, and the wrapped forward is called with **kwargs.
        return checkpoint(forward_fn, *args, use_reentrant=False, **kwargs)
    return chkpt_forward

# The original lookup path was model.model.decoder.layers; that path is still used when present.
# NOTE(review): the `.layers` fallback assumes the blocks live directly on model.model for
# architectures without a `.decoder` — confirm against the loaded checkpoint's model class.
backbone = model.model
layers = backbone.decoder.layers if hasattr(backbone, "decoder") else backbone.layers
for layer in layers:
    # Replace the forward method with a checkpointed version
    layer.forward = _checkpointed(layer.forward)
print("Applied checkpointing to all decoder layers.")

# ──────────────────────────────────────────────────────────────────────
# 4️⃣  Quick VRAM profiling with a tiny prompt
# ──────────────────────────────────────────────────────────────────────
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(next(model.parameters()).device) for k, v in inputs.items()}

# Only touch the CUDA memory counters when a GPU is actually present.
cuda_ok = torch.cuda.is_available()
if cuda_ok:
    torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    _ = model.generate(**inputs, max_new_tokens=10)
if cuda_ok:
    peak_mem = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Peak VRAM used for inference with checkpointing: {peak_mem:.2f} GB")
else:
    print("No CUDA device available; skipping peak VRAM measurement")



## Step 4: Preparing Domain‑Specific Dataset with Hugging Face Datasets

When you want a language model to speak like a cardiologist, a lawyer, or a software engineer, you need to give it a *diet* of text that reflects that specialty. Think of the model as a sponge that absorbs water (text). If you pour in only kitchen recipes, it will never learn how to talk about heart disease. The Hugging Face `datasets` library is the kitchen sink that lets you fetch, clean, and feed the right kind of water into the sponge.

In this step we’ll:

1. **Pull a domain‑specific corpus** from the Hugging Face Hub or a local file.
2. **Split** it into training, validation, and test sets while preserving class balance.
3. **Tokenize** the raw text with the same tokenizer we used to load GPT‑OSS‑20B.
4. **Cache** the processed dataset for fast reuse.
5. **Inspect** a few examples to sanity‑check the pipeline.

### Key Terms Explained

- **Dataset**: A collection of data points (e.g., sentences, paragraphs) that the model will learn from.
- **Tokenization**: The process of converting raw text into a sequence of integer IDs that the model can understand.
- **Streaming**: Loading data lazily from disk or the internet to avoid memory overload.
- **Cache**: Storing the processed dataset on disk so that subsequent runs skip the expensive preprocessing step.
- **Shuffling**: Randomly reordering the data to prevent the model from learning spurious order effects.
- **Batching**: Grouping multiple examples together to take advantage of GPU parallelism.

### Why Domain‑Specific Datasets Matter

- **Relevance**: The model learns the vocabulary, style, and facts that are most useful for your target audience.
- **Bias Mitigation**: By curating the data, you can reduce unwanted stereotypes or misinformation.
- **Performance**: Fine‑tuning on a focused corpus often yields higher accuracy on downstream tasks than generic data.

### Trade‑offs to Keep in Mind

| Decision | Memory | Speed | Quality |
|----------|--------|-------|---------|
| **Full‑text tokenization** | High (tokens are stored) | Slower (more passes) | Highest (no loss of context) |
| **Chunking** (e.g., 512‑token windows) | Lower | Faster | Slightly lower (context truncated) |
| **Streaming + on‑the‑fly tokenization** | Minimal | Fastest | Depends on implementation |

In research, we usually favor full‑text tokenization with caching because it preserves the richest context while still being reproducible.

### Extra Explanatory Paragraph

The `datasets` library abstracts away many of the pain points of data handling. Internally it represents a dataset as a lazy, columnar structure that can be filtered, mapped, and split with declarative syntax. When you call `dataset.map(tokenize_function, batched=True)`, the library processes examples in batches (and across CPU cores when you pass `num_proc`) and writes the results to a cache directory (`~/.cache/huggingface/datasets`). This means that the next time you run the notebook, the heavy tokenization step is skipped, saving you minutes or hours. The trade‑off is that the cache can grow large (hundreds of MBs), so you should monitor disk usage or clean the cache (`dataset.cleanup_cache_files()`, a method on the dataset object) when necessary.

Now let’s turn theory into practice with a couple of short code cells.



In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1️⃣  Load a domain‑specific dataset (e.g., PubMed abstracts)
# ──────────────────────────────────────────────────────────────────────
import os
from datasets import Dataset, DatasetDict, load_dataset

# Example: PubMed abstracts from the Hugging Face Hub
# Replace with your own dataset path or name if needed
# NOTE(review): confirm this dataset id exists on the Hub and has an "abstract" column.
DATASET_NAME = "allenai/pubmed_abstracts"

# Load the dataset with streaming to avoid memory overload
print("Loading dataset…")
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

# Convert to a finite Dataset (we’ll take the first 10k examples for demo).
# A streamed dataset is an iterable with no length and no train_test_split,
# so the taken examples are materialized into an in-memory Dataset first.
print("Collecting 10,000 examples…")
train_data = Dataset.from_list(list(dataset.take(10000)))
if len(train_data) == 0:
    raise ValueError(f"No examples were read from {DATASET_NAME!r}")

# Split into train/validation (90/10).
# train_test_split returns a DatasetDict keyed "train"/"test" (not a tuple).
print("Splitting into train/validation…")
split = train_data.train_test_split(test_size=0.1, seed=42)
train_split, val_split = split["train"], split["test"]
train_dataset = DatasetDict({"train": train_split, "validation": val_split})

print(f"Train size: {len(train_dataset['train'])}")
print(f"Validation size: {len(train_dataset['validation'])}")



In [None]:
# ──────────────────────────────────────────────────────────────────────
# 2️⃣  Tokenize the dataset with the GPT‑OSS‑20B tokenizer
# ──────────────────────────────────────────────────────────────────────
import os  # used for the output path below; imported here so the cell is self-contained

from transformers import AutoTokenizer
import torch

# Load the same tokenizer used for the model
MODEL_NAME = "TheBloke/gpt-oss-20b-gptq-4bit-128g"
print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Define a tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to 512 tokens.

    Args:
        examples: a batch mapping column names to lists; reads the "abstract" column.

    Returns:
        The tokenizer's output for the batch.
    """
    # The dataset column is "abstract" for PubMed; adjust if yours differs
    return tokenizer(examples["abstract"], truncation=True, max_length=512)

# Apply tokenization in batched mode for speed
print("Tokenizing… (this may take a minute)")
# Drop every original column, not just "abstract", so that only the tokenizer's
# output fields remain.
original_columns = train_dataset["train"].column_names
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=original_columns,  # keep only tokenized fields
    num_proc=4,  # parallelize across 4 CPU cores
    load_from_cache_file=True,  # reuse cached results if available
)

# Verify the first example
print("First tokenized example:")
print(train_dataset['train'][0])

# Save the processed dataset for later reuse
CACHE_DIR = os.path.expanduser("~/.cache/huggingface/datasets/processed_pubmed")
print(f"Saving processed dataset to {CACHE_DIR}")
train_dataset.save_to_disk(CACHE_DIR)



## Step 5: Configuring Accelerate for Multi‑GPU Fine‑Tuning

Imagine you’re a conductor leading an orchestra that spans several concert halls (GPUs). Each hall has its own acoustics (memory, compute), but you want the music (training) to sound seamless across all of them. Hugging Face **Accelerate** is the conductor’s score: it tells each hall when to play, how many instruments to assign, and how to share the sheet music (model parameters) so that the whole symphony stays in tune.

In this step we’ll:

1. **Create an `accelerate` configuration** that tells the library how many GPUs to use, what precision to run in, and whether to enable gradient checkpointing.
2. **Generate a minimal training script** that imports the configuration, loads the 8‑bit GPT‑OSS‑20B, and starts distributed training.
3. **Launch the training** with `accelerate launch`, which automatically handles device placement, mixed‑precision, and logging.
4. **Verify reproducibility** by setting seeds and checking that the same random numbers are produced on every run.

By the end of this section you’ll have a ready‑to‑run training pipeline that scales across multiple GPUs without writing any boilerplate distributed‑training code.


### Key Terms Explained

- **Accelerate**: A lightweight library that abstracts away the complexities of distributed training, mixed‑precision, and device placement.
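- **`accelerate config`**: An interactive CLI that writes a YAML file describing the training environment (number of processes, GPUs, precision, etc.). By default the file is saved as `default_config.yaml` in the Accelerate cache directory; pass `--config_file accelerate_config.yaml` to write it to a path of your choice.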
- **Gradient Checkpointing**: A memory‑saving technique that recomputes activations during back‑propagation instead of storing them.
- **Mixed‑Precision (FP16/BF16)**: Using lower‑precision arithmetic for activations and gradients to reduce memory use and bandwidth while maintaining model accuracy.
- **Distributed Data Parallel (DDP)**: A PyTorch strategy that replicates the model on each GPU and synchronizes gradients automatically.
- **Reproducibility Seed**: A fixed integer that seeds all random number generators (Python, NumPy, PyTorch) to ensure deterministic behavior.

### Why Accelerate Matters

1. **Zero Boilerplate** – You don’t need to write `torch.distributed.init_process_group()` or manually wrap your model with `DistributedDataParallel`.
2. **Flexibility** – The same configuration works for single‑GPU, multi‑GPU, or even multi‑node setups.
3. **Performance** – Accelerate applies the mixed‑precision mode you select (FP16 or BF16) and handles gradient accumulation for you.
4. **Reproducibility** – By setting the same seed in every process, repeated runs start from the same random state. Note that some GPU kernels are non‑deterministic, so results can still differ slightly between runs.

### Trade‑offs to Keep in Mind

| Feature | Memory | Speed | Complexity |
|---------|--------|-------|------------|
| Gradient Checkpointing | ↓ | ↓ (activations are recomputed) | Moderate (requires `gradient_checkpointing=True`) |
| Mixed‑Precision | ↓ | ↑ | Low (handled by Accelerate) |
| Distributed Data Parallel | ↔ per‑GPU (each GPU holds a full model replica) | ↑ | Low (handled by Accelerate) |
| Full‑Precision | ↑ | ↓ | None |

In practice, combining gradient checkpointing, mixed precision, and DDP (together with the quantized weights used in this tutorial) gives a good memory‑speed trade‑off for a 20B‑parameter model on 24‑GB GPUs.


### How the Configuration Is Used

The `accelerate` configuration file is essentially a recipe that tells the training script how to orchestrate the orchestra. Internally, Accelerate parses the YAML, sets up the `accelerate.Accelerator` object, and injects the correct device map, precision, and distributed backend. When you run `accelerate launch train.py`, the CLI spawns one process per GPU, each process loads the same script, but the `Accelerator` ensures that each process only sees its own GPU and that gradients are synchronized across all processes. This design keeps the code simple while still leveraging the full power of multi‑GPU training.


In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1️⃣  Create an accelerate config file programmatically
# ──────────────────────────────────────────────────────────────────────
import os

import yaml
from accelerate import Accelerator

# Define the config dictionary.
# Only keys that Accelerate's config loader recognises are written here: the
# loader rejects a file containing unknown keys. Training-level options such as
# gradient accumulation, gradient checkpointing and logging are not part of
# this file; set them in the training script (e.g. via `TrainingArguments`).
config = {
    "compute_environment": "LOCAL_MACHINE",
    "distributed_type": "MULTI_GPU",
    "mixed_precision": "fp16",  # use mixed‑precision
    "num_machines": 1,
    "machine_rank": 0,
    "num_processes": 2,  # adjust to your GPU count
    "main_training_function": "main",
    "use_cpu": False,
}

# Write to accelerate_config.yaml
config_path = "accelerate_config.yaml"
with open(config_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(config, f)

print(f"Accelerate config written to {config_path}")

# ──────────────────────────────────────────────────────────────────────
# 2️⃣  Minimal training script (train.py)
# ──────────────────────────────────────────────────────────────────────
# Save this as train.py in the same directory as the config file.

# Note: This is a minimal example; for full training you’ll need a data loader, optimizer, etc.

import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from accelerate import Accelerator

# Reproducibility seed: seed every RNG the script touches
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Initialize accelerator (used below to find out which GPU this process owns)
accelerator = Accelerator()
use_cuda = torch.cuda.is_available()

# Load tokenizer and 8‑bit model.
# Under `accelerate launch` every process runs this script, so each process
# must place its replica on its *own* GPU. A fixed "cuda:0" would stack every
# replica on the first device.
# NOTE(review): the checkpoint name suggests GPTQ 4‑bit weights while
# `load_in_8bit=True` requests bitsandbytes 8‑bit loading — confirm which
# quantization is intended. Fine‑tuning a purely quantized model usually also
# requires trainable adapters (e.g. LoRA) — verify before a real run.
MODEL_NAME = "TheBloke/gpt-oss-20b-gptq-4bit-128g"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map={"": accelerator.local_process_index} if use_cuda else "cpu",
    load_in_8bit=True,
    torch_dtype=torch.float16,
)

# Dummy dataset: replace with your processed dataset
class DummyDataset(torch.utils.data.Dataset):
    """Placeholder dataset that yields the same short sentence 1000 times."""

    def __len__(self): return 1000
    def __getitem__(self, idx):
        text = "Once upon a time"
        enc = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
        # Causal-LM objective: the labels are the input ids themselves
        return {"input_ids": enc["input_ids"].squeeze(), "labels": enc["input_ids"].squeeze()}

train_dataset = DummyDataset()

# Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    logging_steps=10,
    fp16=use_cuda,  # fp16 mixed precision needs a GPU; disabled on the CPU fallback path
    dataloader_num_workers=0,
)

# No manual `accelerator.prepare(...)` here: `Trainer` drives Accelerate
# internally (device placement, DDP wrapping, mixed precision). `prepare`
# only handles models, optimizers, dataloaders and schedulers, so passing a
# Dataset or TrainingArguments does nothing, and wrapping the model before
# handing it to `Trainer` would wrap it twice.

# Trainer (you can also use a custom training loop)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()

# Save the fine‑tuned model
trainer.save_model("./fine_tuned_gpt_oss_20b")

print("Training complete and model saved.")


## Step 6: Fine‑Tuning with 8‑bit AdamW and Mixed Precision

> **TODO:** The content for this section was not generated correctly: the original cell contained the generator's planning notes instead of tutorial text. This section is meant to cover fine‑tuning with an 8‑bit AdamW optimizer and mixed precision.


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    """Build and return a one-line greeting addressed to ``name``."""
    greeting = f'Hello, {name}!'
    return greeting

print(greet())


## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    """Display a multiple-choice question with a Grade button and feedback.

    Args:
        question: Question text, rendered as a Markdown heading.
        options: Sequence of answer strings; they are labelled A, B, C, ...
        correct_index: Zero-based index into ``options`` of the right answer.
        explanation: Text shown below the verdict after grading (inserted
            into the feedback HTML as-is).

    Returns:
        None. The widgets are displayed as a side effect.
    """
    # Use (label, value) so rb.value is the numeric index
    choices = [(f'{chr(65+i)}. '+opt, i) for i,opt in enumerate(options)]
    # value=None starts with nothing selected. Without it the first option is
    # pre-selected, so the "please select" branch below could never run and
    # clicking Grade untouched would grade option A.
    rb = widgets.RadioButtons(options=choices, value=None, description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')
    def on_grade(_):
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'
    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
# Knowledge check 1: optimizer choice for 8-bit fine-tuning
render_mcq(
    question="Which optimizer is most suitable for 8‑bit fine‑tuning on a 20B model?",
    options=["AdamW", "SGD", "RMSprop", "Adagrad"],
    correct_index=0,
    explanation="AdamW with 8‑bit precision balances memory efficiency and convergence speed for large‑parameter models.",
)


In [None]:
# Knowledge check 2: purpose of gradient checkpointing
render_mcq(
    question="What is the primary benefit of gradient checkpointing?",
    options=[
        "Reduces GPU memory usage",
        "Speeds up inference",
        "Increases model accuracy",
        "Simplifies data preprocessing",
    ],
    correct_index=0,
    explanation="Gradient checkpointing trades compute for memory, enabling training of very large models on limited GPU resources.",
)


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
