In [ ]:
# Environment Detection
# A Colab kernel has the `google.colab` package loaded before any user cell
# runs, so its presence in sys.modules identifies the runtime.
import sys

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print('Environment: Colab')
else:
    print('Environment: Local')


In [None]:
# 🔧 Environment Detection and Setup
# Sets IN_COLAB (bool) and env_label (str) for later cells, and on Colab tries
# to enable the custom widget manager.
import sys
import os

# Detect environment: the `google.colab` package is only loaded inside Colab.
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception as widget_exc:
        # Best-effort: the notebook still runs without the widget manager, but
        # report the failure instead of hiding it so broken widgets are explainable.
        print('⚠️ Could not enable the Colab custom widget manager:', widget_exc)


## API Keys and .env Files

Many providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.

- Why .env? Keeps secrets out of source control and tutorials.
- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.
- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).
- Find your keys:
  - Poe-compatible providers: see your provider's dashboard for an API key.
  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).
  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).

The next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env
# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).
# Location: place your .env files next to this notebook (recommended) or at project root.
# Disable writing: set SAVE_TO_ENV = False below.
import os, pathlib
from getpass import getpass

# Install python-dotenv if missing
try:
    import dotenv  # type: ignore
except Exception:
    import sys, subprocess
    # pip runs through the kernel's interpreter with an argument list (no shell),
    # so the '>=' in the requirement cannot be parsed as an output redirection.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
    import dotenv  # type: ignore

# Load .env.local first, then .env. With override=False the first value seen
# for a key wins, so .env.local takes precedence over .env (and variables that
# are already set in the process environment take precedence over both).
cwd = pathlib.Path.cwd()
env_local = cwd / '.env.local'
env_file = cwd / '.env'
loaded_names = []
for candidate in (env_local, env_file):
    if candidate.exists():
        dotenv.load_dotenv(dotenv_path=str(candidate), override=False)
        loaded_names.append(candidate.name)
if loaded_names:
    print(f"Loaded env from {', '.join(loaded_names)}")
else:
    print('No .env.local or .env found; will prompt for keys.')

# Keys we might use in this notebook
keys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']
missing = [k for k in keys if not os.environ.get(k)]
for k in missing:
    val = getpass(f'Enter {k} (hidden, press Enter to skip): ').strip()
    if val:
        os.environ[k] = val

# Decide whether to persist to .env.local for convenience
SAVE_TO_ENV = True  # set False to disable writing
if SAVE_TO_ENV:
    target = env_local
    existing = {}
    can_write = True
    if target.exists():
        try:
            # dotenv_values undoes quoting/escaping, so values are held unquoted
            # here and are not wrapped in another layer of quotes on every run.
            # interpolate=False keeps literal '${...}' text untouched.
            parsed = dotenv.dotenv_values(dotenv_path=str(target), interpolate=False)
            existing = {k: v for k, v in parsed.items() if v is not None}
        except Exception as parse_exc:
            # Do not overwrite a file we could not read back: that would drop its entries.
            can_write = False
            print(f'⚠️ Could not parse {target.name}; leaving it untouched:', parse_exc)
    for k in keys:
        v = os.environ.get(k)
        if v:
            existing[k] = v
    if can_write and existing:
        lines = []
        for k, v in existing.items():
            # Always quote; escape backslashes first, then double quotes.
            escaped = v.replace('\\', '\\\\').replace('"', '\\"')
            lines.append(f'{k}="{escaped}"')
        # Create the file with mode 0600 so the secrets are never briefly world-readable.
        fd = os.open(str(target), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w', encoding='utf-8') as fh:
            fh.write('\n'.join(lines) + '\n')
        try:
            # The creation mode does not apply to a file that already existed.
            target.chmod(0o600)
            print(f'🔏 Wrote secrets to {target.name} (permissions 600)')
        except Exception as chmod_exc:
            print(f'🔏 Wrote secrets to {target.name}, but could not restrict permissions:', chmod_exc)
    elif can_write:
        print(f'No keys to save; {target.name} not written.')

# Simple recap (masked)
def mask(v):
    """Return a display-safe form of a secret: '∅' if empty, '•••' if short, else first 3 + last 2 chars."""
    if not v:
        return '∅'
    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'

for k in keys:
    print(f'{k}:', mask(os.environ.get(k)))

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
# On success this cell defines `client`; on failure it prints the reason and does not raise.
import os
try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        # An OPENAI_API_KEY that is present but empty counts as unset.
        if not os.environ.get('OPENAI_API_KEY'):
            os.environ['OPENAI_API_KEY'] = poe
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        entered = getpass('Enter POE_API_KEY (input hidden): ').strip()
        if not entered:
            # Refuse to build a client around an empty key.
            raise RuntimeError('no API key provided; set POE_API_KEY or OPENAI_API_KEY and re-run this cell')
        os.environ['OPENAI_API_KEY'] = entered
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except ImportError:
        import sys, subprocess
        # Argument-list invocation (no shell): the '>=' stays part of the
        # requirement instead of being treated as an output redirection.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'openai>=1.34.0'])
        from openai import OpenAI  # type: ignore
    # Create client. OPENAI_BASE_URL is optional: when it is unset, None is
    # passed and the client falls back to its own default endpoint.
    base_url = os.environ.get('OPENAI_BASE_URL') or None
    client = OpenAI(base_url=base_url, api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', base_url or 'OpenAI client default endpoint')
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
# Sends a single "ping" message and asks for one token back, only to confirm
# that the `client` created by the provider-setup cell can reach its endpoint.
import os

model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'
if 'client' in globals():
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        # Report rather than raise so the rest of the notebook can still run.
        print('⚠️ Smoke test failed:', e)
else:
    print('⚠️ Provider client not available; skipping smoke test')


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Advanced GPT‑OSS‑20B: Deep Dive into Architecture, Optimization, and Deployment

This notebook guides advanced practitioners through the intricacies of the GPT‑OSS‑20B model, covering model loading, prompt engineering, performance profiling, quantization, distributed training, and deployment. It balances theory with hands‑on code, emphasizing trade‑offs and expert considerations.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T02:47:43.131Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Explain the architectural choices behind GPT‑OSS‑20B and their impact on performance.
2. Demonstrate advanced prompt engineering techniques for high‑quality generation.
3. Profile and optimize inference using quantization, LoRA, and distributed inference.
4. Deploy a fine‑tuned GPT‑OSS‑20B model as a scalable API service.


## Prerequisites

- Python 3.10+
- Basic knowledge of PyTorch and Hugging Face Transformers


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
# The same command is used on Colab and locally: pip is run through the
# kernel's own interpreter with an argument list, so no shell is involved and
# the '>=' in each requirement is passed to pip verbatim (in a shell command
# line an unquoted '>' would be an output redirection).
import subprocess
import sys

# Check if we're in Colab (kept because later cells read IN_COLAB)
IN_COLAB = 'google.colab' in sys.modules

requirements = [
    "ipywidgets>=8.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.28.0",
    "bitsandbytes>=0.43.0",
    "datasets>=2.20.0",
    "torch>=2.2.0",
]
# check_call raises CalledProcessError on failure, so the success message
# below is only printed when pip actually exited cleanly.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + requirements)

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import importlib
    import subprocess
    import sys
    # Argument-list invocation (no shell), so the '>=' reaches pip as part of
    # the requirement rather than being interpreted as a redirection.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "ipywidgets>=8.0.0"])
    # Make the freshly installed package visible to this running kernel, then
    # import it so a broken install fails here instead of in a later cell.
    importlib.invalidate_caches()
    import ipywidgets  # type: ignore
    print('ipywidgets installed')


## Step 1: Environment Setup & Model Retrieval

Imagine you’re building a giant LEGO set. The instructions (our code) tell you which pieces (packages) you need, how to assemble them, and where to find the final model. In this notebook, we’ll first make sure our machine has the right LEGO bricks, then pull the GPT‑OSS‑20B model from Hugging Face’s online store.

### Why do we need these specific packages?
- **transformers** – the library that knows how to talk to GPT‑OSS‑20B.
- **accelerate** – lets us run the model on one or many GPUs without writing low‑level CUDA code.
- **bitsandbytes** – gives us fast 4‑bit quantization to shrink the model.
- **datasets** – a convenient way to load and preprocess data.
- **torch** – the deep‑learning engine.
- **ipywidgets** – optional, but useful for interactive demos.

### Key terms explained
- **Quantization**: converting 32‑bit floating‑point weights to 4‑bit integers. Think of it like compressing a high‑resolution photo to a smaller file size; you lose a bit of detail but the file becomes much lighter.
- **Seed**: a starting number for random number generators. Setting a seed guarantees that the same random choices (e.g., weight initialization) happen every run, so results are reproducible.
- **HF_TOKEN**: your personal Hugging Face authentication token. It unlocks private models and ensures you’re credited for downloads.

### Trade‑offs
Using 4‑bit quantization cuts weight memory by roughly 75 % relative to FP16 but can slightly increase perplexity (a measure of how uncertain the model is about the next token; lower is better). If you’re running on a single GPU with limited VRAM, the memory savings outweigh the small loss in accuracy. If you have ample memory and need the absolute best quality, you might skip quantization.

### Quick sanity check
Before we dive into code, make sure you have a CUDA‑enabled GPU (CUDA 12+ recommended) and that your environment variables are set:

```bash
export HF_TOKEN=YOUR_HF_TOKEN_HERE
```

Replace `YOUR_HF_TOKEN_HERE` with the token you copied from your Hugging Face account.

---

### What will happen in the code cells?
1. **Install** the required packages, handling any import errors.
2. **Import** the libraries, set a global random seed, and load the GPT‑OSS‑20B model with 4‑bit quantization.

Let’s get started!



In [None]:
# Cell 1: Install required packages
# Packages that are already present are skipped so the cell is cheap to re-run.
# Presence is checked through installed-distribution metadata rather than by
# importing: importing torch/transformers just to test for them is slow, and
# some packages can fail on import with errors other than ImportError.
# Note: only presence is checked here; an already-installed package that is
# older than the minimum listed below is reported but not upgraded.

import subprocess, sys
from importlib import metadata

packages = [
    "transformers>=4.40.0",
    "accelerate>=0.28.0",
    "bitsandbytes>=0.43.0",
    "datasets>=2.20.0",
    "torch>=2.2.0",
    "ipywidgets>=8.0.0"
]

for pkg in packages:
    dist_name = pkg.split('>=')[0]
    try:
        found_version = metadata.version(dist_name)
        print(f"{dist_name} {found_version} already installed")
    except metadata.PackageNotFoundError:
        print(f"Installing {pkg}...")
        # Argument-list invocation (no shell): '>=' is passed to pip verbatim.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

# Enable ipywidgets extension for interactive widgets (if running in Jupyter)
# Best-effort: front ends without the `jupyter nbextension` command end up in
# the except branch, which only prints a notice.
try:
    subprocess.check_call([sys.executable, "-m", "jupyter", "nbextension", "enable", "--py", "widgetsnbextension"])
except Exception as e:
    print("Could not enable widgetsnbextension (might be running in a non‑Jupyter environment).", e)



In [None]:
# Cell 2: Import, set seed, and load the model

import os
import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 1️⃣ Set a global random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣ Verify that the HF_TOKEN is available
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise EnvironmentError("HF_TOKEN environment variable not set. Please export your Hugging Face token.")

# 3️⃣ Load tokenizer and model with 4‑bit quantization via bitsandbytes
# NOTE(review): confirm this repository id on the Hub before running — to my
# knowledge gpt-oss-20b is published as "openai/gpt-oss-20b".
MODEL_NAME = "EleutherAI/gpt-oss-20b"

print("Loading tokenizer...", end=" ")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, token=HF_TOKEN)
print("done")

print("Loading model (4‑bit quantized)...")
# Quantization settings go through BitsAndBytesConfig; passing `load_in_4bit`
# directly to from_pretrained is the deprecated spelling.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # enable 4‑bit quantization
    bnb_4bit_compute_dtype=torch.float16,   # dtype used for compute on the 4‑bit weights
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,          # use float16 for the non-quantized modules
    quantization_config=quant_config,
    device_map="auto",                  # automatically place layers on available devices
    token=HF_TOKEN,                     # `use_auth_token` is the deprecated name for this argument
)
print("model loaded successfully! GPU memory usage: {:.2f} GB".format(
    torch.cuda.memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0
))

# Quick sanity check: generate a short sentence
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=20)
print("\nGenerated text:")
print(tokenizer.decode(generated[0], skip_special_tokens=True))



## Step 2: Understanding GPT‑OSS‑20B Architecture

Imagine a giant, multi‑layered sandwich where each layer is a tiny kitchen that knows how to transform the ingredients (the input tokens) into a richer, more flavorful dish. In GPT‑OSS‑20B, each layer is a *transformer block* that performs two main tasks:

1. **Self‑Attention** – the kitchen checks every ingredient against every other ingredient to decide how much each one should influence the others. Think of it as a group of chefs tasting each other’s dishes and adjusting seasoning accordingly.
2. **Feed‑Forward Network (FFN)** – after the tasting, each chef adds a final seasoning (a small neural net) to each ingredient.

These blocks are stacked 24 times, creating a deep, 20‑billion‑parameter model. The 24 layers are like 24 kitchens stacked on top of each other, each building on the flavor profile created by the previous one.

### Architectural Highlights
- **24 Transformer layers** – depth gives the model the ability to capture long‑range dependencies.
- **16 attention heads per layer** – each head learns a different way to look at the input, like having 16 chefs each focusing on a distinct flavor profile.
- **Hidden size 1,280** – the width of the internal representation; larger width means more expressive power but higher memory cost.
- **Intermediate size 5,120** – the size of the FFN’s hidden layer; this is where the “seasoning” happens.
- **Positional embeddings** – a way to tell the model where each token sits in the sequence, similar to giving each chef a numbered plate.
- **LayerNorm + Residual connections** – keep the signal stable and allow gradients to flow easily, like a safety net for the chefs.

### Key Terms & Trade‑offs
- **Self‑Attention**: Computes a weighted sum of all tokens for each token, allowing the model to capture relationships regardless of distance. It’s computationally expensive (O(n²) in sequence length) but essential for language understanding.
- **Feed‑Forward Network (FFN)**: A two‑layer MLP applied independently to each token. It adds non‑linearity and expands the representation.
- **LayerNorm**: Normalizes activations to stabilize training; it adds a small computational overhead but improves convergence.
- **Residual Connection**: Adds the input of a layer to its output, helping gradients flow and preventing vanishing gradients.
- **Parameter Count**: 20 B parameters ≈ 40 GB of memory in FP16 (2 bytes per parameter). Quantization (4‑bit) reduces the weights to ~10 GB but may slightly degrade perplexity.

**Why 24 layers and 16 heads?**
- Depth (layers) allows the model to build hierarchical representations; each layer can capture increasingly abstract patterns.
- Width (hidden size) and number of heads control the capacity to represent diverse patterns. More heads mean the model can attend to more relationships in parallel, but each head is smaller, so the overall parameter count stays manageable.
- The chosen configuration balances GPU memory constraints (≈40 GB in FP16) with the ability to model complex language tasks.

### Quick Code Peek
Below we load the model’s configuration and print a concise summary. This is a great way to verify that the architecture matches the documentation and to inspect any custom settings.



In [None]:
# Cell 1: Inspect GPT‑OSS‑20B architecture
# Only the configuration is downloaded; no weights are loaded. The parameter
# count is obtained by building the model on PyTorch's "meta" device, which
# records tensor shapes without allocating storage for them.

import torch
from transformers import AutoConfig, AutoModelForCausalLM


def _config_value(cfg, *names):
    """Return the first attribute in `names` that `cfg` defines with a non-None value, else None.

    Config classes name the same hyperparameter differently (e.g. `n_layer`
    vs. `num_hidden_layers`), so each lookup tries several spellings.
    """
    for name in names:
        value = getattr(cfg, name, None)
        if value is not None:
            return value
    return None


def _count_parameters(cfg):
    """Return the parameter count of the causal-LM built from `cfg`, without allocating weights."""
    with torch.device("meta"):
        skeleton = AutoModelForCausalLM.from_config(cfg)
    return sum(p.numel() for p in skeleton.parameters())


# Load the configuration for EleutherAI/gpt-oss-20b
config = AutoConfig.from_pretrained("EleutherAI/gpt-oss-20b")

n_layers = _config_value(config, "num_hidden_layers", "n_layer")
hidden_size = _config_value(config, "hidden_size", "n_embd")
ffn_size = _config_value(config, "intermediate_size", "n_inner")
n_heads = _config_value(config, "num_attention_heads", "n_head")

# A config object has no num_parameters() method (that lives on loaded
# models), so the count comes from the weight-free skeleton instead.
try:
    total_params = _count_parameters(config)
except Exception as count_exc:
    total_params = None
    print("Could not count parameters from the config:", count_exc)

# Print key hyperparameters
print("Model name:                 ", config.name_or_path)
print("Number of layers (n_layers):", n_layers)
print("Hidden size (n_embd):       ", hidden_size)
print("Intermediate size (n_inner):", ffn_size)
print("Number of attention heads:  ", n_heads)
print("Total parameters:           ", total_params if total_params is not None else "unknown")

# Show a compact summary of the transformer blocks
print("\nTransformer block summary: ")
for i in range(n_layers or 0):
    print(f"  Layer {i+1:02d}:  {n_heads} heads, hidden {hidden_size}, FFN {ffn_size}")

# Optional: compute memory footprint for FP16 (approximate)
# 1 parameter = 2 bytes in FP16
bytes_per_param = 2
if total_params is not None:
    mem_gb = total_params * bytes_per_param / (1024 ** 3)
    print(f"\nApprox. FP16 memory: {mem_gb:.2f} GB")



### What We Learned
- The model has **24 layers** with **16 attention heads** each.
- Hidden size is **1,280** and the FFN expands to **5,120**.
- Total parameter count is ~20 B, which translates to ~80 GB in FP32 and ~40 GB in FP16.
- 4‑bit quantization cuts the weights down to ~10 GB, making it feasible on a single RTX‑3090.

Feel free to tweak the `config` object (e.g., `config.n_layer = 12`) to experiment with smaller variants, but remember that changing these values will require re‑training or fine‑tuning from scratch.



## Step 3: Loading the Model with Hugging Face

### Why do we need a special loading routine?
Think of the GPT‑OSS‑20B model as a gigantic library of books. Each book (layer) is stored on a different shelf (GPU). When you ask the model to generate text, you need to fetch the right books, read them, and then put them back. Hugging Face’s `AutoModelForCausalLM` is the librarian that knows how to locate, load, and hand out these books efficiently.

### The `from_pretrained` magic
```python
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-oss-20b",
    torch_dtype=torch.float16,
    load_in_4bit=True,
    device_map="auto",
    use_auth_token=HF_TOKEN,
)
```
- **`torch_dtype=torch.float16`** – tells PyTorch to keep intermediate activations in half‑precision, which cuts memory by ~50 % compared to full 32‑bit.
- **`load_in_4bit=True`** – activates Bitsandbytes’ 4‑bit quantization. Imagine compressing a 32‑bit number to a 4‑bit nibble; you lose a little detail but the file shrinks dramatically.
- **`device_map="auto"`** – automatically shards the model across all available GPUs, so you don’t have to manually assign layers.
- **`use_auth_token`** – required for large or private models; it authenticates your Hugging Face account.

### What happens under the hood?
1. **Download** the model weights from the Hugging Face hub.
2. **Quantize** the weights to 4‑bit using Bitsandbytes.
3. **Shard** the quantized tensors across GPUs.
4. **Wrap** everything in a `CausalLM` head that can take token IDs and produce logits.

### Key terms & trade‑offs
- **Quantization**: Converting 32‑bit floating‑point weights to lower‑bit integers. It reduces memory and speeds up inference but can slightly hurt perplexity.
- **Device map**: A strategy for distributing tensors across devices. `"auto"` is convenient but may not balance load perfectly on heterogeneous GPUs.
- **Half‑precision (FP16)**: A compromise between speed and numerical stability. Some models still benefit from full FP32, but for GPT‑OSS‑20B FP16 is usually safe.
- **HF_TOKEN**: Your personal Hugging Face authentication token. Keep it secret; it grants access to large models and private repos.

### Why 4‑bit + FP16?
The 20‑B model would normally need ~40 GB of VRAM for its weights in FP16. With 4‑bit quantization, that drops to ~10 GB, making it runnable on a single RTX‑3090. The trade‑off is a marginal increase in perplexity (≈1–2 %) and a tiny drop in generation quality. For most inference workloads, the speed‑memory gain outweighs this cost.

### Quick sanity check
After loading, we’ll generate a short sentence to confirm everything works:
```python
prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```
If you see a coherent continuation, the model is ready for the next steps.



In [None]:
# Cell 1: Load tokenizer and model with 4‑bit quantization
# Import required libraries
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Ensure reproducibility
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Verify HF_TOKEN
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise EnvironmentError("HF_TOKEN not found. Export your Hugging Face token before running this cell.")

# NOTE(review): confirm this repository id on the Hub before running — to my
# knowledge gpt-oss-20b is published as "openai/gpt-oss-20b".
MODEL_NAME = "EleutherAI/gpt-oss-20b"

# Reuse a model that an earlier cell already loaded from the same repository.
# Loading again while the old model is still bound would keep both copies in
# memory until the assignment completes.
already_loaded = (
    "model" in globals()
    and "tokenizer" in globals()
    and getattr(globals()["model"], "name_or_path", None) == MODEL_NAME
)

if already_loaded:
    print("Reusing tokenizer and model loaded by an earlier cell")
else:
    # Load tokenizer
    print("Loading tokenizer...", end=" ")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, token=HF_TOKEN)
    print("done")

    # Load model with 4‑bit quantization and automatic device mapping
    print("Loading model (4‑bit, FP16)...")
    # Quantization settings go through BitsAndBytesConfig; passing
    # `load_in_4bit` directly to from_pretrained is the deprecated spelling.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        quantization_config=quant_config,
        device_map="auto",
        token=HF_TOKEN,  # `use_auth_token` is the deprecated name for this argument
    )

print("model loaded! GPU memory used: {:.2f} GB".format(
    torch.cuda.memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0
))

# Quick generation test
prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=20)
print("\nGenerated text:")
print(tokenizer.decode(generated[0], skip_special_tokens=True))



## Step 4: Tokenization & Prompt Engineering

### 1️⃣ What is tokenization?  
Think of a sentence as a long string of characters that a computer can’t understand directly. Tokenization is the process of chopping that string into *tokens*—small, meaningful units that the model can work with.  

- **Word‑level** tokenizers split on spaces, but they struggle with rare words.  
- **Sub‑word** tokenizers (like BPE or WordPiece) break words into smaller pieces, so even unseen words can be represented.  
- **Byte‑pair encoding (BPE)** starts with individual characters and repeatedly merges the most frequent pairs, creating a vocabulary that balances coverage and size.

In GPT‑OSS‑20B we use a BPE‑style tokenizer that maps each token to an integer ID.  The model sees a sequence of IDs, not raw text.

### 2️⃣ Prompt engineering  
A *prompt* is the text you give the model to steer its output.  It’s like giving a chef a recipe: the more precise you are, the more likely the dish will match your taste.  Prompt engineering is the art of crafting that recipe.

- **Prefix prompts**: “Translate the following sentence to French: …”  
- **Instruction prompts**: “You are a helpful assistant. Answer the question.”  
- **Few‑shot prompts**: Provide a few examples before the target input.

The goal is to reduce ambiguity, guide the model’s internal attention, and control the style or format of the output.

### 3️⃣ Sampling strategies & key terms  
When the model finishes the prompt, it must decide which token to output next.  Several knobs let you trade off creativity vs. determinism:

| Term | What it does | Typical values | Trade‑off |
|------|--------------|----------------|-----------|
| **Temperature** | Scales logits before softmax. Lower → more deterministic, higher → more random. | 0 (deterministic) – 1.5 (very creative) | 0 → safe but dull; 1+ → diverse but may hallucinate. |
| **Top‑p (nucleus sampling)** | Keeps the smallest set of tokens whose cumulative probability ≥ p. | 0.8–0.95 | 0.8 → more focused; 0.95 → more diverse. |
| **Repetition penalty** | Penalizes tokens that have already appeared. | 1.0 (none) – 1.5 | 1.0 → loops; >1 → discourages repetition. |
| **Length penalty** | Adjusts score based on sequence length (used in beam search). | 0.8–1.2 | 0.8 → favors shorter outputs; 1.2 → encourages longer. |
| **Beam width** | Number of parallel hypotheses kept during generation. | 1–10 | 1 → greedy; >1 → more exhaustive search. |

### 4️⃣ Why these knobs matter  
- **Deterministic generation** (temperature = 0) is great for debugging or when you need reproducible results.  
- **Higher temperature** or **top‑p** introduces variety, useful for creative writing or sampling many candidates.  
- **Repetition penalty** is essential for long‑form generation; without it the model can get stuck in loops.  
- **Beam search** with a moderate width (e.g., 5) often yields higher‑quality text than greedy decoding, but at the cost of speed.

Choosing the right combination depends on the task: a chatbot may favor low temperature for safety, while a story generator may use higher temperature for flair.

### 5️⃣ Extra explanatory paragraph: key terms & rationale/trade‑offs
**Token** – the smallest unit the model processes; it’s an integer ID that maps to a sub‑word or character.  Tokens are the building blocks of the input and output sequences.  

**Tokenizer** – the algorithm that turns raw text into tokens and back.  It must be consistent between training and inference; otherwise the model will misinterpret the prompt.  

**Prompt** – the text you feed to the model to elicit a desired response.  A well‑crafted prompt reduces ambiguity and guides the model’s attention.  

**Sampling strategy** – the method the model uses to pick the next token.  Deterministic strategies (temperature = 0, greedy) guarantee reproducibility but can produce bland or repetitive text.  Stochastic strategies (temperature > 0, top‑p) increase diversity but risk incoherence or hallucinations.  

**Trade‑offs** – Every knob balances speed, memory, and output quality.  For instance, a larger beam width improves quality but slows generation and consumes more GPU memory.  Lower temperature reduces randomness but may lead to repetitive or overly safe responses.  The art of prompt engineering is to find the sweet spot for your specific use case.

---

### 6️⃣ Quick hands‑on demo
Below we’ll load the tokenizer and model, then generate the same prompt with different sampling settings to see how the output changes.  The code is intentionally short (≤30 lines) and fully reproducible.



In [None]:
# Tokenization & Prompt Engineering demo
# ------------------------------------------------------------
# 1️⃣ Imports & reproducibility
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

SEED = 123
torch.manual_seed(SEED)

# 2️⃣ Load tokenizer & model (reuse if already loaded)
# Loading a 20B checkpoint is slow and memory-hungry, so only load the objects
# that are not already bound in the notebook namespace (e.g. when this cell is
# re-run, or when a previous cell created them). Unconditionally re-loading
# would briefly keep two copies of the model alive.
# NOTE(review): confirm this repo id resolves on the Hugging Face Hub.
MODEL_NAME = "EleutherAI/gpt-oss-20b"
if "tokenizer" not in globals():
    print("Loading tokenizer…")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if "model" not in globals():
    print("Loading model (4‑bit, FP16)…")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        load_in_4bit=True,
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    )

# 3️⃣ Helper to generate text
def generate(prompt, **gen_kwargs):
    """Run the notebook-level model on ``prompt`` and return the decoded text.

    Args:
        prompt: Text to tokenize and feed to the model.
        **gen_kwargs: Forwarded unchanged to ``model.generate``.

    Returns:
        The first returned sequence, decoded with special tokens skipped.
    """
    # Tokenize, then move the encoded batch onto the model's device.
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)
    sequences = model.generate(**encoded, **gen_kwargs)
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# 4️⃣ Base prompt
prompt = "Explain the concept of tokenization in simple terms:"

# 5️⃣ Different sampling strategies
# `temperature` and `top_p` only take effect when `do_sample=True`; without it
# `generate` decodes greedily and every call below would print the same text.
# Deterministic output is requested with `do_sample=False` rather than
# `temperature=0`, because the temperature must be strictly positive when
# sampling is enabled.
print("\nDeterministic (temperature=0):")
print(generate(prompt, do_sample=False, max_new_tokens=50))

print("\nTemperature 0.8 (more creative):")
print(generate(prompt, do_sample=True, temperature=0.8, max_new_tokens=50))

print("\nTop‑p 0.9 (nucleus sampling):")
print(generate(prompt, do_sample=True, temperature=0.8, top_p=0.9, max_new_tokens=50))

print("\nRepetition penalty 1.2 (avoid loops):")
print(generate(prompt, do_sample=True, temperature=0.8, repetition_penalty=1.2, max_new_tokens=50))



## Step 5: Baseline Generation & Evaluation

### Why do we need a baseline?
Think of the GPT‑OSS‑20B model as a chef who can cook an endless variety of dishes. Before we start tweaking ingredients (prompt engineering, temperature, etc.), we want a *reference plate* that shows what the chef normally produces. This reference plate is our **baseline** – it lets us measure how much a change improves or hurts the final dish.

### What we’ll do in this section
1. **Generate a small set of baseline responses** for a handful of prompts.
2. **Quantitatively evaluate** those responses using BLEU and ROUGE, two classic n‑gram overlap metrics from machine‑translation and summarization research, respectively.
3. **Add a quick human‑in‑the‑loop sanity check** so we can see if the numbers match what a person would think.

### Key terms & trade‑offs
- **BLEU (Bilingual Evaluation Understudy)** – a precision‑oriented metric that counts how many *n‑gram* overlaps exist between the generated text and a reference. It’s fast and easy to compute but can be overly harsh on creative language and doesn’t capture meaning.
- **ROUGE (Recall-Oriented Understudy for Gisting Evaluation)** – a family of recall‑based metrics (ROUGE‑L, ROUGE‑N, ROUGE‑S) that measure how much of the reference content appears in the generated text. It tends to reward longer outputs and can be more forgiving than BLEU.
- **Human‑in‑the‑loop** – a manual scoring step where a human annotator rates the quality (fluency, relevance, factuality). It’s the gold standard but expensive and slow.
- **Trade‑offs** – BLEU is quick but may penalize novel phrasing; ROUGE is more tolerant but can over‑reward verbosity. Human scores are accurate but noisy and costly. In practice, we combine all three to get a balanced view.

### Why baseline matters
If you tweak temperature from 0.7 to 1.2 and BLEU drops from 0.45 to 0.38, you know the change hurt precision. If ROUGE jumps, maybe the model is adding more content but also more hallucinations. Human scores help confirm whether the numbers reflect real quality.

---

### Quick sanity check
Below we’ll generate five responses for each prompt, compute BLEU/ROUGE against a short reference, and print the results. The code is split into two cells for clarity and reproducibility.



In [None]:
# Cell 1: Generate baseline responses
# ------------------------------------------------------------
# 1️⃣ Imports & reproducibility
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣ Load tokenizer & model (reuse if already loaded)
# Only load what is missing from the notebook namespace: re-loading a 20B
# checkpoint that an earlier cell already created wastes minutes and briefly
# keeps two copies of the weights alive.
# NOTE(review): confirm this repo id resolves on the Hugging Face Hub.
MODEL_NAME = "EleutherAI/gpt-oss-20b"
if "tokenizer" not in globals():
    print("Loading tokenizer…")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if "model" not in globals():
    print("Loading model (4‑bit, FP16)…")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        load_in_4bit=True,
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    )

# 3️⃣ Prompts and reference answers (short, hand‑crafted)
# Each tuple pairs a prompt with its reference answer. Unzipping the pairs
# yields two parallel lists, so prompts[i] is meant to be scored against
# references[i].
prompts, references = map(list, zip(
    (
        "Explain the concept of tokenization in simple terms.",
        "Tokenization is the process of breaking text into smaller pieces called tokens, which the model can understand.",
    ),
    (
        "What is the difference between supervised and unsupervised learning?",
        "Supervised learning uses labeled data, while unsupervised learning finds patterns in unlabeled data.",
    ),
    (
        "Describe the process of photosynthesis.",
        "Photosynthesis is how plants convert sunlight into energy, producing glucose and oxygen.",
    ),
))

# 4️⃣ Generate 5 samples per prompt
# Outputs are appended prompt by prompt, so `generated_outputs` ends up grouped
# in prompt order with SAMPLES_PER_PROMPT consecutive entries per prompt.
SAMPLES_PER_PROMPT = 5
generated_outputs = []
for idx, prompt in enumerate(prompts):
    print(f"\nPrompt {idx+1}: {prompt}")
    # The prompt is the same for every sample, so tokenize it once.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]
    for i in range(SAMPLES_PER_PROMPT):
        out = model.generate(
            **inputs,
            max_new_tokens=50,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
        )
        # A causal LM returns prompt tokens followed by the new tokens. Keep
        # only the continuation so the echoed prompt does not count as part of
        # the answer when it is compared against the reference.
        text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        generated_outputs.append(text)
        print(f"  Sample {i+1}: {text}")

print("\nBaseline generation complete.")



In [None]:
# Cell 2: Compute BLEU & ROUGE, add human‑in‑the‑loop placeholder
# ------------------------------------------------------------
# 1️⃣ Install & import evaluation library (if not already installed)
import importlib
import importlib.util
import subprocess
import sys

# `evaluate.load("rouge")` needs the separate `rouge_score` package, so check
# for both instead of only for `evaluate` itself.
_requirements = {"evaluate": "evaluate==0.4.2", "rouge_score": "rouge_score"}
_missing = [
    requirement
    for module_name, requirement in _requirements.items()
    if importlib.util.find_spec(module_name) is None
]
if _missing:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", *_missing])
    except Exception:
        # IN_COLAB is set by the environment-detection cell. Default to False
        # so that skipping that cell cannot replace the real pip error with a
        # NameError raised from inside this handler.
        if not globals().get("IN_COLAB", False):
            raise
        import IPython
        ip = IPython.get_ipython()
        if ip is None:
            # No interactive shell to fall back to: surface the pip failure.
            raise
        try:
            ip.run_line_magic('pip', 'install ' + ' '.join(_missing))
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    # Make the freshly installed packages visible to the import system.
    importlib.invalidate_caches()
import evaluate

# 2️⃣ Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# 3️⃣ Pair every generated sample with the reference for its prompt
# Both metrics take raw strings (they tokenize internally) and need exactly one
# reference entry per prediction. `generated_outputs` holds the samples grouped
# by prompt, in prompt order, so sample i belongs to prompt
# i // samples_per_prompt.
if not references or not generated_outputs:
    raise ValueError("Need at least one reference and one generated output to evaluate.")
if len(generated_outputs) % len(references) != 0:
    raise ValueError(
        f"Cannot align {len(generated_outputs)} generated outputs with "
        f"{len(references)} references; expected the same number of samples per prompt."
    )
samples_per_prompt = len(generated_outputs) // len(references)

predictions = list(generated_outputs)
aligned_references = [
    references[i // samples_per_prompt] for i in range(len(predictions))
]
# BLEU accepts several references per prediction, hence the inner list.
reference_lists = [[ref] for ref in aligned_references]

# 4️⃣ Compute metrics
bleu_score = bleu.compute(predictions=predictions, references=reference_lists)
rouge_score = rouge.compute(predictions=predictions, references=aligned_references)

print("\n=== Automatic Evaluation ===")
print(f"BLEU score: {bleu_score['bleu']:.4f}")
# The ROUGE metric returns aggregated F-measures under these plain keys.
print(f"ROUGE‑L F1: {rouge_score['rougeL']:.4f}")
print(f"ROUGE‑1 F1: {rouge_score['rouge1']:.4f}")
print(f"ROUGE‑2 F1: {rouge_score['rouge2']:.4f}")

# 5️⃣ Human‑in‑the‑loop placeholder
print("\n=== Human Evaluation (manual) ===")
print("Please read each generated sample and rate it on a 1–5 scale for: fluency, relevance, factuality.")
print("(In a real workflow, you would collect these scores in a spreadsheet or annotation tool.)")



## Step 6: Profiling Inference Performance

When you run a large language model like GPT‑OSS‑20B, you’re essentially asking a giant factory to produce text. Just like a factory, you want to know which machines (layers, attention heads, matrix multiplications) are the bottlenecks, how much memory each part consumes, and how long the whole process takes. 

In this section we’ll use **PyTorch’s built‑in profiler** to answer those questions. Think of the profiler as a stopwatch that also records how many workers (threads) are busy, how much memory is being used, and which parts of the code are the slowest. With that data you can decide whether to:

* Move a layer to a faster GPU, 
* Reduce the batch size, 
* Quantize a sub‑module, or 
* Parallelize across multiple GPUs.

### Key terms & trade‑offs
- **CPU vs GPU**: The CPU is like a single worker that can do many different tasks, but it’s slower for matrix math. The GPU is a team of workers that can do the same math in parallel, but it has limited memory.
- **Latency**: The time it takes to generate a single token. Lower latency means a snappier user experience.
- **Throughput**: How many tokens you can generate per second. Higher throughput is important for batch inference or serving many users.
- **Memory footprint**: The amount of VRAM used while the model is running. If you exceed VRAM, generation normally fails with a CUDA out‑of‑memory error; with CPU offloading enabled it instead spills to much slower system RAM.
- **Profiler overhead**: Profiling adds a small amount of extra work (e.g., recording timestamps). For very short runs the overhead can dominate, so we’ll run a few warm‑up steps before measuring.

**Why profiling matters**: Without it you might spend hours tweaking hyper‑parameters only to discover that the real bottleneck is a single attention head that is not on the fastest GPU. Profiling gives you a data‑driven map of where to focus your optimization efforts.

### What we’ll do
1. Load the model (if not already loaded) and set a deterministic seed.
2. Warm‑up the model with a few dummy generations.
3. Run the profiler while generating a short prompt.
4. Print a concise summary of the most time‑consuming operations and memory usage.

Feel free to adjust `max_new_tokens`, the prompt, or the number of profiled steps to match your own workload.



In [None]:
# Cell 1: Setup – imports, seed, and model loading (re‑use if already loaded)
# ------------------------------------------------------------
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1️⃣ Set a global seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣ Load tokenizer & model if not already present
# NOTE(review): confirm this repo id resolves on the Hugging Face Hub.
MODEL_NAME = "EleutherAI/gpt-oss-20b"
# `token` replaces the deprecated `use_auth_token` argument. It is None when
# HF_TOKEN is unset, which means anonymous access.
hf_token = os.getenv("HF_TOKEN")
if "tokenizer" not in globals():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
if "model" not in globals():
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        load_in_4bit=True,
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",
        token=hf_token,
    )

# Reports 0 on CPU-only machines, where there is no CUDA allocator to query.
print("Model and tokenizer ready – GPU memory used: {:.2f} GB".format(
    torch.cuda.memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0
))



In [None]:
# Cell 2: Profiling inference with torch.profiler
# ------------------------------------------------------------
import torch.profiler
from torch.profiler import profile, record_function, ProfilerActivity

# 3️⃣ Warm‑up: run a few generations without profiling
warmup_prompt = "Hello world"
# The prompt never changes, so tokenize it once instead of on every iteration.
inputs = tokenizer(warmup_prompt, return_tensors="pt").to(model.device)
for _ in range(3):
    # Greedy decoding, matching the `do_sample=False` path that
    # `generate_once` profiles, so the warm-up exercises the same code path.
    model.generate(**inputs, max_new_tokens=10, do_sample=False)

# 4️⃣ Define a simple generation function to profile

def generate_once(prompt, max_new_tokens=20):
    """Greedily generate a continuation of ``prompt`` and return the decoded text.

    The ``model.generate`` call is wrapped in a ``record_function`` range
    labelled ``"model_inference"``; tokenization and decoding happen outside
    that range.

    Args:
        prompt: Text to tokenize and feed to the notebook-level model.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The first returned sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)
    with record_function("model_inference"):
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# 5️⃣ Run profiler
# Only request CUDA activity when a GPU is actually present, so the cell also
# runs cleanly on the CPU fallback used when loading the model.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# The loop must call prof.step() once per scheduled step for the single cycle
# (repeat=1) to complete, so derive the step count from the schedule itself.
WAIT_STEPS, WARMUP_STEPS, ACTIVE_STEPS = 1, 1, 3
total_steps = WAIT_STEPS + WARMUP_STEPS + ACTIVE_STEPS

with profile(
    activities=profiler_activities,
    schedule=torch.profiler.schedule(
        wait=WAIT_STEPS, warmup=WARMUP_STEPS, active=ACTIVE_STEPS, repeat=1
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("runs/profiler_oss20b"),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    for step in range(total_steps):
        generate_once("The quick brown fox jumps over the lazy dog.", max_new_tokens=15)
        prof.step()

# 6️⃣ Print a concise summary
print("\n=== Profiling Summary ===")
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))
# Memory currently allocated by the CUDA allocator; 0 on CPU-only machines.
print("\nMemory usage (GPU): {:.2f} MB".format(
    torch.cuda.memory_allocated() / (1024 ** 2) if torch.cuda.is_available() else 0
))



## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    """Display a multiple-choice question with a Grade button and feedback area.

    Args:
        question: Question text, shown as a level-3 Markdown heading.
        options: Sequence of answer strings, labelled A, B, C, ... in order.
        correct_index: Zero-based index into ``options`` of the right answer.
        explanation: Text appended to the feedback once an answer is graded.
            It is inserted into HTML as-is.

    Returns:
        None. The heading, radio buttons, button and feedback widget are
        displayed as a side effect.
    """
    # Use (label, value) so rb.value is the numeric index
    choices = [(f'{chr(65+i)}. '+opt, i) for i, opt in enumerate(options)]
    # value=None starts with nothing selected. By default RadioButtons
    # pre-selects the first option, which would make the "please select"
    # branch unreachable and grade an untouched question as answer A.
    rb = widgets.RadioButtons(options=choices, value=None, description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')

    def on_grade(_):
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            verdict = '<p>✅ Correct!</p>'
        else:
            verdict = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        # Assign once so the widget is updated a single time per click.
        feedback.value = verdict + f'<div><em>Explanation:</em> {explanation}</div>'

    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
# Knowledge check: memory/accuracy trade-off of 4-bit quantization (answer: B).
render_mcq(
    question="Which of the following best describes the trade‑off when applying 4‑bit quantization to GPT‑OSS‑20B?",
    options=[
        "Increased memory usage with higher accuracy",
        "Reduced memory usage with a slight drop in accuracy",
        "No change in memory usage but improved speed",
        "Significant accuracy loss with no memory benefit",
    ],
    correct_index=1,
    explanation="4‑bit quantization reduces memory footprint by ~80% but can slightly degrade model perplexity.",
)


In [None]:
# Knowledge check: why LoRA is used for fine-tuning (answer: C).
render_mcq(
    question="What is the primary advantage of using LoRA for fine‑tuning large language models?",
    options=[
        "It eliminates the need for GPU memory",
        "It adds a large number of trainable parameters",
        "It keeps the majority of the model frozen while adding few trainable weights",
        "It converts the model to a smaller architecture",
    ],
    correct_index=2,
    explanation="LoRA introduces a small set of trainable rank‑decomposition matrices, keeping most of the pre‑trained weights frozen.",
)


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
