In [ ]:
# Environment Detection
# IN_COLAB is True when the `google.colab` module is already loaded in this process.
import sys

IN_COLAB = 'google.colab' in sys.modules
print('Environment: {}'.format("Colab" if IN_COLAB else "Local"))


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Detect environment: IN_COLAB is True when the google.colab module is already loaded.
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception as widget_exc:
        # Best-effort: the notebook still works without the custom widget manager,
        # but report the failure so missing widgets later are easy to diagnose.
        print('⚠️ Could not enable the Colab custom widget manager:', widget_exc)


## API Keys and .env Files

Many providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.

- Why .env? Keeps secrets out of source control and tutorials.
- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.
- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).
- Find your keys:
  - Poe-compatible providers: see your provider's dashboard for an API key.
  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).
  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).

The next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env
# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).
# Location: place your .env files next to this notebook (recommended) or at project root.
# Disable writing: set SAVE_TO_ENV = False below.
import os, pathlib
from getpass import getpass

# Install python-dotenv if missing
try:
    import dotenv  # type: ignore
except Exception:
    import sys, subprocess
    if 'IN_COLAB' in globals() and IN_COLAB:
        try:
            import IPython
            ip = IPython.get_ipython()
            if ip is not None:
                # The requirement is quoted because the %pip magic hands its
                # arguments to a shell, where a bare `>=` is an output redirection.
                ip.run_line_magic('pip', 'install -q "python-dotenv>=1.0.0"')
            else:
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    else:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
    import dotenv  # type: ignore

# Prefer .env.local over .env (only the first one found is loaded)
cwd = pathlib.Path.cwd()
env_local = cwd / '.env.local'
env_file = cwd / '.env'
chosen = env_local if env_local.exists() else (env_file if env_file.exists() else None)
if chosen:
    dotenv.load_dotenv(dotenv_path=str(chosen))
    print(f'Loaded env from {chosen.name}')
else:
    print('No .env.local or .env found; will prompt for keys.')

# Keys we might use in this notebook
keys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']
missing = [k for k in keys if not os.environ.get(k)]
for k in missing:
    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')
    if val:
        os.environ[k] = val

# Decide whether to persist to .env.local for convenience
SAVE_TO_ENV = True  # set False to disable writing
if SAVE_TO_ENV:
    target = env_local
    existing = {}
    readable = True
    if target.exists():
        try:
            # Parse with python-dotenv so quotes and escapes are decoded; a naive
            # split on '=' would keep the surrounding quotes and re-quote them on
            # every run. interpolate=False keeps `${...}` inside secrets verbatim.
            parsed = dotenv.dotenv_values(str(target), interpolate=False)
            existing = {name: value for name, value in parsed.items() if value is not None}
        except Exception as read_exc:
            # Do not overwrite a file we could not read: that would drop its entries.
            readable = False
            print(f'⚠️ Could not read {target.name}; leaving it untouched: {read_exc}')
    if readable:
        for k in keys:
            v = os.environ.get(k)
            if v:
                existing[k] = v
        if existing:
            lines = []
            for k, v in existing.items():
                # Always quote; escape backslashes and double quotes for safety
                escaped = v.replace('\\', '\\\\').replace('"', '\\"')
                lines.append(f'{k}="{escaped}"')
            # Create the file with mode 0600 from the start so the secrets are
            # never briefly readable by other users.
            fd = os.open(str(target), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, 'w', encoding='utf-8') as fh:
                fh.write('\n'.join(lines) + '\n')
            try:
                # Also tighten a pre-existing file whose mode was looser.
                target.chmod(0o600)  # 600
                print(f'🔏 Wrote secrets to {target.name} (permissions 600)')
            except Exception as chmod_exc:
                print(f'🔏 Wrote secrets to {target.name} (could not restrict permissions: {chmod_exc})')
        else:
            # An empty .env.local would be picked over .env on the next run.
            print(f'No keys to save; {target.name} not written.')


# Simple recap (masked)
def mask(v):
    """Return a display-safe form of a secret.

    Empty/None -> '∅'; values longer than 6 characters -> first 3 and last 2
    characters joined by '…'; anything shorter -> '•••'.
    """
    if not v:
        return '∅'
    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'


for k in keys:
    print(f'{k}:', mask(os.environ.get(k)))

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
import os
import subprocess
import sys

POE_BASE_URL = 'https://api.poe.com/v1'
OPENAI_REQUIREMENT = 'openai>=1.34.0'


def _pip_install(requirement):
    """Install a single pip requirement into the running kernel's environment.

    In Colab (per the notebook's IN_COLAB flag, or the presence of the
    google.colab module when the flag is undefined) the %pip magic is used when
    an IPython shell is available; otherwise pip is invoked via `python -m pip`.

    Raises:
        subprocess.CalledProcessError: if the `python -m pip` call fails.
    """
    in_colab = globals().get('IN_COLAB', 'google.colab' in sys.modules)
    if in_colab:
        try:
            import IPython
            ip = IPython.get_ipython()
            if ip is not None:
                # Quoted: %pip hands its arguments to a shell, where a bare `>=`
                # would be treated as an output redirection.
                ip.run_line_magic('pip', f'install -q "{requirement}"')
                return
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    # List-form argv bypasses the shell, so the version specifier needs no quoting.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', requirement])


try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    # NOTE(review): setdefault keeps an existing OPENAI_API_KEY, so a Poe base URL
    # can end up paired with a non-Poe key when both keys are set — confirm intent.
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', POE_BASE_URL)
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        entered_key = getpass('Enter POE_API_KEY (input hidden): ').strip()
        if not entered_key:
            # Fail with a clear message instead of storing an empty key.
            raise ValueError('no API key was entered')
        os.environ['OPENAI_API_KEY'] = entered_key
        os.environ.setdefault('OPENAI_BASE_URL', POE_BASE_URL)
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        _pip_install(OPENAI_REQUIREMENT)
        from openai import OpenAI  # type: ignore
    # Create client. OPENAI_BASE_URL is optional: when only OPENAI_API_KEY is
    # set, pass None so the client falls back to its own default endpoint
    # rather than failing with a KeyError here.
    base_url = os.environ.get('OPENAI_BASE_URL') or None
    client = OpenAI(base_url=base_url, api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', base_url or 'default OpenAI endpoint')
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
# Sends a single "ping" message limited to one completion token; skipped when
# no `client` object exists in the notebook namespace.
import os

model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'
if 'client' in globals():
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        # Report but do not re-raise, so the rest of the notebook can still run.
        print('⚠️ Smoke test failed:', e)
else:
    print('⚠️ Provider client not available; skipping smoke test')


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Getting Started with GPT-OSS 20B: A Beginner's Guide

This lesson walks absolute beginners through everything needed to run the GPT-OSS 20B model locally. You’ll learn how to set up the environment, load the model, and run it in interactive notebooks with simple examples, all explained with everyday analogies and hands‑on exercises.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T03:51:37.448Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Understand what GPT-OSS 20B is and why it matters.
2. Set up a reproducible Jupyter environment with ipywidgets for demos.
3. Load the 20B model and run a basic text generation.
4. Identify common pitfalls and best practices for working with large language models.


## Prerequisites

- Basic Python knowledge (no deep learning required).
- Access to a machine with an NVIDIA GPU or sufficient CPU memory.


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
import shutil
import subprocess
import sys

# Check if we're in Colab
IN_COLAB = 'google.colab' in sys.modules

# "python>=3.10" and "git" are prerequisites of this lesson, not packages pip
# can provide, so they are checked here instead of being passed to pip (where
# they made the whole install command fail).
if sys.version_info < (3, 10):
    print(f'⚠️ Python 3.10+ is recommended; this kernel runs {sys.version.split()[0]}')
if shutil.which('git') is None:
    print('⚠️ git was not found on PATH; install it with your system package manager')

# One code path for Colab and local kernels: list-form argv bypasses the shell,
# so `>=` in a version specifier is not interpreted as an output redirection
# (which is what happened with the unquoted `!pip install ...>=...` line).
requirements = ["pip>=23.1", "ipynb", "ipywidgets>=8.0.0"]
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *requirements])

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import importlib
    import sys, subprocess
    requirement = 'ipywidgets>=8.0.0'
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", '-q', requirement])
    except Exception:
        # Only Colab gets a second attempt through the %pip magic. Fall back to
        # module detection when the notebook's IN_COLAB flag was never defined.
        if not globals().get('IN_COLAB', 'google.colab' in sys.modules):
            raise
        import IPython
        ip = IPython.get_ipython()
        if ip is None:
            raise
        try:
            # Quoted: %pip hands its arguments to a shell, where a bare `>=`
            # would be treated as an output redirection.
            ip.run_line_magic('pip', f'install -q "{requirement}"')
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    # Confirm the install actually made the package importable in this kernel.
    importlib.invalidate_caches()
    import ipywidgets  # type: ignore
    print('ipywidgets installed')


# Step 1: Meet GPT‑OSS 20B

Welcome to the first step of your journey with GPT‑OSS 20B! Think of GPT‑OSS as a gigantic library of sentences that has read almost every book, article, and webpage on the internet. The *20B* part tells you how many “words” (more precisely, *parameters*) it has memorized—20 billion of them. That’s a lot of memory, but it also means the model can understand and generate text that feels surprisingly human.

## Why 20B matters
- **Scale vs. Speed**: A larger model usually gives better, more nuanced answers, but it also needs more GPU memory and takes longer to run. 20B is a sweet spot for many developers: it’s powerful enough for creative writing, code generation, and conversation, yet can still fit on a single high‑end GPU when loaded in reduced precision (see the memory trade‑offs below).
- **Fine‑tuning friendliness**: With 20B, you can fine‑tune on a small dataset (a few thousand lines) and get a specialized bot without needing a super‑cluster.

## Key terms
| Term | What it means | Why it matters |
|------|---------------|----------------|
| **Parameter** | A numeric weight inside the neural network that the model learns during training. | The more parameters, the more patterns the model can capture. |
| **Checkpoint** | A saved snapshot of all parameters at a specific training step. | Allows you to load a pre‑trained model without training from scratch. |
| **Tokenizer** | A tool that splits text into tokens (words, sub‑words, or characters). | GPT‑OSS uses a byte‑pair‑encoding (BPE) tokenizer to convert your prompt into numbers the model can process. |
| **Inference** | Running the model to generate predictions (text) from a prompt. | This is what you’ll do in the notebook. |

## Trade‑offs to keep in mind
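- **Memory vs. Latency**: Weights alone take about 4 bytes per parameter in FP32 and 2 bytes in FP16, so 20 billion parameters need roughly 80 GB and 40 GB respectively. Half precision (FP16) halves the footprint but is still far more than a 12 GB GPU can hold; to fit on a card that small you also need *quantization* (for example, 4‑bit weights at roughly 10 GB) or CPU offloading. The trade‑off is a small drop in accuracy in exchange for a large reduction in memory.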
- **Speed vs. Quality**: Generating longer passages or using higher temperature settings will slow down inference. For quick demos, keep the prompt short and temperature low.

## Quick sanity check
Below we’ll import the library, load the tokenizer, and print a short snippet of the model’s architecture. This will confirm that everything is wired up correctly.

> **Note**: If you see an error about missing GPU drivers, you might need to install CUDA 12.1 or switch to CPU mode.

## What you’ll learn
- How to import GPT‑OSS and its tokenizer.
- How to inspect the model’s configuration.
- How to set a random seed for reproducible results.

Let’s dive in!



In [None]:
# Import the GPT‑OSS library and set a reproducible seed
# The seed ensures that any random choices (e.g., token sampling) are the same each run
import torch
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Import the model and tokenizer
# NOTE(review): this import needs the `gpt-oss` package, which the notebook only
# installs in Step 2 — confirm it is available before running this cell.
from gpt_oss import GPTOSS, GPTOSSTokenizer

# Load the tokenizer (this is lightweight and fast)
print("Loading tokenizer…")
try:
    tokenizer = GPTOSSTokenizer.from_pretrained("gpt-oss-20b")
except Exception as e:
    print("Error loading tokenizer:", e)
    raise

# Pick the device first, then a precision that suits it: FP16 on a GPU for
# speed and memory, FP32 on the CPU fallback because half precision is a
# GPU-oriented format and is not reliably supported by CPU kernels.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device != "cpu" else torch.float32

print("Loading model… (this may take a minute)\n")
try:
    model = GPTOSS.from_pretrained("gpt-oss-20b", device_map=device, torch_dtype=dtype)
except Exception as e:
    print("Error loading model:", e)
    raise

# Quick sanity check: print the number of parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"Model loaded with {num_params/1e9:.2f} B parameters.")

# Show the model config
print("\nModel config snippet:")
print(model.config)



# Step 2: Set Up Your Jupyter Notebook

Before we can play with GPT‑OSS, we need a playground that lets us write code, see results, and even build interactive widgets. Think of Jupyter as a *digital notebook* where each page (cell) can contain either a paragraph of text or a snippet of code that runs right away. It’s like a recipe book that lets you tweak ingredients on the fly.

## Why Jupyter matters
- **Live coding**: You can run a line, see the output, and immediately tweak it. No need to write a script, run it, and then open a log file.
- **Rich media**: Images, tables, and interactive widgets can be embedded directly in the notebook.
- **Reproducibility**: By saving the notebook, you capture the exact sequence of commands that produced your results.

## Key terms (and why they matter)
| Term | What it is | Why you care |
|------|------------|--------------|
| **Notebook** | A file with `.ipynb` extension that mixes Markdown and executable code cells. | It’s the main interface for experimenting with GPT‑OSS. |
| **ipywidgets** | A library that turns Python objects into interactive UI elements (sliders, buttons, etc.). | Lets you build demos where you can change temperature or prompt length without editing code. |
| **nbextension** | A Jupyter extension that enables additional features, such as the widgets UI. | Without enabling it, the interactive widgets won’t render. |
| **Environment variable** | A key‑value pair that tells programs where to look for resources (e.g., `GPT_OSS_MODEL_HOME`). | Keeps your model checkpoints organized and portable across machines. |

## Trade‑offs to keep in mind
- **Installation time vs. convenience**: Installing `ipywidgets` and enabling the nbextension takes a few seconds, but it saves you from having to write custom HTML/JavaScript later.
- **GPU vs. CPU**: If you’re on a machine without a GPU, the notebook will still run but will be slower. The setup steps are the same; only the `torch_dtype` flag changes.
- **Version compatibility**: Using the latest `ipywidgets` (≥ 8.0.0) ensures smooth integration with JupyterLab 4.x. Older versions may require additional configuration.

## What you’ll do in this section
1. Install the required Python packages.
2. Enable the widgets extension so you can build interactive demos later.
3. Verify that Jupyter and the extensions are working.

Let’s get started!



In [None]:
# 1️⃣ Install the required packages
# We use the `-q` flag for quiet output and `--upgrade` to ensure we have the latest compatible versions.
# Each package is installed on its own so a failure names the package that caused it.

import subprocess, sys

packages = [
    "gpt-oss==0.1.0",  # the core library
    "ipywidgets>=8.0.0"  # interactive widgets
]

for pkg in packages:
    print(f"Installing {pkg}…")
    try:
        # List-form argv bypasses the shell, so `>=` / `==` need no quoting here.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--upgrade", pkg])
    except Exception as exc:
        # Only Colab gets a second attempt through the %pip magic. Fall back to
        # module detection when the notebook's IN_COLAB flag was never defined.
        ip = None
        if globals().get('IN_COLAB', 'google.colab' in sys.modules):
            import IPython
            ip = IPython.get_ipython()
        if ip is None:
            print(f"❌ Failed to install {pkg}: {exc}")
            raise
        try:
            # Quoted: %pip hands its arguments to a shell, where a bare `>=`
            # would be treated as an output redirection.
            ip.run_line_magic('pip', f'install -q --upgrade "{pkg}"')
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise

print("✅ All packages installed successfully.")


In [None]:
# 2️⃣ Enable the widgets nbextension
# This command tells Jupyter to load the JavaScript that powers ipywidgets.
# If you’re using JupyterLab, the extension is enabled automatically.

import subprocess, sys
from importlib import metadata

try:
    print("Enabling widgets nbextension…")
    subprocess.check_call([sys.executable, "-m", "jupyter", "nbextension", "enable", "--py", "widgetsnbextension", "--sys-prefix", "-q"])
    print("✅ Widgets extension enabled.")
except subprocess.CalledProcessError as e:
    # Non-fatal: the command exits non-zero when the `nbextension` subcommand
    # or Jupyter itself is unavailable in this environment.
    print(f"⚠️  Could not enable widgets: {e}")
    print("Make sure you have Jupyter installed and the extension is available.")

# 3️⃣ Quick sanity check: print Jupyter version
# The importable `jupyter` module has no `__version__` attribute, so read the
# version from the installed distribution metadata of jupyter_core instead.
try:
    print(f"Jupyter version: {metadata.version('jupyter_core')}")
except metadata.PackageNotFoundError:
    print("⚠️  Could not determine the Jupyter version (jupyter_core is not installed).")


# Step 3: Load & Inspect the Model

In Step 1 we pulled the tokenizer and the heavy‑weight 20‑billion‑parameter model into memory.  Think of the model as a gigantic Lego set: each block (parameter) is a tiny piece of knowledge that the model uses to build sentences.  In this step we’ll *inspect* that Lego set to make sure it’s the right size, understand its internal architecture, and run a quick test to see it in action.

## Why inspection matters
- **Debugging**: If the model fails to load or behaves oddly, a quick look at the config can reveal mismatched vocab sizes or missing layers.
- **Performance tuning**: Knowing the number of layers, hidden size, and attention heads helps you decide whether to run on FP16, FP32, or quantized mode.
- **Reproducibility**: Printing the config and seed guarantees that anyone else can replicate your results.

## Key terms (and why they matter)
| Term | What it is | Why you care |
|------|------------|--------------|
| **Device map** | A mapping that tells PyTorch which GPU or CPU each part of the model lives on. | It lets you spread a huge model across multiple GPUs or keep it on a single GPU with sharding. |
| **torch_dtype** | The numerical precision (e.g., `float16`, `float32`) used for the model’s weights. | Lower precision saves memory and speeds up inference but can slightly degrade quality. |
| **Config** | A dictionary of hyper‑parameters that describe the model’s architecture (e.g., `num_hidden_layers`, `hidden_size`). | It’s the blueprint that the library uses to rebuild the model from scratch. |
| **Checkpoint** | A saved snapshot of all model weights. | It’s what you download once and then load many times. |

## Trade‑offs to keep in mind
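- **Memory vs. Speed**: FP16 cuts memory usage by ~50 % relative to FP32 and typically improves throughput on modern GPUs.  Even so, FP16 weights for 20 billion parameters take roughly 40 GB, so on a 12 GB GPU you will need to combine reduced precision with quantization or CPU offloading.  FP32 gives the most accurate results but will almost certainly run out of memory (OOM).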
- **Precision vs. Quality**: Quantization (e.g., 8‑bit) can reduce memory further but may introduce small artifacts.  For most demos, FP16 is the sweet spot.
- **Sharding vs. Single‑GPU**: `device_map="auto"` automatically shards the model across all available GPUs.  If you only have one GPU, you’ll need to enable `torch_dtype=torch.float16` and possibly use `accelerate` for automatic sharding.

## Quick sanity check
Below we’ll:
1. Load the tokenizer and model again (this time with a clear device map).
2. Print the total number of parameters and a snippet of the config.
3. Run a tiny forward pass to generate a short sentence.

> **Tip**: If you see an out‑of‑memory error, try adding `torch_dtype=torch.float16` and/or `device_map="auto"`.

Let’s dive in!



In [None]:
# 1️⃣ Load tokenizer & model with reproducible settings
# -----------------------------------------------------------
# Import libraries
import gc
import os
import torch
import random
import numpy as np

# Set a fixed seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Optional: ensure the environment variable points to the weights folder
os.environ.setdefault("GPT_OSS_MODEL_HOME", os.path.expanduser("~/.gpt-oss"))

# Import GPT‑OSS components
from gpt_oss import GPTOSS, GPTOSSTokenizer

# Load tokenizer (lightweight, no GPU needed)
print("Loading tokenizer…")
try:
    tokenizer = GPTOSSTokenizer.from_pretrained("gpt-oss-20b")
except Exception as e:
    print("❌ Tokenizer load failed:", e)
    raise

# Release any model left over from an earlier cell BEFORE loading again.
# Otherwise the old weights stay referenced by `model` until the new ones are
# fully loaded, and two copies of a 20B model must fit in memory at once.
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Single-device placement: the first GPU when CUDA is available, else the CPU.
# FP16 is used on the GPU; the CPU fallback uses FP32 because half precision is
# a GPU-oriented format and is not reliably supported by CPU kernels.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device != "cpu" else torch.float32

print("Loading model… (this may take a minute)\n")
try:
    model = GPTOSS.from_pretrained(
        "gpt-oss-20b",
        device_map=device,   # one explicit device (no automatic sharding)
        torch_dtype=dtype,   # half precision on GPU, full precision on CPU
    )
except Exception as e:
    print("❌ Model load failed:", e)
    raise

# 2️⃣ Inspect the model
# ---------------------
# Total number of parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"✅ Model loaded with {num_params/1e9:.2f} B parameters.")

# Show a concise config snippet
print("\nModel config snippet:")
config = model.config
print(f"  vocab_size: {config.vocab_size}")
print(f"  hidden_size: {config.hidden_size}")
print(f"  num_hidden_layers: {config.num_hidden_layers}")
print(f"  num_attention_heads: {config.num_attention_heads}")
# NOTE(review): whether GPTOSS exposes `device_map` is not visible here; fall
# back to the requested device so this informational print cannot abort the cell.
print(f"  device_map: {getattr(model, 'device_map', device)}")

# 3️⃣ Quick forward pass – generate a short sentence
# ---------------------------------------------------
prompt = "Once upon a time"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

# Generate 20 tokens after the prompt
generated_ids = model.generate(
    input_ids,
    max_new_tokens=20,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id  # same padding choice as the Step 4 cells
)

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("\nGenerated text: \n", generated_text)



In [None]:
# 4️⃣ Optional: Visualize the attention pattern for the first token
# ---------------------------------------------------------------
# Runs one forward pass over `input_ids` with attention outputs enabled, then
# plots the weights of head 0 at query position 0 from the last layer as a
# single-row heatmap.  Handy for debugging and for teaching.

import matplotlib.pyplot as plt
import seaborn as sns

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(input_ids, output_attentions=True)

# Last layer's attention; presumably (batch, heads, seq_len, seq_len) — confirm
attn = outputs.attentions[-1]
print("\nAttention shape (last layer):", attn.shape)

# First batch item, first head, first query position -> CPU numpy vector
head0 = attn[0, 0, 0].cpu().numpy()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(head0[None, :], cmap="viridis", cbar=False, ax=ax)
ax.set_title("Attention of first token (head 0) over sequence")
ax.set_xlabel("Token position")
ax.set_ylabel("Token")
plt.show()



# Step 4: Generate Your First Prompt

Now that the model is up and running, it’s time to give it a *prompt*—the little seed of text that tells the model what you want it to write. Think of the prompt like a question you ask a friend: the clearer and more specific you are, the better the answer you’ll get.

## What is a *prompt*?
A prompt is simply a string of text that you feed into the model’s tokenizer. The tokenizer turns that string into a sequence of integer IDs (tokens) that the neural network can understand. The model then predicts the next token in the sequence, one by one, until it reaches a stopping condition (like a maximum length or an end‑of‑sentence token).

## Key terms you’ll see in the code
| Term | What it means | Why it matters |
|------|---------------|----------------|
| **prompt** | The user‑supplied text that starts the generation. | It anchors the model’s output in a context you care about. |
| **temperature** | A float that controls randomness in token sampling. | Low values (≈0.2) make the output deterministic; high values (≈1.0) add creativity. |
| **top‑p (nucleus sampling)** | A probability threshold: sampling is restricted to the smallest set of most‑likely tokens whose cumulative probability reaches *p*. | It balances diversity and coherence; a lower *p* keeps the output more focused. |
| **max_new_tokens** | The maximum number of tokens the model will generate after the prompt. | It limits latency and output length. |
| **do_sample** | Boolean flag that tells the model to sample from the probability distribution instead of picking the highest‑probability token. | Sampling is required for temperature and top‑p to have an effect. |
| **seed** | A fixed integer that initializes the random number generator. | Ensures reproducible generations across runs. |
| **device** | The hardware (CPU or GPU) where tensors and the model live. | Using a GPU dramatically speeds up inference. |

## Trade‑offs to keep in mind
- **Determinism vs. Creativity**: Setting `do_sample=False` (greedy decoding, where `temperature` and `top_p` have no effect) will always produce the same output for a given prompt, which is great for debugging but not for creative writing. Enabling sampling and raising the temperature introduces variability.
- **Speed vs. Length**: `max_new_tokens` controls how many words the model will produce. More tokens mean longer generation time and higher memory usage.
- **Precision vs. Memory**: Running the model in `torch.float16` (FP16) halves memory usage and doubles throughput on modern GPUs, but a tiny loss in numerical precision can slightly affect the output.
- **Sampling Strategy**: Temperature and top‑p are complementary. A high temperature with a low top‑p can still produce coherent text, but if you want truly diverse outputs, increase both.

## Rationale for the code below
We’ll keep the code short and focused on the core generation call. The snippet sets a reproducible seed, builds a prompt, and calls `model.generate`. It also demonstrates how to tweak temperature and top‑p to see the effect on the output. The code assumes `model` and `tokenizer` are still in memory from Step 3, so run that cell first; any runtime error (e.g., out‑of‑memory) will surface as an ordinary exception.

> **Tip**: If you’re running on a machine with a single 12 GB GPU, keep `torch_dtype=torch.float16` and `device_map="auto"` to avoid OOM errors.



In [None]:
# 1️⃣ Generate text from a simple prompt
# -------------------------------------------------
# Import required libraries (already imported in previous steps, but re‑import for safety)
import torch
import random
import numpy as np

# Re‑seed every RNG used here (Python, NumPy, PyTorch) so the sampled text is reproducible
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Assume `model` and `tokenizer` are already loaded from Step 3
# If not, you can load them again (see Step 3 for details)

# Define a short, concrete prompt
prompt = "Write a short poem about a sunrise over the ocean"

# Tokenize with tokenizer(...) rather than tokenizer.encode(...) so we also get the
# attention mask. Because pad_token_id is set to the EOS id below, `generate` cannot
# reliably infer the mask on its own, so we pass it explicitly.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded["input_ids"]  # kept as a top-level name: the next cell reuses it
attention_mask = encoded.get("attention_mask")

# Generate 50 new tokens with a moderate temperature and top‑p
generated_ids = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=50,          # keep output short for demo purposes
    temperature=0.7,            # a bit of randomness, but not too wild
    top_p=0.9,                  # nucleus sampling for coherence
    do_sample=True,             # enable sampling (required for temperature/top_p)
    pad_token_id=tokenizer.eos_token_id  # avoid warnings on padding
)

# Decode the generated token IDs back to text (the output includes the prompt itself)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("\n=== Generated Text ===\n")
print(generated_text)



In [None]:
# 2️⃣ Quick experiment: change temperature and top‑p
# -------------------------------------------------------
# Feel free to modify the values below to see how the output changes.
# Lower temperature (0.2) → more deterministic, safer output.
# Higher temperature (1.0) → more creative, riskier output.
# Lower top_p (0.8) → stricter selection of tokens.
# Higher top_p (0.95) → more diverse token pool.
#
# Reuses `input_ids`, `SEED`, `model` and `tokenizer` from the cells above.

for temp, top in [(0.2, 0.9), (1.0, 0.9), (0.7, 0.95)]:
    # Restart the sampler from the same seed for every setting. Otherwise each run
    # starts from whatever RNG state the previous run left behind, so differences
    # between outputs would mix "different parameters" with "different random draws",
    # and re-running this cell would print different text each time.
    torch.manual_seed(SEED)
    gen_ids = model.generate(
        input_ids,
        max_new_tokens=30,
        temperature=temp,
        top_p=top,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    print(f"\n--- Temp={temp}, Top‑p={top} ---")
    print(text)



# Step 5: Experiment with Temperature & Top‑P

In the last step we saw how to generate text with a fixed set of hyper‑parameters.  Now we’ll treat *temperature* and *top‑p* as knobs on a soundboard and see how turning them changes the music the model plays.

## What are we turning?
- **Temperature** is like the volume knob on a radio.  A low temperature (≈0.2) makes the model play the most popular, safe songs.  A high temperature (≈1.0) lets it try out more obscure tracks, sometimes producing surprising melodies.
- **Top‑p (nucleus sampling)** is a filter that keeps only the smallest set of most likely next notes whose cumulative probability reaches *p*.  Think of it as a “focus” button that says, “Only play the most probable notes that together account for 90 % of the probability.”  A low *p* keeps the song tight; a high *p* opens the door to more variety.

## Key terms

| Term | What it means | Why it matters |
|------|---------------|----------------|
| **Logits** | Raw, un‑scaled scores the model outputs for every token. | They are the raw “opinions” before turning into probabilities. |
| **Softmax** | The mathematical function that turns logits into a probability distribution. | It ensures the probabilities sum to 1, making sampling possible. |
| **Sampling** | Randomly picking the next token according to the probability distribution. | It injects creativity; deterministic generation uses the highest‑probability token instead. |
| **Temperature scaling** | Dividing logits by *temperature* before softmax. | Low temperatures sharpen the distribution (more deterministic), high temperatures flatten it (more random). |
| **Top‑p threshold** | The cumulative probability *p* that the kept tokens must reach: sampling is restricted to the smallest set of most likely tokens whose probabilities sum to at least *p*. | It limits the token pool to the most promising candidates, balancing coherence and diversity. |

**Rationale & trade‑offs**
- **Determinism vs. Creativity**: Setting `temperature=0.0` and `do_sample=False` guarantees the same output every time, which is great for debugging or reproducible demos.  Raising temperature or enabling sampling introduces variability, useful for creative writing but harder to test.
- **Coherence vs. Surprise**: A low top‑p (e.g., 0.8) keeps the model within a tight, coherent band of tokens, reducing the chance of nonsensical jumps.  A higher top‑p (e.g., 0.95) opens the door to more unexpected words, which can be exciting but may also break the narrative flow.
- **Speed vs. Length**: Neither temperature nor top‑p changes the cost of a single generation step (the model still scores the full vocabulary for every token).  Higher temperature can, however, cause the model to wander longer before hitting a stop token, slightly increasing latency.
- **Memory vs. Precision**: These parameters do not change memory usage, but the choice of `torch_dtype` (FP16 vs. FP32) does.  FP16 halves memory and doubles throughput on modern GPUs, at the cost of a tiny numerical precision loss.

## Quick sanity check
Below we’ll:
1. Define a helper that generates text given temperature and top‑p.
2. Run a few experiments to see how the output changes.
3. Wrap the helper in an interactive widget so you can play with the knobs in real time.

> **Tip**: Keep `torch_dtype=torch.float16` and `device_map="auto"` if you’re on a single 12 GB GPU to avoid out‑of‑memory errors.



In [None]:
# 1️⃣ Helper to generate text with temperature & top‑p
# ----------------------------------------------------------
# Assumes `model` and `tokenizer` are already loaded (see Step 4)
# We keep the seed fixed for reproducibility.
import torch, random, numpy as np

# Pin the Python, NumPy and PyTorch random number generators to one shared seed.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)


def generate_text(prompt, temp=0.7, top_p=0.9, max_new=50):
    """Return a generated string for *prompt*.

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Parameters
    ----------
    prompt : str
        The seed text.
    temp : float
        Temperature scaling. ``None`` or a value ``<= 0`` switches to greedy
        (deterministic) decoding instead of sampling, because sampling needs a
        strictly positive temperature.
    top_p : float
        Nucleus sampling threshold. Ignored when decoding greedily.
    max_new : int
        Max tokens to generate.

    Returns
    -------
    str
        The decoded first returned sequence, with special tokens removed.
    """
    # tokenizer(...) (rather than tokenizer.encode) also yields the attention mask,
    # which `generate` cannot reliably infer when pad_token_id equals the EOS id.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    gen_kwargs = {
        "max_new_tokens": max_new,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temp is not None and temp > 0:
        # Cast to float so integer inputs such as ``temp=1`` are accepted.
        gen_kwargs.update(do_sample=True, temperature=float(temp), top_p=float(top_p))
    else:
        # Non-positive temperature: fall back to greedy decoding rather than failing.
        gen_kwargs["do_sample"] = False

    gen_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded.get("attention_mask"),
        **gen_kwargs,
    )
    return tokenizer.decode(gen_ids[0], skip_special_tokens=True)

# Quick demo: every prompt is run with three (temperature, top‑p) settings
prompts = ["Once upon a time", "The quick brown fox", "In a galaxy far, far away"]
settings = [
    (0.2, 0.9),
    (0.7, 0.9),
    (1.0, 0.95),
]

# `demo_prompt` (not `prompt`) so the loop does not overwrite the notebook-level
# `prompt` variable defined in Step 4.
for demo_prompt in prompts:
    print(f"\n=== Prompt: {demo_prompt} ===")
    for temp, top in settings:
        # Restart the sampler from the same seed before each call so the outputs
        # differ because of the settings, not because of leftover RNG state, and so
        # that re-running the cell reproduces the same text.
        torch.manual_seed(SEED)
        out = generate_text(demo_prompt, temp=temp, top_p=top, max_new=30)
        print(f"\nTemp={temp:.1f}, Top‑p={top:.2f} →\n{out}\n")



In [None]:
# 2️⃣ Interactive slider to play with temperature & top‑p
# ----------------------------------------------------------
# Requires ipywidgets.  If you haven’t installed it, run:
#   %pip install "ipywidgets>=8.0.0"   (quotes stop the shell treating >= as a redirect)

import ipywidgets as widgets
from IPython.display import display

@widgets.interact
def interactive_demo(
    prompt: widgets.Text(value="Write a short story about a robot", description="Prompt:", continuous_update=False),
    temperature: widgets.FloatSlider(value=0.7, min=0.1, max=1.5, step=0.1, description="Temperature:", continuous_update=False),
    top_p: widgets.FloatSlider(value=0.9, min=0.5, max=1.0, step=0.05, description="Top‑p:", continuous_update=False),
    max_new: widgets.IntSlider(value=50, min=10, max=200, step=10, description="Max tokens:", continuous_update=False),
):
    """Generate and print text for the current widget values.

    Each control is created with ``continuous_update=False`` so a new generation
    starts only when a slider is released or Enter is pressed in the prompt box.
    With the default continuous updates, every keystroke and every intermediate
    slider position would trigger a full ``model.generate`` call.

    Parameters
    ----------
    prompt : str
        Text typed into the prompt box.
    temperature : float
        Value of the temperature slider, forwarded as ``temp``.
    top_p : float
        Value of the top‑p slider.
    max_new : int
        Value of the max-tokens slider.
    """
    out = generate_text(prompt, temp=temperature, top_p=top_p, max_new=max_new)
    print("\n--- Generated Text ---\n")
    print(out)



# Step 6: Use ipywidgets for Interactive Demo

In the previous steps we learned how to load GPT‑OSS and generate text with fixed hyper‑parameters.  Now we’ll turn the notebook into a *live playground* where you can tweak the prompt, temperature, and top‑p on the fly and see the model’s response instantly.

## Why interactive demos matter
Think of a recipe book that lets you change the amount of salt or the cooking time and immediately taste the result.  An interactive widget does the same for language models: you can experiment with different settings without editing code or re‑running long cells.  This is especially useful when you want to *explore* the model’s behavior or when you’re teaching others how the knobs affect output.

## Key terms (and why they matter)
| Term | What it is | Why you care |
|------|------------|--------------|
| **ipywidgets** | A Python library that turns objects into UI elements (sliders, text boxes, buttons). | Lets you build interactive controls inside a Jupyter notebook. |
| **Interact** | A decorator that automatically creates widgets from function arguments. | Simplifies the wiring between UI and code – no manual widget layout needed. |
| **Slider** | A UI element that lets you pick a numeric value within a range. | Controls continuous parameters like temperature or top‑p. |
| **Text** | A widget that accepts free‑form text input. | Lets you change the prompt without editing code. |
| **Output area** | The region where the generated text appears. | Provides immediate visual feedback. |

## How the demo works
The interactive demo is built on top of the `generate_text` helper we defined in Step 5.  That helper takes a prompt, temperature, top‑p, and maximum token count, then returns the decoded string.  The `ipywidgets.interact` decorator automatically creates a UI for each argument: a text box for the prompt, sliders for temperature and top‑p, and a slider for the maximum number of tokens.  When you move a slider or type a new prompt, the function runs again and prints the new output.  This real‑time feedback loop is invaluable for understanding how each hyper‑parameter shapes the model’s creativity and coherence.

**Trade‑offs to keep in mind**
- **Speed vs. Interactivity**: Generating 200 tokens can take a few seconds, especially on a single GPU.  The UI will freeze until the generation finishes, so keep `max_new_tokens` moderate for a smooth experience.
- **Memory vs. Precision**: Using `torch.float16` (FP16) halves VRAM usage and doubles throughput on modern GPUs, but a tiny loss in numerical precision can slightly affect the output.  For demos, FP16 is usually the sweet spot.
- **Determinism vs. Exploration**: Setting `temperature=0.0` and `do_sample=False` makes the output repeatable, which is great for debugging.  Raising temperature or enabling sampling introduces variability, which is useful for creative writing but harder to test.

Let’s put it all together and create a live demo.



In [None]:
# 1️⃣ Helper to generate text (re‑used from Step 5)
# ----------------------------------------------------------
# Assumes `model` and `tokenizer` are already loaded.
import torch, random, numpy as np

# Pin the Python, NumPy and PyTorch random number generators to one shared seed.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)


def generate_text(prompt, temp=0.7, top_p=0.9, max_new=50):
    """Return a generated string for *prompt*.

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Parameters
    ----------
    prompt : str
        The seed text.
    temp : float
        Temperature scaling. ``None`` or a value ``<= 0`` switches to greedy
        (deterministic) decoding instead of sampling, because sampling needs a
        strictly positive temperature.
    top_p : float
        Nucleus sampling threshold. Ignored when decoding greedily.
    max_new : int
        Max tokens to generate.

    Returns
    -------
    str
        The decoded first returned sequence, with special tokens removed.
    """
    # tokenizer(...) (rather than tokenizer.encode) also yields the attention mask,
    # which `generate` cannot reliably infer when pad_token_id equals the EOS id.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    gen_kwargs = {
        "max_new_tokens": max_new,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temp is not None and temp > 0:
        # Cast to float so integer inputs such as ``temp=1`` are accepted.
        gen_kwargs.update(do_sample=True, temperature=float(temp), top_p=float(top_p))
    else:
        # Non-positive temperature: fall back to greedy decoding rather than failing.
        gen_kwargs["do_sample"] = False

    gen_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded.get("attention_mask"),
        **gen_kwargs,
    )
    return tokenizer.decode(gen_ids[0], skip_special_tokens=True)



In [None]:
# 2️⃣ Interactive demo with ipywidgets
# ----------------------------------------------------------
# Requires ipywidgets.  If you haven’t installed it, run:
#   %pip install "ipywidgets>=8.0.0"   (quotes stop the shell treating >= as a redirect)

import ipywidgets as widgets
from IPython.display import display

@widgets.interact
def interactive_demo(
    prompt: widgets.Text(value="Write a short story about a robot", description="Prompt:", continuous_update=False),
    temperature: widgets.FloatSlider(value=0.7, min=0.1, max=1.5, step=0.1, description="Temperature:", continuous_update=False),
    top_p: widgets.FloatSlider(value=0.9, min=0.5, max=1.0, step=0.05, description="Top‑p:", continuous_update=False),
    max_new: widgets.IntSlider(value=50, min=10, max=200, step=10, description="Max tokens:", continuous_update=False),
):
    """Generate and print text for the current widget values.

    Each control is created with ``continuous_update=False`` so a new generation
    starts only when a slider is released or Enter is pressed in the prompt box.
    With the default continuous updates, every keystroke and every intermediate
    slider position would trigger a full ``model.generate`` call.

    Parameters
    ----------
    prompt : str
        Text typed into the prompt box.
    temperature : float
        Value of the temperature slider, forwarded as ``temp``.
    top_p : float
        Value of the top‑p slider.
    max_new : int
        Value of the max-tokens slider.
    """
    out = generate_text(prompt, temp=temperature, top_p=top_p, max_new=max_new)
    print("\n--- Generated Text ---\n")
    print(out)



## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    """Display a multiple-choice question with a Grade button and a feedback area.

    Parameters
    ----------
    question : str
        Question text; rendered as a level-3 Markdown heading.
    options : list of str
        Answer choices. They are labelled A, B, C, ... in the order given.
    correct_index : int
        Zero-based position in ``options`` of the correct answer.
    explanation : str
        Text shown after grading. It is inserted into the feedback HTML as-is.

    Raises
    ------
    ValueError
        If ``correct_index`` does not refer to one of ``options``.
    """
    if not 0 <= correct_index < len(options):
        raise ValueError(
            f'correct_index {correct_index} is out of range for {len(options)} options'
        )

    # Use (label, value) so rb.value is the numeric index
    labelled = [(f'{chr(65+i)}. '+opt, i) for i, opt in enumerate(options)]
    # value=None starts with nothing selected. By default RadioButtons pre-selects the
    # first option, which would make the "please select" branch below unreachable
    # and nudge the reader toward answer A.
    rb = widgets.RadioButtons(options=labelled, value=None, description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')

    def on_grade(_):
        """Compare the current selection with the answer and fill in the feedback."""
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'

    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
# Knowledge check: which generation parameter drives output diversity.
render_mcq(
    question="Which parameter controls the diversity of the output?",
    options=["Batch size", "Temperature", "Learning rate", "Optimizer"],
    correct_index=1,
    explanation="Temperature is a knob that adjusts how random the model’s predictions are; a higher temperature makes the output more diverse.",
)


In [None]:
# Knowledge check: what the GPT_OSS_MODEL_HOME variable points to.
render_mcq(
    question="What does the variable GPT_OSS_MODEL_HOME specify?",
    options=[
        "The path to the Jupyter installation",
        "Where the GPT-OSS checkpoints are stored",
        "The name of the model architecture",
        "The default prompt text",
    ],
    correct_index=1,
    explanation="GPT_OSS_MODEL_HOME tells the library where to look for the pre‑downloaded model weights.",
)


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
