In [ ]:
# Environment Detection
# Google Colab registers the `google.colab` package in `sys.modules` when the
# kernel starts, so its presence distinguishes Colab from a local kernel.
import sys

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print('Environment: Colab')
else:
    print('Environment: Local')


In [None]:
# 🔧 Environment Detection and Setup
# Detects whether the kernel is a Google Colab runtime and, if so, enables
# Colab's custom widget manager so ipywidgets render in later cells.
import sys
import os

# Detect environment: Colab pre-loads the `google.colab` package.
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception as exc:
        # Best-effort: widgets are optional, so keep going, but say what went
        # wrong instead of hiding it (interactive cells may not render).
        print(f'⚠️ Could not enable the Colab custom widget manager: {exc}')


## API Keys and .env Files

Many providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.

- Why .env? Keeps secrets out of source control and tutorials.
- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.
- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).
- Find your keys:
  - Poe-compatible providers: see your provider's dashboard for an API key.
  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).
  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).

The next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env
# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).
# Location: place your .env files next to this notebook (recommended) or at project root.
# Disable writing: set SAVE_TO_ENV = False below.
import os, pathlib, re
from getpass import getpass

# Install python-dotenv if missing
try:
    import dotenv  # type: ignore
except ImportError:
    import sys, subprocess
    # The requirement is passed as one argv element (no shell involved), so the
    # ">=" in the version specifier cannot be mistaken for an output redirection.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
    import dotenv  # type: ignore

# Prefer .env.local over .env
cwd = pathlib.Path.cwd()
env_local = cwd / '.env.local'
env_file = cwd / '.env'
# load_dotenv() does not overwrite variables that are already set, so loading
# .env.local first makes its values win over .env for keys present in both.
found = [p for p in (env_local, env_file) if p.exists()]
chosen = found[0] if found else None
if found:
    for env_path in found:
        dotenv.load_dotenv(dotenv_path=str(env_path))
        print(f'Loaded env from {env_path.name}')
else:
    print('No .env.local or .env found; will prompt for keys.')

# Keys we might use in this notebook
keys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']
missing = [k for k in keys if not os.environ.get(k)]
for k in missing:
    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')
    if val:
        os.environ[k] = val


def _quote_env_value(value):
    """Return *value* double-quoted, escaping backslashes and double quotes."""
    return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'


def _unquote_env_value(raw):
    """Undo the quoting applied by `_quote_env_value` (single quotes are just stripped).

    Without this, a value read back from .env.local would keep its quotes and be
    quoted again on every run.
    """
    if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in ('"', "'"):
        inner = raw[1:-1]
        if raw[0] == '"':
            inner = re.sub(r'\\(["\\])', r'\1', inner)
        return inner
    return raw


# Decide whether to persist to .env.local for convenience
SAVE_TO_ENV = True  # set False to disable writing
if SAVE_TO_ENV:
    target = env_local
    existing = {}
    readable = True
    if target.exists():
        try:
            for line in target.read_text(encoding='utf-8').splitlines():
                stripped = line.strip()
                if not stripped or stripped.startswith('#') or '=' not in stripped:
                    continue
                k, v = stripped.split('=', 1)
                existing[k.strip()] = _unquote_env_value(v.strip())
        except (OSError, UnicodeDecodeError) as exc:
            # Do not overwrite a file we could not read: that would silently
            # drop every entry it currently holds.
            readable = False
            print(f'⚠️ Could not read {target.name}; leaving it untouched: {exc}')
    for k in keys:
        v = os.environ.get(k)
        if v:
            existing[k] = v
    if readable and existing:
        # Always quote; escape backslashes and double quotes for safety
        lines = [f'{name}={_quote_env_value(value)}' for name, value in existing.items()]
        # Create the file with owner-only permissions from the start, so the
        # secrets are never briefly readable under the default umask.
        fd = os.open(target, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w', encoding='utf-8') as fh:
            fh.write('\n'.join(lines) + '\n')
        try:
            # Also tighten a pre-existing file that had wider permissions.
            target.chmod(0o600)  # 600
            print(f'🔏 Wrote secrets to {target.name} (permissions 600)')
        except OSError as exc:
            print(f'🔏 Wrote secrets to {target.name} (⚠️ could not set permissions 600: {exc})')
    elif readable:
        print(f'No keys to save; {target.name} not written.')


# Simple recap (masked)
def mask(v):
    """Return a display-safe form of a secret: '∅' if unset, otherwise a short masked preview."""
    if not v: return '∅'
    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'


for k in keys:
    print(f'{k}:', mask(os.environ.get(k)))

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
# On success a global `client` is defined; on failure a warning is printed and
# `client` is left undefined.
import os

POE_BASE_URL = 'https://api.poe.com/v1'

try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        base_url = os.environ.setdefault('OPENAI_BASE_URL', POE_BASE_URL)
        # When requests go to Poe the key must be the Poe key, even if an
        # unrelated OPENAI_API_KEY is also present. A user-supplied base URL
        # (e.g. a local server) keeps whatever OPENAI_API_KEY was configured.
        if base_url == POE_BASE_URL or not os.environ.get('OPENAI_API_KEY'):
            os.environ['OPENAI_API_KEY'] = poe
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        entered = getpass('Enter POE_API_KEY (input hidden): ')
        if not entered:
            raise ValueError('no API key provided')
        os.environ['OPENAI_API_KEY'] = entered
        os.environ.setdefault('OPENAI_BASE_URL', POE_BASE_URL)
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except ImportError:
        import sys, subprocess
        # argv-style invocation (no shell), so ">=" is part of the requirement
        # and not an output redirection; works the same locally and on Colab.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'openai>=1.34.0'])
        from openai import OpenAI  # type: ignore
    # Create client. base_url may be None (only OPENAI_API_KEY configured), in
    # which case the client falls back to its default endpoint.
    client = OpenAI(base_url=os.environ.get('OPENAI_BASE_URL'), api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', os.environ.get('OPENAI_BASE_URL') or 'default OpenAI endpoint')
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
# Sends one "ping" chat message capped at a single output token to confirm the
# provider client works. Skipped when no `client` global exists.
import os

model = os.getenv('ALAIN_MODEL') or 'gpt-4o-mini'
if 'client' in globals():
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as err:
        print('⚠️ Smoke test failed:', err)
else:
    print('⚠️ Provider client not available; skipping smoke test')


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Getting Started with GPT‑OSS‑20B: A Beginner’s Guide

This lesson introduces the GPT‑OSS‑20B language model to absolute beginners. We’ll walk through setting up a Jupyter environment, loading the model, running simple prompts, and troubleshooting common issues—all explained with everyday analogies and minimal jargon.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T02:44:34.608Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Explain what GPT‑OSS‑20B is and how it differs from smaller models.
2. Show how to install and configure the required libraries, including ipywidgets.
3. Demonstrate how to load the model and generate text in a Jupyter notebook.
4. Identify common pitfalls and best practices for working with large language models.


## Prerequisites

- Basic familiarity with Python syntax (variables, functions).
- Access to a Jupyter notebook environment (e.g., Anaconda, Google Colab).


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
# Runs pip through the kernel's own interpreter so the packages land in the
# environment this notebook is actually using, both locally and on Colab.
import subprocess
import sys

# Check if we're in Colab
IN_COLAB = 'google.colab' in sys.modules

requirements = ["ipywidgets>=8.0.0", "transformers>=4.30.0", "torch>=2.0.0"]

# Each requirement is its own argv element and no shell is involved. Written as
# `!pip install pkg>=1.0`, the shell treats ">=1.0" as an output redirection:
# the version constraint is dropped and a stray file named "=1.0" is created.
quiet_flag = ["-q"] if IN_COLAB else []
cmd = [sys.executable, "-m", "pip", "install"] + quiet_flag + requirements
subprocess.check_call(cmd)  # raises CalledProcessError if pip fails

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
# Self-contained: does not rely on IN_COLAB (or anything else) from earlier cells.
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except ImportError:
    import importlib, sys, subprocess
    # argv-style call (no shell), so ">=" stays part of the requirement.
    cmd = [sys.executable, "-m", "pip", "install", '-q', 'ipywidgets>=8.0.0']
    subprocess.check_call(cmd)  # raises CalledProcessError if pip fails
    # Make the freshly installed package visible to this running kernel and
    # confirm that it really imports.
    importlib.invalidate_caches()
    import ipywidgets  # type: ignore
    print('ipywidgets available')


## Step 1: Introduction and Setup

Welcome to the first step of your journey with GPT‑OSS‑20B! Think of GPT‑OSS‑20B as a gigantic library of stories that has read billions of books, articles, and webpages. Just like a librarian who can pull out the right book in a flash, this model can generate text that feels natural and relevant.

In this section we’ll:

1. **Install the software** you need to talk to the model.
2. **Set up a reproducible environment** so that your results can be shared and re‑run.
3. **Verify that everything is working** before we dive into the model itself.

> **Why do we need all these steps?**
> 
> Large language models are heavy‑weight beasts. They require the right versions of PyTorch, the Transformers library, and a few helper tools like ipywidgets. Installing the wrong version can lead to cryptic errors that are hard to debug. By following a clear, reproducible setup we avoid those headaches and make sure that anyone who copies your notebook will see the same results.

### Key Terms Explained

- **GPT‑OSS‑20B**: A 20‑billion‑parameter transformer model released by OpenAI and distributed through the Hugging Face Hub. Parameters are like the tiny knobs that let the model remember patterns in language.
- **PyTorch**: The deep‑learning framework that actually runs the math behind the model.
- **Transformers**: A high‑level library that wraps PyTorch to make it easier to load and use models.
- **ipywidgets**: A library that lets you add interactive sliders, buttons, and text boxes to Jupyter notebooks.
- **Reproducibility**: The practice of setting random seeds and using fixed library versions so that results can be exactly replicated.

> **Trade‑offs**: Using the latest library versions gives you new features and bug fixes, but sometimes those updates break compatibility with older code. By specifying minimum versions (e.g., `transformers>=4.30.0`) we strike a balance between stability and access to recent improvements; for exact reproducibility, pin exact versions instead (e.g., `transformers==4.30.0`).

### Quick Checklist

- [ ] Python 3.10+ installed
- [ ] Jupyter Notebook or JupyterLab running
- [ ] Internet connection (to download packages and the model)

Let’s get started!



In [None]:
# Install required packages with error handling
# This cell will run the pip install commands only if the packages are missing.
# It also prints a friendly message so you know what’s happening.
# Note: the check only tests that each package imports; it does not verify
# that the installed version satisfies the ">=" constraint.

import subprocess, sys

packages = [
    "ipywidgets>=8.0.0",
    "transformers>=4.30.0",
    "torch>=2.0.0"
]

for pkg in packages:
    # For these three requirements the import name equals the distribution name.
    module_name = pkg.split('>=')[0]
    try:
        __import__(module_name)
        print(f"{pkg} already installed")
    except ImportError:
        print(f"Installing {pkg}…")
        # argv-style call (no shell); a pip failure raises CalledProcessError
        # instead of being retried through a second, identical install path.
        cmd = [sys.executable, "-m", "pip", "install", pkg]
        subprocess.check_call(cmd)

# Enable ipywidgets in the notebook environment
try:
    import ipywidgets
    print("ipywidgets is ready!")
except Exception as e:
    print("Error importing ipywidgets:", e)



## Setting Up Your Environment Variable

GPT‑OSS‑20B is hosted on Hugging Face’s model hub. To access it, you’ll need an API key. Store it in a file called `.env` in the same folder as your notebook, or set it directly in a notebook cell:

```python
import os
os.environ["HF_TOKEN"] = "YOUR_HUGGING_FACE_API_KEY"
```

> **Tip**: Keep your API key secret! Don’t commit the `.env` file to version control.



In [None]:
# Set a random seed for reproducibility
# Seeds every random number generator this notebook touches (Python, NumPy,
# PyTorch CPU and, when present, all CUDA devices) with the same value.

import random
import numpy as np
import torch

SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.manual_seed_all(SEED)
del cuda_present  # temporary only; keep the notebook namespace unchanged

print(f"Random seed set to {SEED}")



## Step 2: What is GPT‑OSS‑20B?

GPT‑OSS‑20B is a transformer language model with roughly 20 billion parameters — the adjustable values the model learned during training. Compared with smaller models, that extra capacity generally lets it capture more patterns in language, at the cost of more memory, more compute, and longer load times.

> **Note:** The detailed content for this section was not generated correctly (only a fragment of the generator's planning notes was emitted), so this short summary stands in for it.


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    """Return the greeting ``Hello, <name>!`` for the given name."""
    greeting = 'Hello, {}!'.format(name)
    return greeting

print(greet())


## Step 3: Preparing Your Notebook

### Why a “Notebook Prep” step?

Think of a Jupyter notebook as a kitchen where you’ll cook up a delicious AI recipe. Before you start chopping ingredients, you need to make sure the stove is on, the pans are clean, and the spices are measured. In the same way, before we ask GPT‑OSS‑20B to generate text, we need to make sure the notebook environment is ready: the right libraries are loaded, the GPU is available, and the random seed is set so that the same recipe yields the same dish every time.

### What will we do in this step?

1. **Verify GPU availability** – Large models like GPT‑OSS‑20B are heavy; they run best on a GPU.
2. **Set up a deterministic environment** – We’ll lock the random seed for NumPy, Python’s `random`, and PyTorch.
3. **Create a small helper function** – A reusable wrapper that prints the device and memory usage so you can keep an eye on resource consumption.
4. **Add a quick sanity‑check** – A tiny test that confirms the environment can run a minimal inference without errors.

> **Analogy**: Imagine you’re a chef who wants to replicate a signature dish exactly. You’d keep a notebook of the exact oven temperature, the exact amount of salt, and the exact timing. That’s what we’re doing here, but for a neural network.

### Key Terms Explained

- **GPU (Graphics Processing Unit)**: A specialized piece of hardware that can perform many calculations in parallel, making it ideal for training and inference of deep learning models.
- **CUDA**: NVIDIA’s parallel computing platform that allows Python code to run on the GPU.
- **Random Seed**: A starting value for pseudo‑random number generators. Setting the same seed ensures that operations that involve randomness (like dropout or token sampling) produce identical results across runs.
- **Determinism**: The property that a program will produce the same output given the same input and environment. In deep learning, full determinism is hard to achieve due to non‑deterministic GPU operations, but setting seeds reduces variance.
- **Memory Usage**: The amount of GPU RAM consumed by the model and its intermediate tensors. Monitoring this helps avoid out‑of‑memory (OOM) crashes.

> **Trade‑offs**: Enabling full determinism (e.g., by disabling certain CUDA optimizations) can slow down inference. For most educational purposes, a fixed seed with the default CUDA settings offers a good balance between reproducibility and speed.

### Quick Checklist

- [ ] GPU is available and CUDA is working.
- [ ] Random seed is set for all libraries.
- [ ] `torch.cuda.memory_allocated()` can be queried.
- [ ] A minimal inference test runs without errors.

Let’s put this into code.



In [None]:
# ------------------------------------------------------------
#  Notebook preparation utilities
# ------------------------------------------------------------
import os
import random
import numpy as np
import torch

# 1️⃣ Set a global random seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
print(f"✅ Random seed set to {SEED}")

# 2️⃣ Check GPU availability and basic CUDA info
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"✅ GPU detected: {torch.cuda.get_device_name(device)}")
    print(f"   Total GPU memory: {torch.cuda.get_device_properties(device).total_memory / (1024**3):.2f} GB")
else:
    device = torch.device("cpu")
    print("⚠️  No GPU found – falling back to CPU (may be slow).")

# 3️⃣ Helper to report current memory usage

def report_memory(label: str = "Current") -> None:
    """Print GPU memory currently allocated and reserved ("cached") on `device`.

    Parameters
    ----------
    label: str
        Prefix for the printed line, e.g. "After sanity check".

    In CPU mode there is nothing to query, so a short notice is printed instead.
    """
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(device) / (1024**3)
        cached = torch.cuda.memory_reserved(device) / (1024**3)
        print(f"{label} GPU memory – allocated: {allocated:.2f} GB, cached: {cached:.2f} GB")
    else:
        print(f"{label} – CPU mode, memory reporting not available.")

# 4️⃣ Sanity‑check: run a tiny inference on a dummy token
# Load a very small tokenizer/model for the check (does not use GPT‑OSS‑20B)
# This keeps the check lightweight and fast.
# The checkpoint must be a *causal* language model: AutoModelForCausalLM cannot
# load an encoder-only checkpoint such as distilbert-base-uncased.
SANITY_MODEL_ID = "distilgpt2"
try:
    # Imported inside the try so a missing or broken transformers install is
    # reported by the sanity check instead of aborting the whole cell.
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # "dummy_" names keep these objects from being mistaken for (or shadowing)
    # the real `tokenizer` / `model` that later cells create.
    dummy_tokenizer = AutoTokenizer.from_pretrained(SANITY_MODEL_ID)
    dummy_input = dummy_tokenizer("Hello world", return_tensors="pt").to(device)
    dummy_model = AutoModelForCausalLM.from_pretrained(SANITY_MODEL_ID).to(device)
    with torch.no_grad():
        _ = dummy_model(**dummy_input)
    print("✅ Sanity check passed – dummy inference succeeded.")
except Exception as e:
    print(f"❌ Sanity check failed: {e}")

# Report memory after sanity check
report_memory("After sanity check")



## Step 4: Loading the Model

### Why do we need a separate loading step?

Think of GPT‑OSS‑20B as a gigantic library that lives in a remote data center. The *loading* process is like pulling the entire library onto a local shelf so you can browse it quickly. If you try to read a book that’s still in the cloud, you’ll have to wait for each page to download, which is slow and wasteful. By loading the model once into memory, you pay the cost of the download only once and then you can generate text instantly.

### What will happen in this cell?

1. **Pull the tokenizer** – the piece that turns your text into numbers the model understands.
2. **Pull the model weights** – the 20‑billion‑parameter neural network that actually does the heavy lifting.
3. **Move everything to the best device** – GPU if available, otherwise CPU.
4. **Set a deterministic seed** – so that the same prompt always gives the same first token.
5. **Run a tiny sanity‑check** – generate a single token to confirm everything is wired up.

> **Analogy**: Loading the model is like installing a huge software package on your computer. Once it’s installed, you can launch it instantly. If you try to run it from the internet every time, you’d be waiting for the download to finish each time.

### Key Terms Explained

- **Tokenizer**: A tool that converts human‑readable text into a sequence of integer IDs that the model can process. Think of it as a translator that maps words to numbers.
- **Model**: The neural network itself, consisting of layers of weights that have learned language patterns. For GPT‑OSS‑20B, it has 20 billion such weights.
- **Device**: The hardware (CPU or GPU) where tensors live. GPUs can perform many operations in parallel, making inference faster.
- **Context length**: The maximum number of tokens the model can look at at once. GPT‑OSS‑20B supports up to 4 096 tokens.
- **Determinism**: Setting a random seed ensures that operations involving randomness (like dropout or sampling) produce the same results each run.

> **Trade‑offs**: Loading the full 20B model requires a lot of GPU memory (≈30 GB). If your GPU is smaller, you’ll hit an out‑of‑memory error. In that case, you can either use a smaller model (e.g., GPT‑OSS‑3B) or run on CPU, which is slower but still works. The choice depends on your hardware and the speed you need.

### Quick Checklist

- [ ] `HF_TOKEN` is set in the environment.
- [ ] `transformers`, `torch`, and `ipywidgets` are installed.
- [ ] GPU is available (if you want GPU inference).
- [ ] Random seed is set.

Let’s load the model now.



In [None]:
# ------------------------------------------------------------
#  Load GPT‑OSS‑20B tokenizer and model
# ------------------------------------------------------------
import os
import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub repository of the model. GPT‑OSS‑20B is published under the
# `openai` organisation; "huggingface/gpt-oss-20b" does not exist.
MODEL_ID = "openai/gpt-oss-20b"

# 1️⃣ Ensure reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣ Determine device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {DEVICE}")

# 3️⃣ Load tokenizer (fast tokenizers are optional but faster)
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        use_fast=True,
        token=os.getenv("HF_TOKEN")  # None is fine for public repositories
    )
    print("✅ Tokenizer loaded")
except Exception as e:
    print(f"❌ Failed to load tokenizer: {e}")
    raise

# 4️⃣ Load model weights – this can take a few minutes
# NOTE(review): the model card reportedly recommends torch_dtype="auto" with
# device_map="auto"; the original dtype/placement is kept here — confirm
# against the model card before changing it.
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16 if DEVICE.type == "cuda" else torch.float32,
        low_cpu_mem_usage=True,
        token=os.getenv("HF_TOKEN")
    ).to(DEVICE)
    print("✅ Model loaded and moved to device")
except RuntimeError as e:
    # CUDA out-of-memory errors are RuntimeErrors; other load failures
    # (network, missing repository, ...) propagate unchanged.
    print(f"❌ RuntimeError during model load: {e}")
    print("⚠️  If this is an out-of-memory error, try a smaller model or hardware with more memory.")
    raise

# 5️⃣ Quick sanity‑check: generate a single token
prompt = "Once upon a time"
encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)
input_ids = encoded.input_ids
with torch.no_grad():
    # Pass the whole encoding so generate() also receives the attention mask.
    output_ids = model.generate(**encoded, max_new_tokens=1)
# generate() returns prompt + continuation; keep only the newly generated part
# so the message below really shows the generated token.
new_token_ids = output_ids[0, input_ids.shape[-1]:]
print("✅ Sanity check passed – generated token:", tokenizer.decode(new_token_ids))



### What to do next?

You now have the tokenizer and model in memory, ready to generate text. In the next step we’ll feed a prompt and let the model produce a continuation. If you hit an out‑of‑memory error, try the following:

- Use `torch_dtype=torch.float16` (already set for GPU) or `torch_dtype=torch.bfloat16` if your GPU supports it.
- Load the model with `low_cpu_mem_usage=True` (already set) to stream weights from disk.
- Switch to a smaller model like `gpt-oss-3b`.

Feel free to experiment with the `max_new_tokens` parameter in the `generate` call to produce longer or shorter outputs.



## Step 5: Generating Text

### Why generate text?

Imagine you’re a chef who has just finished preparing a huge batch of dough. The dough is ready, but you still need to decide what shape to bake it into. Generating text is the same: we give the model a *prompt* (the dough) and ask it to *continue* (shape the dough into a story, answer, or code snippet). The model’s job is to predict the next token (word or sub‑word) one step at a time until we stop it.

### What will happen in this cell?

1. **Wrap the generation logic in a reusable function** – so you can experiment with different prompts and settings without rewriting code.
2. **Show a few key generation parameters** – `max_new_tokens`, `temperature`, `top_p`, and `do_sample`.
3. **Run a quick demo** – generate a short continuation of a user‑supplied prompt.
4. **Explain how each parameter affects the output** – using everyday analogies.

> **Analogy**: Think of `temperature` as the *spice level* in a recipe. A low temperature (close to 0) gives you a very predictable, bland dish. A high temperature (close to 1) adds excitement but can also make the dish a bit chaotic.

### Key Terms Explained

- **Token**: The smallest unit the model understands. Tokens can be whole words, parts of words, or punctuation. Think of them as the *letters* that build words.
- **Prompt**: The text you give to the model to start the generation. It’s like the first sentence of a story.
- **Generation Parameters**:
  - `max_new_tokens`: How many new tokens the model should produce. It’s the *length* of the dish.
  - `temperature`: Controls randomness. Lower values make the model more deterministic; higher values make it more creative.
  - `top_p` (nucleus sampling): Keeps the model’s choices within the most probable `p` fraction of the distribution. It’s a way to prune unlikely words.
  - `do_sample`: If `True`, the model samples from the probability distribution; if `False`, it picks the most likely token (greedy decoding).
- **Determinism vs Creativity**: Setting a seed and using low temperature gives reproducible, safe outputs. Raising temperature or enabling sampling introduces variability, which can be useful for creative tasks but may produce nonsensical results.
- **Trade‑offs**:
  - **Speed vs Quality**: Sampling (`do_sample=True`) requires extra computation because the model must evaluate probabilities for many tokens. Greedy decoding is faster but can get stuck in repetitive loops.
  - **Memory vs Flexibility**: Generating many tokens (`max_new_tokens` large) consumes more GPU memory because intermediate tensors must be stored. For very long outputs, consider streaming or chunking.

### Quick Checklist

- [ ] `model` and `tokenizer` are loaded and on the correct device.
- [ ] Random seed is set for reproducibility.
- [ ] You understand how `temperature`, `top_p`, and `max_new_tokens` influence output.
- [ ] You’re ready to experiment with different prompts.

Let’s dive into the code.



In [None]:
# ------------------------------------------------------------
#  Text generation helper
# ------------------------------------------------------------
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1️⃣ Ensure reproducibility (seed already set in previous steps)
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 2️⃣ Helper function for generation

def generate_text(
    prompt: str,
    max_new_tokens: int = 50,
    temperature: float = 0.7,
    top_p: float = 0.9,
    do_sample: bool = True,
    device: "torch.device | None" = None,
) -> str:
    """Generate a continuation for *prompt* using the global *model* and *tokenizer*.

    Parameters
    ----------
    prompt: str
        The starting text.
    max_new_tokens: int
        Upper bound on the number of tokens generated after the prompt.
    temperature: float
        Sampling temperature. Must be strictly positive when sampling: values
        below 1 sharpen the distribution (more predictable text), 1 leaves it
        unchanged, values above 1 flatten it (more random text). A value
        <= 0 is treated as a request for greedy decoding.
    top_p: float
        Nucleus sampling threshold. Only used when sampling.
    do_sample: bool
        If False, use greedy decoding; *temperature* and *top_p* are ignored.
    device: torch.device, optional
        Device the prompt tensors are moved to. When omitted, the device
        reported by ``model.device`` is used so the inputs land where the
        model lives; if the model exposes no such attribute, CUDA is used
        when available and the CPU otherwise.

    Returns
    -------
    str
        The decoded first output sequence with special tokens stripped. For
        causal language models this is the prompt followed by the new text.
    """
    # Resolve the target device at call time rather than at definition time.
    if device is None:
        device = getattr(model, "device", None)
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize prompt
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)

    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }

    # Pass the attention mask explicitly: with pad_token_id == eos_token_id the
    # model cannot reliably infer which positions are padding on its own.
    attention_mask = encoded.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    # A non-positive temperature is invalid for sampling, so fall back to
    # greedy decoding; sampling-only knobs are forwarded only when they apply.
    use_sampling = bool(do_sample) and temperature > 0
    generate_kwargs["do_sample"] = use_sampling
    if use_sampling:
        generate_kwargs["temperature"] = temperature
        generate_kwargs["top_p"] = top_p

    # Generate
    with torch.no_grad():
        output_ids = model.generate(input_ids, **generate_kwargs)

    # Decode and return
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# 3️⃣ Demo: continue a one-sentence story opener
prompt_text = "The ancient library held a secret that no one had ever discovered."
# Emit both section headers (and the prompt itself) before generation starts,
# so the prompt is already visible while the model is still working.
print("\n--- Prompt ---", prompt_text, "\n--- Generated Text ---", sep="\n")
print(generate_text(prompt_text, max_new_tokens=80, temperature=0.8, top_p=0.95))



## Step 6: Interactive Prompting with ipywidgets

In the previous steps we saw how to feed a prompt to GPT‑OSS‑20B and get a response back. That was a *static* interaction: you typed the prompt in a code cell, ran it, and saw the output. Interactive prompting turns that into a *live* chat‑like experience, just like a text‑based adventure game where you type a command and the game responds immediately.

### Why use ipywidgets?

Think of ipywidgets as a set of building blocks that let you add buttons, sliders, and text boxes to a notebook, just like you would add knobs and switches to a control panel. When you click a button or type something, the widget can *observe* the change and run a piece of code automatically. This is handy for:

- Quickly testing different prompts without editing code.
- Demonstrating how changing temperature or max tokens affects the output.
- Building a simple chatbot that users can play with.

### Key Terms Explained

- **Widget**: A UI element (e.g., Text, Button, Output) that lives inside a Jupyter notebook.
- **Observer**: A callback function that runs whenever a widget’s value changes.
- **Output**: A special widget that captures and displays printed text or plots.
- **Event loop**: The mechanism that keeps the notebook responsive while widgets wait for user actions.
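- **Reproducibility**: Setting a seed fixes the random number generator’s starting state, so re‑running the notebook from the top reproduces the same sampled outputs. Each generation advances that state, though, so repeated clicks with sampling enabled will produce different text unless you re‑seed before each run; greedy decoding (sampling off) does not depend on the seed.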

> **Trade‑offs**: Using ipywidgets adds a small amount of overhead because the notebook has to maintain the UI state and handle events. For very large models, each generation can take several seconds, so the UI may feel a bit laggy. However, the benefit of immediate visual feedback far outweighs this minor delay for educational purposes.

### Quick Checklist

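- [ ] `ipywidgets` is installed (`%pip install ipywidgets`) and widgets render in your front end. Colab and current JupyterLab/Notebook releases need no extra step; `widgetsnbextension` is a front‑end extension, not an IPython extension, so `%load_ext` does not apply to it.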
- [ ] The `model` and `tokenizer` are already loaded and on the correct device.
- [ ] Random seed is set for deterministic sampling if desired.
- [ ] You understand how to modify generation parameters via the UI.

Let’s build the interactive prompt now.



In [None]:
# ------------------------------------------------------------
#  Interactive prompt using ipywidgets
# ------------------------------------------------------------
import ipywidgets as widgets
from IPython.display import display, clear_output
import torch

# Assume global `model`, `tokenizer`, and `DEVICE` are already defined
# (from the previous loading step).  We keep the same seed for reproducibility.
# NOTE: the seed is applied once, when this cell runs. It is not re-applied
# inside the button callback, so successive sampled generations will differ.
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Only touch the CUDA generators when a GPU is actually visible.
    torch.cuda.manual_seed_all(SEED)

# 1️⃣ Build the controls

# Free-form prompt entry, stretched across most of the cell width.
prompt_box = widgets.Text(
    description="Prompt:",
    placeholder="Type your prompt here…",
    value="Hello, GPT‑OSS‑20B!",
    layout=widgets.Layout(width='80%'),
)

# Generation length: 10–200 new tokens in steps of 10, starting at 50.
max_tokens_slider = widgets.IntSlider(
    description='Max tokens:', min=10, max=200, step=10, value=50,
)

# Sampling temperature: 0.0–1.0 in steps of 0.05, starting at 0.7.
temp_slider = widgets.FloatSlider(
    description='Temperature:', min=0.0, max=1.0, step=0.05, value=0.7,
)

# Checked → sample from the distribution; unchecked → greedy decoding.
sample_toggle = widgets.Checkbox(description='Sample (vs greedy)', value=True)

# Trigger button, plus the area that captures whatever the callback prints.
run_button = widgets.Button(button_style='success', description='Generate')
output_area = widgets.Output()

# 2️⃣ Define the generation logic

def on_generate_clicked(_):
    """Button callback: generate text from the current widget values.

    Reads the prompt, token budget, temperature and sampling flag from the
    module-level widgets, runs ``model.generate`` and prints the decoded
    result into ``output_area`` (replacing whatever was shown before).

    Parameters
    ----------
    _ :
        The object passed by ``Button.on_click``; not used.
    """
    # Grey out the button while the model is busy so impatient clicks do not
    # queue up additional (slow) generations.
    run_button.disabled = True
    try:
        with output_area:
            clear_output(wait=True)
            prompt = prompt_box.value
            if not prompt.strip():
                # An empty prompt can tokenize to zero tokens, which the model
                # cannot continue from.
                print("⚠️ Please enter a prompt first.")
                return

            max_new = max_tokens_slider.value
            temp = temp_slider.value
            # The slider allows 0.0, which is not a valid sampling temperature;
            # treat it as a request for greedy decoding instead of erroring.
            use_sampling = bool(sample_toggle.value) and temp > 0

            # Put the inputs where the model lives; fall back to the notebook's
            # DEVICE global if the model does not report a device.
            target_device = getattr(model, "device", None)
            if target_device is None:
                target_device = DEVICE

            # Tokenize
            encoded = tokenizer(prompt, return_tensors="pt")
            input_ids = encoded["input_ids"].to(target_device)

            generate_kwargs = {
                "max_new_tokens": max_new,
                "do_sample": use_sampling,
                "pad_token_id": tokenizer.eos_token_id,
            }
            # Explicit mask: pad and EOS share an id, so it cannot be inferred.
            attention_mask = encoded.get("attention_mask")
            if attention_mask is not None:
                generate_kwargs["attention_mask"] = attention_mask.to(target_device)
            # Temperature only applies when sampling.
            if use_sampling:
                generate_kwargs["temperature"] = temp

            # Generate
            with torch.no_grad():
                gen_ids = model.generate(input_ids, **generate_kwargs)

            text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            print(text)
    finally:
        # Always re-enable the button, even if generation raised.
        run_button.disabled = False

# 3️⃣ Wire the button to its handler
run_button.on_click(on_generate_clicked)

# 4️⃣ Lay out and show the UI: prompt on top, the three generation controls
#    side by side, then the button and the output area underneath.
ui = widgets.VBox((
    prompt_box,
    widgets.HBox((max_tokens_slider, temp_slider, sample_toggle)),
    run_button,
    output_area,
))
display(ui)



## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    """Render a multiple-choice question with a Grade button and a feedback area.

    Parameters
    ----------
    question : str
        Question text; displayed as a level-3 Markdown heading.
    options : iterable of str
        Answer choices, labelled ``A.``, ``B.``, ... in the order given.
    correct_index : int
        Zero-based position in *options* of the correct answer.
    explanation : str
        Text appended to the feedback after grading. It is inserted into the
        feedback HTML as-is (not escaped).

    Raises
    ------
    ValueError
        If *correct_index* does not refer to one of *options*.
    """
    options = list(options)
    if not 0 <= correct_index < len(options):
        raise ValueError(
            f'correct_index {correct_index} is out of range for {len(options)} options'
        )

    # Use (label, value) so rb.value is the numeric index
    choices = [(f'{chr(65+i)}. '+opt, i) for i, opt in enumerate(options)]
    # value=None starts with nothing selected. Without it the first option is
    # pre-selected, so the "please select" branch below could never trigger and
    # an untouched question would be graded as answer A.
    rb = widgets.RadioButtons(options=choices, value=None, description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')

    def on_grade(_):
        """Grade the current selection and update the feedback widget."""
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'

    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
# Knowledge check 1: spot the practice that is NOT recommended (answer: C).
render_mcq(
    question="Which of the following is NOT a recommended practice when working with GPT‑OSS‑20B?",
    options=[
        "Use a small batch size to reduce memory usage.",
        "Always set a random seed for reproducibility.",
        "Avoid using ipywidgets for interactive demos.",
        "Monitor GPU memory usage during inference.",
    ],
    correct_index=2,
    explanation="Using ipywidgets is encouraged for interactive demos; avoiding it limits user experience.",
)


In [None]:
# Knowledge check 2: which library provides notebook widgets (answer: B).
render_mcq(
    question="Which library is essential for building interactive widgets in Jupyter?",
    options=[
        "numpy",
        "ipywidgets",
        "matplotlib",
        "pandas",
    ],
    correct_index=1,
    explanation="ipywidgets provides the tools needed to create interactive UI elements within Jupyter notebooks.",
)


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `%pip install -q` for quiet installation (unlike `!pip`, `%pip` installs into the environment the notebook kernel is running in)

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
