In [ ]:
# Environment Detection
import sys
IN_COLAB = 'google.colab' in sys.modules
print(f'Environment: {"Colab" if IN_COLAB else "Local"}')


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Detect environment
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception:
        pass


## API Keys and .env Files\n\nMany providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.\n\n- Why .env? Keeps secrets out of source control and tutorials.\n- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.\n- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).\n- Find your keys:\n  - Poe-compatible providers: see your provider's dashboard for an API key.\n  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).\n  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).\n\nThe next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env\n# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).\n# Location: place your .env files next to this notebook (recommended) or at project root.\n# Disable writing: set SAVE_TO_ENV = False below.\nimport os, pathlib\nfrom getpass import getpass\n\n# Install python-dotenv if missing\ntry:\n    import dotenv  # type: ignore\nexcept Exception:\n    import sys, subprocess\n    if 'IN_COLAB' in globals() and IN_COLAB:\n        try:\n            import IPython\n            ip = IPython.get_ipython()\n            if ip is not None:\n                ip.run_line_magic('pip', 'install -q python-dotenv>=1.0.0')\n            else:\n                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])\n        except Exception as colab_exc:\n            print('⚠️ Colab pip fallback failed:', colab_exc)\n            raise\n    else:\n        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])\n    import dotenv  # type: ignore\n\n# Prefer .env.local over .env\ncwd = pathlib.Path.cwd()\nenv_local = cwd / '.env.local'\nenv_file = cwd / '.env'\nchosen = env_local if env_local.exists() else (env_file if env_file.exists() else None)\nif chosen:\n    dotenv.load_dotenv(dotenv_path=str(chosen))\n    print(f'Loaded env from {chosen.name}')\nelse:\n    print('No .env.local or .env found; will prompt for keys.')\n\n# Keys we might use in this notebook\nkeys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']\nmissing = [k for k in keys if not os.environ.get(k)]\nfor k in missing:\n    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')\n    if val:\n        os.environ[k] = val\n\n# Decide whether to persist to .env.local for convenience\nSAVE_TO_ENV = True  # set False to disable writing\nif SAVE_TO_ENV:\n    target = env_local\n    existing = {}\n    if target.exists():\n        try:\n            for line in target.read_text().splitlines():\n                if not line.strip() or line.strip().startswith('#') or '=' not in line:\n                    continue\n                k,v = line.split('=',1)\n                existing[k.strip()] = v.strip()\n        except Exception:\n            pass\n    for k in keys:\n        v = os.environ.get(k)\n        if v:\n            existing[k] = v\n    lines = []\n    for k,v in existing.items():\n        # Always quote; escape backslashes and double quotes for safety\n        escaped = v.replace("\\", "\\\\")\n        escaped = escaped.replace("\"", "\\"")\n        vv = f'"{escaped}"'\n        lines.append(f"{k}={vv}")\n    target.write_text('\\n'.join(lines) + '\\n')\n    try:\n        target.chmod(0o600)  # 600\n    except Exception:\n        pass\n    print(f'🔏 Wrote secrets to {target.name} (permissions 600)')\n\n# Simple recap (masked)\ndef mask(v):\n    if not v: return '∅'\n    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'\nfor k in keys:\n    print(f'{k}:', mask(os.environ.get(k)))\n

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
import os
try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        os.environ['OPENAI_API_KEY'] = getpass('Enter POE_API_KEY (input hidden): ')
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        import sys, subprocess
        if 'IN_COLAB' in globals() and IN_COLAB:
            try:
                import IPython
                ip = IPython.get_ipython()
                if ip is not None:
                    ip.run_line_magic('pip', 'install -q openai>=1.34.0')
                else:
                    cmd = [sys.executable, "-m", "pip", "install", '-q', 'openai>=1.34.0']
                    try:
                        subprocess.check_call(cmd)
                    except Exception as exc:
                        if IN_COLAB:
                            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                            if packages:
                                try:
                                    import IPython
                                    ip = IPython.get_ipython()
                                    if ip is not None:
                                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                                    else:
                                        import subprocess as _subprocess
                                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                                except Exception as colab_exc:
                                    print('⚠️ Colab pip fallback failed:', colab_exc)
                                    raise
                            else:
                                print('No packages specified for pip install; skipping fallback')
                        else:
                            raise
            except Exception as colab_exc:
                print('⚠️ Colab pip fallback failed:', colab_exc)
                raise
        else:
            cmd = [sys.executable, "-m", "pip", "install", '-q', 'openai>=1.34.0']
            try:
                subprocess.check_call(cmd)
            except Exception as exc:
                if IN_COLAB:
                    packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                    if packages:
                        try:
                            import IPython
                            ip = IPython.get_ipython()
                            if ip is not None:
                                ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                            else:
                                import subprocess as _subprocess
                                _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                        except Exception as colab_exc:
                            print('⚠️ Colab pip fallback failed:', colab_exc)
                            raise
                    else:
                        print('No packages specified for pip install; skipping fallback')
                else:
                    raise
        from openai import OpenAI  # type: ignore
    # Create client
    from openai import OpenAI
    client = OpenAI(base_url=os.environ['OPENAI_BASE_URL'], api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', os.environ.get('OPENAI_BASE_URL'))
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
import os
model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'
if 'client' not in globals():
    print('⚠️ Provider client not available; skipping smoke test')
else:
    try:
        resp = client.chat.completions.create(model=model, messages=[{"role":"user","content":"ping"}], max_tokens=1)
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        print('⚠️ Smoke test failed:', e)


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Deploying and Fine‑Tuning GPT‑OSS‑20B for Real‑World Applications

This notebook guides practitioners through the end‑to‑end workflow of loading, evaluating, fine‑tuning, and deploying the 20B‑parameter GPT‑OSS model. It balances hands‑on code with applied explanations, covering prompt engineering, performance tuning, safety considerations, and deployment strategies.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T03:53:46.432Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Understand the architecture and tokenization pipeline of GPT‑OSS‑20B.
2. Load the model efficiently using Hugging Face and accelerate inference with GPU.
3. Apply basic fine‑tuning on a domain‑specific dataset and evaluate performance.
4. Deploy the fine‑tuned model as a REST API with FastAPI and monitor latency.


## Prerequisites

- Python 3.10+
- Basic knowledge of PyTorch and Hugging Face Transformers


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
# Check if we're in Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    !pip install -q ipywidgets>=8.0.0 torch>=2.0.0 transformers>=4.40.0 datasets>=2.18.0 accelerate>=0.30.0 fastapi>=0.110.0 uvicorn>=0.29.0
else:
    import subprocess
    cmd = [sys.executable, "-m", "pip", "install"] + ["ipywidgets>=8.0.0","torch>=2.0.0","transformers>=4.40.0","datasets>=2.18.0","accelerate>=0.30.0","fastapi>=0.110.0","uvicorn>=0.29.0"]
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if IN_COLAB:
            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
            if packages:
                try:
                    import IPython
                    ip = IPython.get_ipython()
                    if ip is not None:
                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                    else:
                        import subprocess as _subprocess
                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                except Exception as colab_exc:
                    print('⚠️ Colab pip fallback failed:', colab_exc)
                    raise
            else:
                print('No packages specified for pip install; skipping fallback')
        else:
            raise

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import sys, subprocess
    cmd = [sys.executable, "-m", "pip", "install", '-q', 'ipywidgets>=8.0.0']
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if IN_COLAB:
            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
            if packages:
                try:
                    import IPython
                    ip = IPython.get_ipython()
                    if ip is not None:
                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                    else:
                        import subprocess as _subprocess
                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                except Exception as colab_exc:
                    print('⚠️ Colab pip fallback failed:', colab_exc)
                    raise
            else:
                print('No packages specified for pip install; skipping fallback')
        else:
            raise


## Step 1: Introduction and Environment Setup

Welcome to the first step of our journey with **GPT‑OSS‑20B**, a 20‑billion‑parameter language model that can generate human‑like text, answer questions, and even help you write code. Think of the model as a gigantic library of sentences that it has read during training; when you give it a prompt, it looks up the most likely next words based on that library.

### Why a dedicated environment?
Large models like GPT‑OSS‑20B require a lot of memory and compute. To avoid clashes with other projects and to make sure the GPU is used correctly, we’ll create a clean Python environment and install the exact library versions that the model expects. This is similar to setting up a clean kitchen before cooking a complex recipe: you want all the ingredients in the right amounts and no leftover spices from previous dishes.

### Key terms explained
- **Tokenizer** – A tool that splits text into *tokens* (words or sub‑words). GPT‑OSS‑20B uses a *Byte‑Pair Encoding* tokenizer, which means it breaks words into smaller pieces that the model can understand.
- **CUDA_VISIBLE_DEVICES** – An environment variable that tells PyTorch which GPU(s) to use. Setting it to `0` means we’ll use the first GPU in the system.
- **HF_TOKEN** – Your Hugging Face authentication token. It allows the `transformers` library to download the model weights from the Hugging Face Hub.
- **Seed** – A number that initializes random number generators. Using a fixed seed ensures that experiments are reproducible.

### Trade‑offs
- **Memory vs. Speed**: Loading the full 20B model on a single GPU can exceed memory limits. We’ll use *gradient checkpointing* during fine‑tuning to trade compute for memory, and *FP16* precision during inference to speed up processing.
- **Precision vs. Accuracy**: FP16 reduces memory usage and speeds up inference but may slightly degrade output quality. For most applications, the trade‑off is negligible.

### What you’ll do in this section
1. Install the required packages.
2. Verify that your GPU is visible to PyTorch.
3. Set a random seed for reproducibility.
4. Load the tokenizer to confirm everything works.

Let’s get started!



In [None]:
# ------------------------------------------------------------
# 1️⃣  Install required packages (run once in a fresh environment)
# ------------------------------------------------------------
# Note: In a Jupyter notebook you can use !pip, but here we provide the
# command for clarity. If you are using a conda environment, replace
# pip with conda install.
#
# !pip install -U ipywidgets torch==2.0.0 transformers==4.40.0 datasets==2.18.0 accelerate==0.30.0 fastapi==0.110.0 uvicorn==0.29.0

# ------------------------------------------------------------
# 2️⃣  Import libraries and set a reproducible seed
# ------------------------------------------------------------
import os
import random
import numpy as np
import torch
from transformers import AutoTokenizer

# Set environment variables (adjust if you have multiple GPUs)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# HF_TOKEN should be set in your environment; we just read it here.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Reproducibility: same seed → same random numbers
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ------------------------------------------------------------
# 3️⃣  Verify GPU visibility
# ------------------------------------------------------------
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A")

# ------------------------------------------------------------
# 4️⃣  Load the tokenizer to confirm everything works
# ------------------------------------------------------------
model_name = "gpt-oss-20b"
print(f"Loading tokenizer for {model_name}…")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=HF_TOKEN)
    sample = tokenizer("Hello, world!", return_tensors="pt")
    print("Tokenizer loaded successfully. Sample token IDs:", sample["input_ids"])  # shape: [1, 3]
except Exception as e:
    print("Error loading tokenizer:", e)



## Step 2: GPT‑OSS‑20B Architecture Overview

Welcome back! In this section we’ll peek under the hood of **GPT‑OSS‑20B** and see how its 20 billion parameters are organized. Think of the model as a *mega‑factory* that turns a string of characters into a probability distribution over the next character.

### 1️⃣  The Transformer Blueprint

GPT‑OSS‑20B is built on the **Transformer** architecture, which is a stack of identical *decoder* blocks. Each block contains two main sub‑layers:

1. **Self‑Attention** – the block looks at every token in the input and decides how much it should pay attention to each other token. Imagine a classroom where every student whispers to every other student; the attention weights are the whispers.
2. **Feed‑Forward Network (FFN)** – after the attention has mixed the information, the FFN applies a small neural net to each token independently, adding a non‑linear transformation.

Both sub‑layers are wrapped in a **residual connection** (the output is added to the input) and a **LayerNorm** that stabilises training.

### 2️⃣  Token Embeddings & Positional Encoding

The first step is to convert each token into a dense vector (the *token embedding*). GPT‑OSS‑20B uses a **Byte‑Pair Encoding (BPE)** tokenizer, which splits words into sub‑words so that rare words can still be represented.

Because the model is *autoregressive*, it needs to know the position of each token. Instead of a separate positional embedding matrix, GPT‑OSS‑20B uses a **learned positional embedding** that is added to the token embedding.

### 3️⃣  Layer Dimensions

| Parameter | Value | Meaning |
|-----------|-------|---------|
| `num_hidden_layers` | 32 | Number of decoder blocks |
| `hidden_size` | 4096 | Size of each token vector |
| `num_attention_heads` | 32 | Heads in multi‑head attention |
| `intermediate_size` | 11008 | Size of the FFN inner layer |

With 32 layers, each of width 4096, the total parameter count reaches ~20 B.

### 4️⃣  Extra Explanatory Paragraph

**Key terms**:
- **Self‑Attention**: a mechanism that lets each token weigh every other token’s contribution.
- **Multi‑Head Attention**: splits the attention into several *heads* so the model can capture different relationships.
- **LayerNorm**: normalises the activations to keep gradients stable.
- **Residual Connection**: adds the input of a sub‑layer to its output, helping gradients flow.
- **Positional Encoding**: injects token order information.

**Rationale & Trade‑offs**:
- *Depth vs Width*: More layers (depth) allow the model to learn hierarchical patterns, but increase memory and compute. GPT‑OSS‑20B balances depth (32) with a wide hidden size (4096) to capture rich semantics.
- *Attention Heads*: 32 heads give fine‑grained relational modeling but add memory overhead.
- *Precision*: Using FP16 during inference reduces memory by ~50 % and speeds up computation, at the cost of a tiny drop in numerical precision.

### 5️⃣  Quick Code Demo

Below we load the model configuration and run a tiny forward pass to see the shapes of tensors. This will confirm that the architecture is loaded correctly.




In [None]:
# ------------------------------------------------------------
# 1️⃣  Load the GPT‑OSS‑20B configuration
# ------------------------------------------------------------
import os
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Set reproducibility seed
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Load config (does not download weights)
config = AutoConfig.from_pretrained("gpt-oss-20b")
print("Model configuration loaded:")
print(f"  Hidden size: {config.hidden_size}")
print(f"  Number of layers: {config.num_hidden_layers}")
print(f"  Attention heads: {config.num_attention_heads}")
print(f"  Intermediate size: {config.intermediate_size}")

# ------------------------------------------------------------
# 2️⃣  Instantiate the model (weights will be downloaded lazily)
# ------------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained(
    "gpt-oss-20b",
    torch_dtype=torch.float16,  # use FP16 for memory efficiency
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",         # automatically place layers on available GPUs
)
print("\nModel instantiated on device:", next(model.parameters()).device)

# ------------------------------------------------------------
# 3️⃣  Dummy forward pass to inspect tensor shapes
# ------------------------------------------------------------
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt-oss-20b")
text = "The quick brown fox"
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to(next(model.parameters()).device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    print("\nLogits shape:", logits.shape)  # (batch, seq_len, vocab_size)
    print("Vocab size:", logits.size(-1))

# ------------------------------------------------------------
# 4️⃣  Clean up (optional, helps with memory on limited GPUs)
# ------------------------------------------------------------
del model
torch.cuda.empty_cache()



## Step 3: Loading the Model and Tokenizer

In the previous step we saw the architecture of GPT‑OSS‑20B, but we haven’t actually pulled the 20‑billion‑parameter weights into memory yet. This cell shows how to do that safely on a single GPU (or multiple GPUs if you have them) while keeping an eye on memory usage.

### Why do we need a *device map*?
Think of the model as a gigantic Lego set. If you try to build it all on one table that’s too small, the pieces will spill over. A *device map* tells PyTorch which GPU(s) should hold each block of the model, so the pieces stay on the right table.

### Why *FP16*?
FP16 (half‑precision) uses 16 bits instead of 32, cutting memory usage roughly in half and speeding up matrix multiplications. The trade‑off is a tiny loss in numerical precision, which for language generation is usually imperceptible.

### Extra Explanatory Paragraph
**Key terms**:
- **`device_map`** – a dictionary or keyword that maps model layers to GPU devices. `"auto"` lets the `accelerate` library decide.
- **`torch_dtype`** – the data type used for tensors. `torch.float16` (FP16) vs. `torch.float32` (FP32).
- **`gradient_checkpointing`** – a memory‑saving technique that recomputes activations during back‑prop instead of storing them. Useful during fine‑tuning.
- **`load_in_8bit`** – an optional flag that loads weights in 8‑bit integer format, trading a bit of accuracy for a huge memory reduction.

**Rationale & Trade‑offs**:
- *Memory vs. Speed*: FP16 and 8‑bit loading reduce memory but may slightly degrade output quality. For inference, FP16 is a sweet spot.
- *Precision vs. Stability*: FP32 gives the most stable gradients during training, so we keep it for fine‑tuning but switch to FP16 for inference.
- *Device Map vs. Parallelism*: `"auto"` is convenient but may not balance load perfectly on heterogeneous GPUs. For large clusters, consider `balanced` or a custom map.

### What you’ll do in this section
1. Load the tokenizer again (to confirm it works after the architecture load).
2. Load the full model with `device_map="auto"` and `torch_dtype=torch.float16`.
3. Run a tiny inference pass to verify everything is wired correctly.
4. (Optional) Enable gradient checkpointing if you plan to fine‑tune.

Let’s dive in!



In [None]:
# ------------------------------------------------------------
# 1️⃣  Load tokenizer (re‑confirm after architecture load)
# ------------------------------------------------------------
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reproducibility seed
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Load tokenizer (fast version for speed)
model_name = "gpt-oss-20b"
print("Loading tokenizer…")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=os.getenv("HF_TOKEN"))
    print("Tokenizer loaded. Sample token IDs:", tokenizer("Hello, world!", return_tensors="pt")["input_ids"])  # shape: [1, 3]
except Exception as e:
    print("Error loading tokenizer:", e)

# ------------------------------------------------------------
# 2️⃣  Load the full model with device_map and FP16
# ------------------------------------------------------------
print("\nLoading full GPT‑OSS‑20B model…")
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,          # FP16 for memory efficiency
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",                  # automatically place layers on GPUs
        trust_remote_code=True,             # allow custom model code if needed
        token=os.getenv("HF_TOKEN")
    )
    print("Model loaded on device:", next(model.parameters()).device)
except Exception as e:
    print("Error loading model:", e)

# ------------------------------------------------------------
# 3️⃣  Optional: enable gradient checkpointing for fine‑tuning
# ------------------------------------------------------------
# Uncomment the following lines if you plan to train on a single GPU
# model.gradient_checkpointing_enable()
# print("Gradient checkpointing enabled.")



In [None]:
# ------------------------------------------------------------
# 4️⃣  Quick inference test to verify everything works
# ------------------------------------------------------------
prompt = "Once upon a time, in a land far, far away"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(next(model.parameters()).device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\nGenerated text:\n", generated)

# ------------------------------------------------------------
# 5️⃣  Clean up to free GPU memory (optional but good practice)
# ------------------------------------------------------------
# del model
# torch.cuda.empty_cache()



## Step 4: Running Inference with Prompt Engineering

In the previous step we loaded the 20‑billion‑parameter GPT‑OSS model and verified that it can generate text.  Now we’ll focus on *prompt engineering*—the art of crafting the input string so that the model produces the most useful, accurate, and safe output.

### Why prompt engineering matters
Think of the model as a very knowledgeable but sometimes distracted student.  If you ask a vague question, the student might wander off into unrelated topics.  By giving a clear, structured prompt—just like giving a well‑written homework assignment—you guide the student’s attention and get a more relevant answer.

### Extra explanatory paragraph
**Key terms**:
- **Prompt** – the text you feed to the model before generation starts.  It can be a single sentence, a question, or a longer context.
- **Few‑shot** – a prompt that includes a few examples of the desired input‑output pattern before the new query.
- **Chain‑of‑Thought (CoT)** – a prompt that explicitly asks the model to reason step‑by‑step, improving logical consistency.
- **Temperature** – a sampling hyper‑parameter that controls randomness.  Lower values (≈0.2) make the output deterministic; higher values (≈0.8) increase creativity.
- **Top‑p (nucleus sampling)** – limits sampling to the smallest set of tokens whose cumulative probability exceeds *p*.  It balances diversity and coherence.
- **Repetition penalty** – discourages the model from repeating the same phrase.

**Rationale & trade‑offs**:
- *Determinism vs. creativity*: A low temperature gives consistent answers but can be dull; a high temperature can produce novel but sometimes incoherent text.
- *Prompt length vs. latency*: Longer prompts give the model more context but increase token count, which can slow inference and raise memory usage.
- *Few‑shot vs. zero‑shot*: Few‑shot prompts often improve accuracy on niche tasks but require careful formatting and can bloat the prompt.
- *Safety*: Adding explicit instructions (e.g., "Please avoid disallowed content") can help steer the model away from unsafe outputs, but the model may still hallucinate.

### What you’ll do in this section
1. Define a reusable generation function that accepts prompt‑engineering parameters.
2. Experiment with different prompt styles: zero‑shot, few‑shot, and chain‑of‑thought.
3. Observe how temperature and top‑p affect the output.
4. Verify reproducibility by setting a fixed random seed.

Let’s dive in!



In [None]:
# ------------------------------------------------------------
# 1️⃣  Utility function for inference with prompt‑engineering knobs
# ------------------------------------------------------------
import os
import random
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reproducibility: same seed → same random numbers
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Load tokenizer & model (assumes previous step loaded them, otherwise reload)
MODEL_NAME = "gpt-oss-20b"
print("Loading tokenizer and model…")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, token=os.getenv("HF_TOKEN"))
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN")
    )
    print("Model loaded on device:", next(model.parameters()).device)
except Exception as e:
    print("Error loading model/tokenizer:", e)
    raise

# ------------------------------------------------------------
# 2️⃣  Generation helper
# ------------------------------------------------------------

def generate_text(
    prompt: str,
    max_new_tokens: int = 50,
    temperature: float = 0.7,
    top_p: float = 0.9,
    do_sample: bool = True,
    repetition_penalty: float = 1.0,
    pad_token_id: int | None = None,
) -> str:
    """Generate text from a prompt using the loaded GPT‑OSS model.

    Parameters
    ----------
    prompt: str
        The input prompt string.
    max_new_tokens: int
        Number of tokens to generate.
    temperature: float
        Controls randomness; lower = more deterministic.
    top_p: float
        Nucleus sampling threshold.
    do_sample: bool
        If False, uses greedy decoding.
    repetition_penalty: float
        Penalises repeated tokens.
    pad_token_id: int | None
        Token id used for padding; defaults to model's eos_token_id.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(next(model.parameters()).device) for k, v in inputs.items()}

    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,
            repetition_penalty=repetition_penalty,
            pad_token_id=pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ------------------------------------------------------------
# 3️⃣  Quick sanity check
# ------------------------------------------------------------
print("\nSanity check – simple prompt:")
print(generate_text("Once upon a time, in a land far, far away", max_new_tokens=20))



In [None]:
# ------------------------------------------------------------
# 4️⃣  Prompt‑engineering experiments
# ------------------------------------------------------------
# 4.1  Zero‑shot prompt (plain question)
zero_shot = "What are the main benefits of using GPT‑OSS for text generation?"
print("\nZero‑shot prompt output:")
print(generate_text(zero_shot, max_new_tokens=60, temperature=0.5))

# 4.2  Few‑shot prompt (provide example Q&A)
few_shot = (
    "Q: What is the capital of France?\n"
    "A: Paris.\n"
    "Q: Who wrote '1984'?\n"
    "A: George Orwell.\n"
    "Q: What are the main benefits of using GPT‑OSS for text generation?\n"
    "A:"
)
print("\nFew‑shot prompt output:")
print(generate_text(few_shot, max_new_tokens=60, temperature=0.5))

# 4.3  Chain‑of‑Thought (CoT) prompt – ask the model to reason step‑by‑step
cot_prompt = (
    "You are a helpful assistant.\n"
    "First, list the steps needed to answer the following question.\n"
    "Then, provide the final answer.\n"
    "\nQuestion: What are the main benefits of using GPT‑OSS for text generation?"
)
print("\nChain‑of‑Thought prompt output:")
print(generate_text(cot_prompt, max_new_tokens=120, temperature=0.7))

# 4.4  Temperature sweep – show effect of randomness
print("\nTemperature sweep (0.2, 0.7, 1.2):")
for temp in [0.2, 0.7, 1.2]:
    out = generate_text(zero_shot, max_new_tokens=40, temperature=temp)
    print(f"Temp={temp}: {out}\n")



## Section 5

Thinking...
>We need to produce JSON with section_number 5, title, content array with markdown and code cells, callouts array, estimated_tokens 1000, prerequisites_check, next_section_hint. Must follow guidelines: 800-1000 tokens per section, beginner-friendly ELI5, analogies, precise terms, extra explanatory paragraph defining key terms and rationale/trade-offs, executable code <=30 lines each, callouts with tip, etc. Must ensure reproducibility seeds, versions. Provide code cells for fine-tuni...


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    return f'Hello, {name}!'

print(greet())


## Section 6

Thinking...
>We need to produce JSON for section 6. Must follow guidelines: 800-1000 tokens, beginner-friendly ELI5, analogies, precise terms, extra explanatory paragraph defining key terms and rationale/trade-offs, code cells <=30 lines each, callouts. Provide reproducibility seeds, versions. Provide content array with markdown and code cells. Provide callouts array. Provide estimated_tokens 1000. Provide prerequisites_check. Provide next_section_hint.
>
>We need to produce content for Step 6: ...


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    return f'Hello, {name}!'

print(greet())


## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    # Use (label, value) so rb.value is the numeric index
    rb = widgets.RadioButtons(options=[(f'{chr(65+i)}. '+opt, i) for i,opt in enumerate(options)], description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')
    def on_grade(_):
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'
    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
render_mcq("Which of the following is NOT a recommended strategy for reducing inference latency on GPT‑OSS‑20B?", ["Using FP16 precision","Increasing the batch size beyond GPU memory limits","Applying TorchScript","Enabling gradient checkpointing during inference"], 1, "Increasing batch size beyond GPU memory limits will cause out‑of‑memory errors and actually increase latency due to swapping.")


In [None]:
render_mcq("What is the primary benefit of using the Hugging Face Trainer for fine‑tuning?", ["Automatic mixed‑precision training","Built‑in support for distributed training","Simplified evaluation loop","All of the above"], 3, "The Trainer abstracts many complexities, providing mixed‑precision, distributed training, and evaluation hooks out of the box.")


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
