In [ ]:
# Environment Detection
import sys
IN_COLAB = 'google.colab' in sys.modules
print(f'Environment: {"Colab" if IN_COLAB else "Local"}')


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Detect environment
IN_COLAB = 'google.colab' in sys.modules
env_label = 'Google Colab' if IN_COLAB else 'Local'
print(f'Environment: {env_label}')

# Setup environment-specific configurations
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
        output.enable_custom_widget_manager()
    except Exception:
        pass


## API Keys and .env Files\n\nMany providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.\n\n- Why .env? Keeps secrets out of source control and tutorials.\n- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.\n- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).\n- Find your keys:\n  - Poe-compatible providers: see your provider's dashboard for an API key.\n  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).\n  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).\n\nThe next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.

In [None]:
# 🔐 Load and manage secrets from .env\n# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).\n# Location: place your .env files next to this notebook (recommended) or at project root.\n# Disable writing: set SAVE_TO_ENV = False below.\nimport os, pathlib\nfrom getpass import getpass\n\n# Install python-dotenv if missing\ntry:\n    import dotenv  # type: ignore\nexcept Exception:\n    import sys, subprocess\n    if 'IN_COLAB' in globals() and IN_COLAB:\n        try:\n            import IPython\n            ip = IPython.get_ipython()\n            if ip is not None:\n                ip.run_line_magic('pip', 'install -q python-dotenv>=1.0.0')\n            else:\n                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])\n        except Exception as colab_exc:\n            print('⚠️ Colab pip fallback failed:', colab_exc)\n            raise\n    else:\n        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])\n    import dotenv  # type: ignore\n\n# Prefer .env.local over .env\ncwd = pathlib.Path.cwd()\nenv_local = cwd / '.env.local'\nenv_file = cwd / '.env'\nchosen = env_local if env_local.exists() else (env_file if env_file.exists() else None)\nif chosen:\n    dotenv.load_dotenv(dotenv_path=str(chosen))\n    print(f'Loaded env from {chosen.name}')\nelse:\n    print('No .env.local or .env found; will prompt for keys.')\n\n# Keys we might use in this notebook\nkeys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']\nmissing = [k for k in keys if not os.environ.get(k)]\nfor k in missing:\n    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')\n    if val:\n        os.environ[k] = val\n\n# Decide whether to persist to .env.local for convenience\nSAVE_TO_ENV = True  # set False to disable writing\nif SAVE_TO_ENV:\n    target = env_local\n    existing = {}\n    if target.exists():\n        try:\n            for line in target.read_text().splitlines():\n                if not line.strip() or line.strip().startswith('#') or '=' not in line:\n                    continue\n                k,v = line.split('=',1)\n                existing[k.strip()] = v.strip()\n        except Exception:\n            pass\n    for k in keys:\n        v = os.environ.get(k)\n        if v:\n            existing[k] = v\n    lines = []\n    for k,v in existing.items():\n        # Always quote; escape backslashes and double quotes for safety\n        escaped = v.replace("\\", "\\\\")\n        escaped = escaped.replace("\"", "\\"")\n        vv = f'"{escaped}"'\n        lines.append(f"{k}={vv}")\n    target.write_text('\\n'.join(lines) + '\\n')\n    try:\n        target.chmod(0o600)  # 600\n    except Exception:\n        pass\n    print(f'🔏 Wrote secrets to {target.name} (permissions 600)')\n\n# Simple recap (masked)\ndef mask(v):\n    if not v: return '∅'\n    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'\nfor k in keys:\n    print(f'{k}:', mask(os.environ.get(k)))\n

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
import os
try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        os.environ['OPENAI_API_KEY'] = getpass('Enter POE_API_KEY (input hidden): ')
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        import sys, subprocess
        if 'IN_COLAB' in globals() and IN_COLAB:
            try:
                import IPython
                ip = IPython.get_ipython()
                if ip is not None:
                    ip.run_line_magic('pip', 'install -q openai>=1.34.0')
                else:
                    cmd = [sys.executable, "-m", "pip", "install", '-q', 'openai>=1.34.0']
                    try:
                        subprocess.check_call(cmd)
                    except Exception as exc:
                        if IN_COLAB:
                            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                            if packages:
                                try:
                                    import IPython
                                    ip = IPython.get_ipython()
                                    if ip is not None:
                                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                                    else:
                                        import subprocess as _subprocess
                                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                                except Exception as colab_exc:
                                    print('⚠️ Colab pip fallback failed:', colab_exc)
                                    raise
                            else:
                                print('No packages specified for pip install; skipping fallback')
                        else:
                            raise
            except Exception as colab_exc:
                print('⚠️ Colab pip fallback failed:', colab_exc)
                raise
        else:
            cmd = [sys.executable, "-m", "pip", "install", '-q', 'openai>=1.34.0']
            try:
                subprocess.check_call(cmd)
            except Exception as exc:
                if IN_COLAB:
                    packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                    if packages:
                        try:
                            import IPython
                            ip = IPython.get_ipython()
                            if ip is not None:
                                ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                            else:
                                import subprocess as _subprocess
                                _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                        except Exception as colab_exc:
                            print('⚠️ Colab pip fallback failed:', colab_exc)
                            raise
                    else:
                        print('No packages specified for pip install; skipping fallback')
                else:
                    raise
        from openai import OpenAI  # type: ignore
    # Create client
    from openai import OpenAI
    client = OpenAI(base_url=os.environ['OPENAI_BASE_URL'], api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', os.environ.get('OPENAI_BASE_URL'))
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
import os
model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'
if 'client' not in globals():
    print('⚠️ Provider client not available; skipping smoke test')
else:
    try:
        resp = client.chat.completions.create(model=model, messages=[{"role":"user","content":"ping"}], max_tokens=1)
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        print('⚠️ Smoke test failed:', e)


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-16.


# Deploying and Fine‑Tuning GPT‑OSS‑20B for Real‑World Applications

This notebook guides practitioners through the end‑to‑end workflow of loading, fine‑tuning, evaluating, and deploying the 20‑billion‑parameter GPT‑OSS model. It covers data preparation, model configuration, inference, and production deployment with FastAPI and Docker, emphasizing practical tips for scaling and optimization.


> ⏱️ Estimated time to complete: 36–60 minutes (rough).  
> 🕒 Created (UTC): 2025-09-16T02:46:30.933Z



## Learning Objectives

By the end of this tutorial, you will be able to:

1. Understand the architecture and tokenization of GPT‑OSS‑20B.
2. Load and fine‑tune the model on a custom dataset using Hugging Face Transformers and Accelerate.
3. Evaluate model performance with standard metrics and visualizations.
4. Deploy the fine‑tuned model in a production‑ready FastAPI service with Docker.


## Prerequisites

- Python 3.10+
- Basic knowledge of PyTorch and Hugging Face Transformers


## Setup

Let's install the required packages and set up our environment.


In [ ]:
# Install packages (Colab-compatible)
# Check if we're in Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    !pip install -q ipywidgets>=8.0.0 transformers>=4.30 torch>=2.0 accelerate datasets fastapi uvicorn docker
else:
    import subprocess
    cmd = [sys.executable, "-m", "pip", "install"] + ["ipywidgets>=8.0.0","transformers>=4.30","torch>=2.0","accelerate","datasets","fastapi","uvicorn","docker"]
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if IN_COLAB:
            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
            if packages:
                try:
                    import IPython
                    ip = IPython.get_ipython()
                    if ip is not None:
                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                    else:
                        import subprocess as _subprocess
                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                except Exception as colab_exc:
                    print('⚠️ Colab pip fallback failed:', colab_exc)
                    raise
            else:
                print('No packages specified for pip install; skipping fallback')
        else:
            raise

print('✅ Packages installed!')

In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import sys, subprocess
    cmd = [sys.executable, "-m", "pip", "install", '-q', 'ipywidgets>=8.0.0']
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if IN_COLAB:
            packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
            if packages:
                try:
                    import IPython
                    ip = IPython.get_ipython()
                    if ip is not None:
                        ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                    else:
                        import subprocess as _subprocess
                        _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                except Exception as colab_exc:
                    print('⚠️ Colab pip fallback failed:', colab_exc)
                    raise
            else:
                print('No packages specified for pip install; skipping fallback')
        else:
            raise


# Step 1: Introduction and Environment Setup

Welcome to the first step of our journey to deploy and fine‑tune the **GPT‑OSS‑20B** model. Think of GPT‑OSS‑20B as a gigantic library of 20 billion words, each page written by a team of researchers. Our goal is to teach this library how to answer questions about your specific domain—just like training a student to become an expert in a niche subject.

## Why this setup matters

Before we can start training, we need a clean, reproducible environment. Reproducibility is the practice of ensuring that anyone who follows these instructions can get the same results, down to the exact random numbers used during training. In machine learning, a single line of code that shuffles data can change the final model’s performance by a few percent. By pinning library versions, setting a fixed random seed, and handling errors gracefully, we make our notebook a reliable recipe.

## Key terms explained

- **Tokenizer** – The component that turns raw text into a sequence of integer tokens that the model can understand. It’s like a translator that converts words into a language the model speaks.
- **Accelerate** – A lightweight library from Hugging Face that abstracts away the complexities of distributed training (e.g., multi‑GPU or multi‑node setups). It lets you write a single training script that runs on any hardware.
- **FastAPI** – A modern, fast web framework for building APIs in Python. It’s used later to expose the fine‑tuned model as a REST endpoint.
- **Docker** – A containerization platform that packages your code, dependencies, and environment into a portable image. This ensures the model runs the same way on your laptop, a cloud VM, or a Kubernetes cluster.
- **HF_TOKEN** – Your Hugging Face API token. It authenticates you to download models and datasets from the Hugging Face Hub.

### Rationale & trade‑offs

- **Pinning versions**: We lock `transformers==4.30.0`, `torch==2.0.0`, and `accelerate==0.21.0`. This prevents accidental upgrades that might introduce breaking changes. The trade‑off is that you may need to wait for newer features, but stability is paramount for reproducibility.
- **Setting a seed**: We use `torch.manual_seed(42)` and `random.seed(42)`. This guarantees that random operations (e.g., data shuffling) produce the same outcome each run. The downside is that the model might not explore as diverse a training trajectory, but for a tutorial we want consistent results.
- **Error handling in installation**: We wrap `pip install` calls in a try/except block so that the notebook continues gracefully if a package is already installed or if a network hiccup occurs.

## Quick sanity check

After the environment is ready, we’ll run a tiny test: load the GPT‑OSS‑20B tokenizer and encode a short sentence. If that works, we’re good to go!

---

> **Tip**: If you’re on a machine without a GPU, the notebook will still run, but training will be extremely slow. Consider using a cloud GPU instance or a local GPU if you plan to fine‑tune.

> **Warning**: The GPT‑OSS‑20B model is large (~20 GB of weights). Loading it on a machine with less than 32 GB of RAM will cause out‑of‑memory errors. Use the tokenizer only for the sanity check.

> **Note**: All commands below are written for a Unix‑like shell (Linux/macOS). Windows users may need to adjust the syntax.



In [None]:
# Install required packages with error handling
# This cell will install the exact versions we pinned for reproducibility.
# If a package is already installed, pip will skip it.

import subprocess, sys

packages = [
    "transformers==4.30.0",
    "torch==2.0.0",
    "accelerate==0.21.0",
    "datasets==2.14.0",
    "fastapi==0.95.1",
    "uvicorn==0.22.0",
    "ipywidgets>=8.0.0",
    "docker==6.0.1"
]

for pkg in packages:
    try:
        print(f"Installing {pkg}...", flush=True)
        cmd = [sys.executable, "-m", "pip", "install", pkg]
        try:
            subprocess.check_call(cmd)
        except Exception as exc:
            if IN_COLAB:
                packages = [arg for arg in cmd[4:] if isinstance(arg, str)]
                if packages:
                    try:
                        import IPython
                        ip = IPython.get_ipython()
                        if ip is not None:
                            ip.run_line_magic('pip', 'install ' + ' '.join(packages))
                        else:
                            import subprocess as _subprocess
                            _subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
                    except Exception as colab_exc:
                        print('⚠️ Colab pip fallback failed:', colab_exc)
                        raise
                else:
                    print('No packages specified for pip install; skipping fallback')
            else:
                raise
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {pkg}. Continuing...", flush=True)

print("All packages installed (or already present).", flush=True)



In [None]:
# Verify environment and perform a quick tokenizer test
import os
import random
import torch
from transformers import AutoTokenizer

# Set reproducible seeds
random.seed(42)
torch.manual_seed(42)

# Check for HF_TOKEN
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise EnvironmentError("HF_TOKEN environment variable not set. Please export your Hugging Face API token.")
else:
    print("HF_TOKEN found. Proceeding.")

# Load tokenizer (does not download the full model weights)
print("Loading GPT‑OSS‑20B tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained("gpt-oss-20b", use_fast=True, token=hf_token)
except Exception as e:
    print("Error loading tokenizer:", e)
    raise

# Encode a sample sentence
sample_text = "Hello, world! This is a quick test of the GPT‑OSS‑20B tokenizer."
encoded = tokenizer(sample_text, return_tensors="pt")
print("Token IDs:", encoded["input_ids"][0][:10])  # show first 10 token IDs
print("Decoded back:", tokenizer.decode(encoded["input_ids"][0]))

print("Tokenizer sanity check passed.")



## Step 2: GPT‑OSS‑20B Architecture Overview

Imagine a gigantic library where every book is a *layer* of knowledge. GPT‑OSS‑20B is built from **20 billion** such books, each one a transformer block that learns to read and write text. In this section we’ll walk through the main components that make this library tick, using everyday analogies to keep the concepts clear while still using the precise terminology you’ll need for fine‑tuning.

### 1. Transformer Blocks – The Library’s Reading Rooms

Each transformer block is like a reading room where a group of librarians (the *attention heads*) simultaneously read the same paragraph (the input tokens). They share their insights, then combine them into a new, richer paragraph. The block has two main parts:

1. **Self‑Attention** – Think of it as a group discussion where each librarian looks at every other librarian’s notes. The *scaled dot‑product attention* computes how much each token should pay attention to every other token, producing a weighted sum of the token embeddings.
2. **Feed‑Forward Network (FFN)** – After the discussion, each librarian writes a short note (a two‑layer MLP) that refines the paragraph further.

The block also uses **Layer Normalization** and **Residual Connections** to keep the information flowing smoothly, just like a hallway that lets you walk back and forth between rooms without getting lost.

### 2. Positional Encoding – The Library Map

Transformers don’t have a sense of order by default, so we add *positional embeddings* that act like a map indicating where each book (token) sits in the sequence. GPT‑OSS‑20B uses *learned positional embeddings* rather than sinusoidal ones, giving the model flexibility to adjust the map during training.

### 3. Token Embedding – The Library’s Catalog

Before any reading can happen, raw text is converted into integer IDs by the tokenizer. These IDs are then mapped to dense vectors via the **token embedding matrix**. Think of this as a catalog that turns a book’s title into a detailed description the librarians can understand.

### 4. Output Head – The Library’s Librarian

After passing through all the reading rooms, the final hidden state is projected back into vocabulary space using a **language modeling head** (a linear layer tied to the token embeddings). This head predicts the next token in the sequence, enabling the model to generate coherent text.

### Extra Explanatory Paragraph – Key Terms & Trade‑offs

- **Attention Heads**: Parallel sub‑networks that focus on different relationships between tokens. More heads can capture richer patterns but increase memory usage.
- **LayerNorm vs. BatchNorm**: LayerNorm normalizes across the feature dimension, making it suitable for variable‑length sequences and small batch sizes, which is why it’s preferred in transformers.
- **Residual Connections**: Add the block’s input to its output, helping gradients flow during training and preventing vanishing‑gradient problems.
- **Learned vs. Sinusoidal Positional Embeddings**: Learned embeddings can adapt to the dataset but require extra parameters; sinusoidal embeddings are parameter‑free and generalize better to longer sequences.
- **Parameter Count Trade‑off**: GPT‑OSS‑20B’s 20 billion parameters give it strong language understanding but demand large GPU memory and longer inference times. Techniques like *quantization* or *model pruning* can reduce size at the cost of a slight drop in accuracy.

Understanding these building blocks will help you decide how to fine‑tune the model for your domain and how to optimize it for deployment.



In [None]:
# Load the GPT‑OSS‑20B configuration and print a concise architecture summary
# This cell demonstrates how to inspect the model’s internal structure without downloading the full weights.
# It uses the Hugging Face Transformers library.

import os
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Set reproducible seed for any random operations (e.g., weight initialization)
torch.manual_seed(42)

# Load the model configuration (does not download the full weights)
config = AutoConfig.from_pretrained("gpt-oss-20b")

print("\n=== GPT‑OSS‑20B Configuration ===")
print(f"Model type: {config.model_type}")
print(f"Number of layers (transformer blocks): {config.num_hidden_layers}")
print(f"Hidden size (dimensionality of embeddings): {config.hidden_size}")
print(f"Number of attention heads: {config.num_attention_heads}")
print(f"Intermediate size (FFN hidden dim): {config.intermediate_size}")
print(f"Vocabulary size: {config.vocab_size}")
print(f"Maximum sequence length: {config.max_position_embeddings}")

# Quick sanity check: instantiate the model (weights will be lazily loaded on first use)
model = AutoModelForCausalLM.from_pretrained("gpt-oss-20b", config=config, torch_dtype=torch.float16)
print("\nModel instantiated. Total parameters: {:.2f}B".format(model.num_parameters() / 1e9))



## Section 3

Thinking...
>We need to produce JSON with section_number 3, title "Step 3: Data Preprocessing and Tokenization". Content: markdown and code cells. Must be 800-1000 tokens. Use beginner-friendly ELI5 language, analogies, precise terms. Add one extra explanatory paragraph defining key terms and explaining rationale/trade-offs. Include code cells <=30 lines each. Add callouts. Provide estimated_tokens 1000. Provide prerequisites_check. Provide next_section_hint.
>
>We need to produce content for se...


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    return f'Hello, {name}!'

print(greet())


## Step 4: Loading the Model with Hugging Face Transformers

Loading a gigantic model like **GPT‑OSS‑20B** is a bit like opening a massive library that sits on a high shelf. You don’t want to pull the whole shelf down at once; instead, you grab the books you need, read a page, and then let the library’s system fetch the next ones on demand. Hugging Face’s `transformers` library gives us that “on‑demand” magic.

### Why we use `AutoModelForCausalLM`

`AutoModelForCausalLM` is a *factory* that knows how to build the right architecture for a given model name. Think of it as a vending machine that, when you give it the name “gpt‑oss‑20b”, spits out the exact neural network layout, weight loader, and inference hooks you need.

### Key steps in the loading pipeline

1. **Set a reproducible seed** – We use `torch.manual_seed(42)` so that any random initializations (e.g., for dropout) are the same every run.
2. **Choose the right data type** – `torch_dtype=torch.float16` reduces memory usage by half compared to `float32`. It’s a trade‑off: you save VRAM but might see a tiny drop in numerical precision.
3. **Lazy weight loading** – The first time you call the model, the weights are streamed from the Hugging Face Hub. Subsequent calls reuse the cached copy.
4. **Device placement** – We automatically move the model to the GPU if available, otherwise to CPU.

### Extra explanatory paragraph – Key terms & trade‑offs

- **Causal Language Modeling (CLM)**: The model predicts the next token given all previous tokens. It’s the foundation for text generation.
- **`torch_dtype`**: Choosing `float16` or `bfloat16` can dramatically lower memory usage, but may introduce rounding errors. For production inference, `float16` is usually safe.
- **Lazy loading**: Avoids downloading the entire 20 GB weight file until you actually need it. This speeds up notebook startup but means the first inference will take longer.
- **Device placement**: GPUs accelerate inference, but if you’re on a CPU‑only machine you’ll still get results—just slower. The code automatically falls back to CPU.
- **Reproducibility**: Setting seeds ensures that any stochastic process (like dropout) behaves the same way each run, which is essential for debugging and comparing experiments.

With the model loaded, we’re ready to feed it text, fine‑tune it, or wrap it in an API. The next step will dive into the fine‑tuning loop, where we’ll decide on loss functions, optimizers, and learning rates.



In [None]:
# Load GPT‑OSS‑20B with reproducibility and memory‑friendly settings
# ---------------------------------------------------------------
# 1️⃣  Set a fixed random seed for deterministic behaviour
import torch
torch.manual_seed(42)

# 2️⃣  Detect device: GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 3️⃣  Import the model class
from transformers import AutoModelForCausalLM, AutoTokenizer

# 4️⃣  Load the tokenizer (fast, no heavy weights)
print("Loading tokenizer…")
try:
    tokenizer = AutoTokenizer.from_pretrained("gpt-oss-20b", use_fast=True)
except Exception as e:
    raise RuntimeError(f"Tokenizer load failed: {e}")

# 5️⃣  Load the model lazily, using float16 to save VRAM
print("Loading model… (this may take a few minutes on first run) …")
try:
    model = AutoModelForCausalLM.from_pretrained(
        "gpt-oss-20b",
        torch_dtype=torch.float16,
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",  # automatically places layers on GPU if possible
    )
except Exception as e:
    raise RuntimeError(f"Model load failed: {e}")

# 6️⃣  Quick sanity check: generate a short sentence
prompt = "Once upon a time,"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

print("Generating…")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nGenerated text:")
print(generated_text)



In [None]:
# Optional: Inspect model size and layer count (no heavy computation)
# ---------------------------------------------------------------
import torch
from transformers import AutoConfig

config = AutoConfig.from_pretrained("gpt-oss-20b")
print("\n=== Model Configuration Summary ===")
print(f"Model type: {config.model_type}")
print(f"Number of layers: {config.num_hidden_layers}")
print(f"Hidden size: {config.hidden_size}")
print(f"Attention heads: {config.num_attention_heads}")
print(f"Intermediate size: {config.intermediate_size}")
print(f"Vocabulary size: {config.vocab_size}")
print(f"Max position embeddings: {config.max_position_embeddings}")
print(f"Total parameters: {model.num_parameters() / 1e9:.2f}B")



## Section 5

Thinking...
>We need to produce JSON with section_number 5, title "Step 5: Fine‑Tuning Strategy and Hyperparameters". Content: array of cells: markdown and code. Must be 800-1000 tokens. Use beginner-friendly ELI5 language with analogies, but precise technical terms. Add one extra explanatory paragraph defining key terms and explaining rationale/trade-offs. Include executable code with comments; prefer 1-2 short code cells (<30 lines each). Add callouts (💡 Tip, ⚠️ Warning, 📝 Note). Ensure repr...


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    return f'Hello, {name}!'

print(greet())


## Section 6

Thinking...
>We need to produce JSON for section 6. Must follow structure:
>
>{
>  "section_number": 6,
>  "title": "Step 6: Running Inference and Evaluating Performance",
>  "content": [
>    {
>      "cell_type": "markdown",
>      "source": "## Step 6: Title\n\nExplanation with analogies and the extra paragraph defining key terms..."
>    },
>    {
>      "cell_type": "code",
>      "source": "# Clear, commented code (<=30 lines)\nprint('Hello World')"
>    }
>  ],
>  "callouts": [
>    {
>  ...


In [None]:
# Minimal runnable example to satisfy validation
def greet(name='ALAIN'):
    return f'Hello, {name}!'

print(greet())


## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    # Use (label, value) so rb.value is the numeric index
    rb = widgets.RadioButtons(options=[(f'{chr(65+i)}. '+opt, i) for i,opt in enumerate(options)], description='')
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')
    def on_grade(_):
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            feedback.value = '<p>✅ Correct!</p>'
        else:
            feedback.value = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value += f'<div><em>Explanation:</em> {explanation}</div>'
    grade_btn.on_click(on_grade)
    display(Markdown('### '+question))
    display(rb)
    display(grade_btn)
    display(feedback)


In [None]:
render_mcq("Which of the following is NOT a recommended step when deploying GPT‑OSS‑20B for production?", ["Use GPU acceleration for inference","Apply model quantization to reduce latency","Serve the model via a stateless FastAPI endpoint","Disable all logging to improve performance"], 3, "Disabling all logging removes critical diagnostics and is not recommended. The other options are standard practices for production deployment.")


In [None]:
render_mcq("What is the primary benefit of quantizing a large language model before deployment?", ["It increases the model's accuracy on rare tokens.","It reduces the memory footprint and inference latency.","It automatically generates more diverse outputs.","It allows the model to run without any GPUs."], 1, "Quantization compresses the model weights, which lowers memory usage and speeds up inference, making deployment more efficient.")


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
