In [ ]:
# Environment Detection
# Colab kernels have the `google.colab` package loaded, so its presence in
# sys.modules is used as the runtime switch for later cells.
import sys

IN_COLAB = 'google.colab' in sys.modules
print('Environment: ' + ("Colab" if IN_COLAB else "Local"))


> Generated by ALAIN (Applied Learning AI Notebooks) — 2025-09-19.


In [None]:
# 🔧 Environment Detection and Setup
import sys
import os

# Runtime detection: Colab kernels have `google.colab` loaded in sys.modules.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    env_label = 'Google Colab'
else:
    env_label = 'Local'
print('Environment: ' + env_label)

# Colab-only tweak: turn on the custom widget manager. This is best-effort,
# so any failure is ignored and the notebook carries on.
if IN_COLAB:
    print('📝 Colab-specific optimizations enabled')
    try:
        from google.colab import output
    except Exception:
        pass
    else:
        try:
            output.enable_custom_widget_manager()
        except Exception:
            pass


## API Keys and .env Files

Many providers require API keys. Do not hardcode secrets in notebooks. Use a local .env file that the notebook loads at runtime.

- Why .env? Keeps secrets out of source control and tutorials.
- Where? Place `.env.local` (preferred) or `.env` in the same folder as this notebook. `.env.local` overrides `.env`.
- What keys? Common: `POE_API_KEY` (Poe-compatible servers), `OPENAI_API_KEY` (OpenAI-compatible), `HF_TOKEN` (Hugging Face).
- Find your keys:
  - Poe-compatible providers: see your provider's dashboard for an API key.
  - Hugging Face: create a token at https://huggingface.co/settings/tokens (read scope is usually enough).
  - Local servers: you may not need a key; set `OPENAI_BASE_URL` instead (e.g., http://localhost:1234/v1).

The next cell will: load `.env.local`/`.env`, prompt for missing keys, and optionally write `.env.local` with secure permissions so future runs just work.


In [None]:
# 🔐 Load and manage secrets from .env
# This cell will: (1) load .env.local/.env, (2) prompt for missing keys, (3) optionally write .env.local (0600).
# Location: place your .env files next to this notebook (recommended) or at project root.
# Disable writing: set SAVE_TO_ENV = False below.
import os, pathlib
from getpass import getpass

# Install python-dotenv if missing
try:
    import dotenv  # type: ignore
except Exception:
    import sys, subprocess
    if 'IN_COLAB' in globals() and IN_COLAB:
        try:
            import IPython
            ip = IPython.get_ipython()
            if ip is not None:
                # %pip runs through a shell: the specifier must be quoted or
                # `>=1.0.0` is treated as an output redirection.
                ip.run_line_magic('pip', 'install -q "python-dotenv>=1.0.0"')
            else:
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise
    else:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'python-dotenv>=1.0.0'])
    import dotenv  # type: ignore

# Prefer .env.local over .env: both are loaded, .env.local first. load_dotenv
# does not override variables that are already set, so .env.local wins on
# conflicts while keys that exist only in .env are still picked up.
cwd = pathlib.Path.cwd()
env_local = cwd / '.env.local'
env_file = cwd / '.env'
found_env_files = [p for p in (env_local, env_file) if p.exists()]
chosen = found_env_files[0] if found_env_files else None
for env_path in found_env_files:
    dotenv.load_dotenv(dotenv_path=str(env_path))
    print(f'Loaded env from {env_path.name}')
if not found_env_files:
    print('No .env.local or .env found; will prompt for keys.')

# Keys we might use in this notebook
keys = ['POE_API_KEY', 'OPENAI_API_KEY', 'HF_TOKEN']
missing = [k for k in keys if not os.environ.get(k)]
for k in missing:
    val = getpass(f'Enter {k} (hidden, press Enter to skip): ')
    if val:
        os.environ[k] = val

# Decide whether to persist to .env.local for convenience
SAVE_TO_ENV = True  # set False to disable writing
if SAVE_TO_ENV:
    target = env_local
    existing = {}
    if target.exists():
        try:
            # Parse with python-dotenv so quoted/escaped values are decoded;
            # a naive split on '=' would keep the quotes and re-escape them
            # on every run.
            parsed = dotenv.dotenv_values(str(target), interpolate=False)
            existing = {k: v for k, v in parsed.items() if v is not None}
        except Exception as read_exc:
            # Do not overwrite a file we could not read back.
            print(f'⚠️ Could not parse {target.name}; leaving it untouched:', read_exc)
            existing = None
    if existing is not None:
        for k in keys:
            v = os.environ.get(k)
            if v:
                existing[k] = v
    if existing:
        lines = []
        for k, v in existing.items():
            # Always quote; escape backslashes and double quotes for safety
            escaped = v.replace("\\", "\\\\").replace('"', '\\"')
            lines.append(f'{k}="{escaped}"')
        # Create the file with 0600 from the start rather than chmod-ing afterwards.
        fd = os.open(str(target), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w', encoding='utf-8') as fh:
            fh.write('\n'.join(lines) + '\n')
        try:
            target.chmod(0o600)  # tighten a pre-existing file; best-effort on platforms without POSIX modes
        except Exception:
            pass
        print(f'🔏 Wrote secrets to {target.name} (permissions 600)')
    elif existing is not None:
        print(f'No secrets to save; skipped writing {target.name}')

# Simple recap (masked)
def mask(v):
    """Return a display-safe form of a secret: '∅' if unset, '•••' if short, else first 3 + last 2 chars."""
    if not v:
        return '∅'
    return v[:3] + '…' + v[-2:] if len(v) > 6 else '•••'

for k in keys:
    print(f'{k}:', mask(os.environ.get(k)))

In [None]:
# 🌐 ALAIN Provider Setup (Poe/OpenAI-compatible)
# About keys: If you have POE_API_KEY, this cell maps it to OPENAI_API_KEY and sets OPENAI_BASE_URL to Poe.
# Otherwise, set OPENAI_API_KEY (and optionally OPENAI_BASE_URL for local/self-hosted servers).
# On success a module-level `client` is created; on failure the error is printed and `client` is not (re)assigned.
import os
try:
    # Prefer Poe; fall back to OPENAI_API_KEY if set
    poe = os.environ.get('POE_API_KEY')
    if poe:
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
        os.environ.setdefault('OPENAI_API_KEY', poe)
    # Prompt if no key present
    if not os.environ.get('OPENAI_API_KEY'):
        from getpass import getpass
        entered_key = getpass('Enter POE_API_KEY (input hidden): ')
        if not entered_key:
            # An empty key would only fail later with a less helpful auth error.
            raise RuntimeError('No API key provided')
        os.environ['OPENAI_API_KEY'] = entered_key
        os.environ.setdefault('OPENAI_BASE_URL', 'https://api.poe.com/v1')
    # Ensure openai client is installed
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        import sys, subprocess
        if 'IN_COLAB' in globals() and IN_COLAB:
            try:
                import IPython
                ip = IPython.get_ipython()
                if ip is not None:
                    # Quote the specifier: %pip runs through a shell, where a
                    # bare `>=1.34.0` would be parsed as a redirection.
                    ip.run_line_magic('pip', 'install -q "openai>=1.34.0"')
                else:
                    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'openai>=1.34.0'])
            except Exception as colab_exc:
                print('⚠️ Colab pip fallback failed:', colab_exc)
                raise
        else:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'openai>=1.34.0'])
        from openai import OpenAI  # type: ignore
    # Create client. base_url may be unset when only OPENAI_API_KEY was given;
    # passing None lets the client use its default endpoint instead of raising KeyError.
    client = OpenAI(base_url=os.environ.get('OPENAI_BASE_URL'), api_key=os.environ['OPENAI_API_KEY'])
    print('✅ Provider ready:', os.environ.get('OPENAI_BASE_URL') or client.base_url)
except Exception as e:
    print('⚠️ Provider setup failed:', e)


In [None]:
# 🔎 Provider Smoke Test (1-token)
# Sends a single "ping" message with max_tokens=1 to check that the client
# created by the provider-setup cell can reach the endpoint.
import os

model = os.environ.get('ALAIN_MODEL') or 'gpt-4o-mini'

if 'client' in globals():
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        print('✅ Smoke OK:', resp.choices[0].message.content)
    except Exception as e:
        print('⚠️ Smoke test failed:', e)
else:
    print('⚠️ Provider client not available; skipping smoke test')


## Local Transformer Runtime Tips

- Install the optimized stack with `pip install -U transformers kernels accelerate triton` (PyTorch >= 2.8 already bundles Triton 3.4).
- Load GPT-OSS with downloadable kernels to compare bf16 vs MXFP4 memory usage:
```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    dtype="auto",
    device_map="auto",
    use_kernels=True,
)
```
- Hopper GPUs can enable Flash Attention 3 sinks via `attn_implementation="kernels-community/vllm-flash-attn3"`.
- If MXFP4 kernels are unavailable, Transformers automatically falls back to bf16; monitor VRAM and throughput to pick the best mode.


# Magistral‑Small‑2509 Introduction

A gentle, analogy‑driven walk‑through of the small Magistral model, covering installation, basic usage, and licensing. Designed for absolute beginners who want to see a model in action without deep coding knowledge.

## Learning Objectives

By the end of this tutorial, you will be able to:

1. Explain what the Magistral‑Small‑2509 model is and why it’s useful.
2. Show how to install the required libraries and launch the model with vLLM.
3. Demonstrate how to call the model from a simple Python client.
4. Clarify the Apache‑2.0 license and proper attribution practices.


## Prerequisites

- Basic familiarity with a terminal or command prompt.
- Anaconda/Miniconda or a recent Python 3.10+ installation.


## Setup

Let's install the required packages and set up our environment.


In [None]:
# Install packages (Colab-compatible)
# A single code path is used for Colab and local kernels: pip is invoked via
# the kernel's own interpreter with an argument list, so no shell is involved.
# (The previous `!pip install -q ipywidgets>=8.0.0 ...` form let the shell
# treat `>=8.0.0` as an output redirection.)
import subprocess
import sys

# Check if we're in Colab
IN_COLAB = 'google.colab' in sys.modules

cmd = [sys.executable, "-m", "pip", "install"]
if IN_COLAB:
    cmd.append("-q")  # keep Colab output quiet, as before
cmd += ["ipywidgets>=8.0.0", "vllm", "mistralai", "transformers"]

result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
    # Show the tail of pip's error output in the cell before failing.
    print(result.stderr[-2000:])
    result.check_returncode()  # raises subprocess.CalledProcessError

print('✅ Packages installed!')


In [None]:
# Ensure ipywidgets is installed for interactive MCQs
try:
    import ipywidgets  # type: ignore
    print('ipywidgets available')
except Exception:
    import shlex, subprocess, sys
    requirement = 'ipywidgets>=8.0.0'
    cmd = [sys.executable, "-m", "pip", "install", '-q', requirement]
    try:
        subprocess.check_call(cmd)
    except Exception:
        # Fallback: inside Colab, retry through the %pip magic. Colab is
        # detected directly so this cell does not depend on IN_COLAB having
        # been defined by an earlier cell.
        ip = None
        if 'google.colab' in sys.modules:
            import IPython
            ip = IPython.get_ipython()
        if ip is None:
            raise  # no alternative installer available; surface the pip error
        try:
            # shlex.quote keeps the shell from reading `>=8.0.0` as a redirection.
            ip.run_line_magic('pip', 'install -q ' + shlex.quote(requirement))
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise


Welcome to the first step of our journey into large language models. Think of a language model as a very sophisticated autocomplete that has read billions of words and can predict what comes next in a sentence. It works by learning statistical patterns in text: the probability of a word given the words that precede it. In practice, a language model takes a prompt, tokenizes it into sub‑word units, feeds those tokens through a deep neural network, and outputs a probability distribution over the next token. By sampling from that distribution repeatedly, the model can generate coherent paragraphs, code, or even poetry.

A key concept is the *token*. Tokens are the smallest units the model processes; they can be characters, sub‑words, or whole words. Modern models use Byte‑Pair Encoding (BPE) or similar sub‑word tokenizers to balance vocabulary size and expressiveness. Training a language model involves minimizing the cross‑entropy loss between the model’s predictions and the actual next token in a massive corpus. The result is a model that can generalize to new prompts, even if it has never seen them before. In this notebook we will see how to load such a model, run inference locally, and understand the parameters that control its behavior.

We’ll start with a quick environment check, then move on to a simple inference demo. By the end of this section you’ll understand what a language model is, how it works under the hood, and how to get it running on your machine.

---

**Quick verification steps**:

1. Run the environment verification cell below.
2. After it finishes, you should see the installed library versions and whether a CUDA device is available.
3. If you see "No CUDA device detected", the demo will fall back to CPU.
4. Once the environment is verified, the inference demo will generate a short story fragment.

Feel free to experiment with the prompt or generation parameters in the demo cell.

---

In the next step we’ll install the required packages and set up the environment for running the model locally.

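Hugging Face libraries cache downloaded checkpoints under the directory named by the `HF_HOME` environment variable (default: `~/.cache/huggingface`). Set `HF_HOME` before importing `transformers` — for example, `export HF_HOME=/path/with/free/space` — to control where checkpoints are stored.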


In this section we also discuss the difference between running a model locally versus using a hosted API like Poe. A local run gives you full control over the model, no network latency, and no usage limits, but it requires a compatible GPU and sufficient VRAM. Hosted APIs abstract away the hardware, provide instant access, and often include safety filters, but they incur network costs and may have rate limits.

When you run locally, you must manage environment variables such as `HF_HOME` (where Hugging Face caches models) and `HF_TOKEN` (if the model is private). Setting `HF_HOME` to a directory with ample disk space ensures that large checkpoints do not clutter your home directory. For GPU drivers, make sure you have CUDA 12.1 or later and the corresponding cuDNN version; mismatched drivers are a common source of errors.

If you prefer a quick start, the hosted API route is simpler: you just send a prompt and receive a response. However, for reproducibility and privacy, a local setup is preferable. The code cells below will show you how to verify your environment and run a small inference demo.

---

**Troubleshooting checklist**:

- **GPU memory**: If you get an out‑of‑memory error, reduce `max_new_tokens` or use a smaller batch.
- **Driver mismatch**: Verify `nvcc --version` matches the CUDA toolkit used by PyTorch.
- **Tokenizer errors**: Ensure the tokenizer name matches the model name; mismatches can lead to decoding errors.

---

We’ll now dive into the code cells that perform these checks and demonstrate inference.


### Next up
Next, we move on to **Install the Required Packages**. Before continuing, verify that the environment verification cell prints GPU details, that the inference demo returns coherent text, and that telemetry has been captured for Harmony playback. Bring any questions about model configuration to the next section so we can compare advanced scheduling options.


### License and Model Details

- **Model ID:** `mistralai/Magistral-Small-2509`
- **License:** Apache-2.0 (commercial and non-commercial use permitted); see the model card for terms and attribution.
- **Context Window:** 128k (performance may degrade past ~40k; keep max length unless you observe slowdowns).
- **Reasoning:** Uses special THINK tokens; system prompt provided in `SYSTEM_PROMPT.txt` (loaded via `hf_hub_download`).


In [None]:
# Environment verification
# Pinning versions ensures reproducibility
import subprocess, sys

def install(package):
    """Install one pip requirement into the running kernel's environment.

    Runs ``<kernel python> -m pip install <package>``. If that fails while
    running inside Google Colab, the install is retried once through the
    IPython ``%pip`` magic.

    Args:
        package: A pip requirement string, e.g. ``"torch==2.2.0"``.

    Raises:
        Exception: The original pip error (typically
            ``subprocess.CalledProcessError``) when not in Colab or when no
            IPython shell is available; otherwise whatever the ``%pip``
            fallback raises.
    """
    import shlex

    cmd = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(cmd)
    except Exception:
        # Detect Colab directly so the function does not rely on a global
        # IN_COLAB defined by another cell.
        ip = None
        if 'google.colab' in sys.modules:
            import IPython
            ip = IPython.get_ipython()
        if ip is None:
            raise
        try:
            # %pip runs through a shell; quote the requirement so characters
            # such as `>` or `<` in a specifier are not treated as redirections.
            ip.run_line_magic('pip', 'install ' + shlex.quote(package))
        except Exception as colab_exc:
            print('⚠️ Colab pip fallback failed:', colab_exc)
            raise

# Requirements for the local demo. Each one is installed only when its
# top-level module cannot be imported; an already-installed copy is left
# as-is even if its version differs from the pin.
packages = [
    "transformers==4.41.2",
    "accelerate==0.28.0",
    "torch==2.2.0",
    "ipywidgets==8.1.2",
]

for pkg in packages:
    module_name = pkg.split("==")[0]  # requirement name doubles as the import name here
    try:
        __import__(module_name)
    except ImportError:
        install(pkg)

# Verify imports
import torch
from transformers import __version__ as transformers_version
from accelerate import __version__ as accelerate_version
import ipywidgets

# Report the library versions that were actually imported.
print(f"torch version: {torch.__version__}")
print(f"transformers version: {transformers_version}")
print(f"accelerate version: {accelerate_version}")
print(f"ipywidgets version: {ipywidgets.__version__}")

# GPU status
if not torch.cuda.is_available():
    print("No CUDA device detected. Using CPU.")
else:
    device = torch.device("cuda")
    props = torch.cuda.get_device_properties(device)
    print(f"CUDA device: {props.name}")
    # total_memory is in bytes; convert to GiB for display
    print(f"Total memory: {props.total_memory / (1024**3):.2f} GB")


In [None]:
# Inference demo with gpt-oss-20b
import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set deterministic seed for reproducibility (torch, numpy and stdlib RNGs)
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Use the full Hub repository id (organization/name), matching the
# "Local Transformer Runtime Tips" snippet earlier in this notebook.
model_name = "openai/gpt-oss-20b"

# Load tokenizer and model (half precision on GPU, full precision on CPU)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Simple prompt
prompt = "Once upon a time in a distant galaxy,"

# Encode prompt; keep the attention mask so generate() does not have to guess
# it (pad_token_id is set to the EOS id below).
encoded = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = encoded.input_ids

# Generate text
with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=50,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)


## Knowledge Check (Interactive)

Use the widgets below to select an answer and click Grade to see feedback.


In [None]:
# MCQ helper (ipywidgets)
import ipywidgets as widgets
from IPython.display import display, Markdown

def render_mcq(question, options, correct_index, explanation):
    """Display an interactive multiple-choice question with a Grade button.

    Args:
        question: Question text; rendered as a level-3 Markdown heading.
        options: Sequence of answer texts. Each is labelled "A. ", "B. ", ...
            unless it already starts with that prefix.
        correct_index: Zero-based index of the correct option.
        explanation: Text shown after grading. It is HTML-escaped, so literal
            text such as ``<model_name>`` is displayed rather than being
            parsed as a tag.
    """
    import html

    cleaned = []
    for idx, raw in enumerate(options):
        text = str(raw or "").strip()
        prefix = f"{chr(65+idx)}. "
        if not text.lower().startswith(prefix.lower()):
            text = prefix + text
        cleaned.append(text)
    # value=None starts with nothing selected; otherwise the first option is
    # pre-selected and the "please select" branch below can never run.
    rb = widgets.RadioButtons(
        options=[(label, idx) for idx, label in enumerate(cleaned)],
        value=None,
        description="",
    )
    grade_btn = widgets.Button(description='Grade', button_style='primary')
    feedback = widgets.HTML(value='')
    safe_explanation = html.escape(str(explanation))

    def on_grade(_):
        sel = rb.value
        if sel is None:
            feedback.value = '<p>⚠️ Please select an option.</p>'
            return
        if sel == correct_index:
            verdict = '<p>✅ Correct!</p>'
        else:
            verdict = f'<p>❌ Incorrect. Correct answer is {chr(65+correct_index)}.</p>'
        feedback.value = verdict + f'<div><em>Explanation:</em> {safe_explanation}</div>'

    grade_btn.on_click(on_grade)
    display(Markdown('### ' + question))
    display(rb)
    display(grade_btn)
    display(feedback)


### Knowledge Check – Step 1: Welcome & What Is a Language Model?


In [None]:
# Knowledge check: which CLI subcommand starts the vLLM server.
render_mcq(
    question="Which command is used to start the vLLM server for Magistral‑Small‑2509?",
    options=[
        "vllm serve mistralai/Magistral-Small-2509",
        "vllm start mistralai/Magistral-Small-2509",
        "vllm run mistralai/Magistral-Small-2509",
        "vllm launch mistralai/Magistral-Small-2509",
    ],
    correct_index=0,
    explanation="The correct syntax is `vllm serve <model_name>`. The other options are not valid vLLM commands.",
)


In [None]:
# Knowledge check: effect of the sampling temperature.
render_mcq(
    question="What does a temperature of 0.7 do in text generation?",
    options=[
        "Makes the output deterministic.",
        "Makes the output more random.",
        "Limits the length of the output.",
        "Enforces strict adherence to the prompt.",
    ],
    correct_index=1,
    explanation="Temperature controls randomness; 0.7 is a moderate value that balances creativity and coherence.",
)


In [None]:
# Knowledge check: Apache-2.0 redistribution requirement.
render_mcq(
    question="Under the Apache‑2.0 license, which of the following is required when redistributing the model?",
    options=[
        "You must pay a royalty fee.",
        "You must include the original license text.",
        "You must provide a link to the source code.",
        "You must rename the model.",
    ],
    correct_index=1,
    explanation="Apache‑2.0 requires that the license text be included with any redistribution.",
)


In [None]:
# Knowledge check: GPU memory pitfall when serving with vLLM.
render_mcq(
    question="Which of these is a common pitfall when running vLLM on a GPU with limited memory?",
    options=[
        "Using a very high batch size.",
        "Setting temperature to 0.",
        "Running the server in CPU mode.",
        "Using the wrong model name.",
    ],
    correct_index=0,
    explanation="Large batch sizes can exhaust GPU memory; adjust batch size or use CPU mode if necessary.",
)


## 🔧 Troubleshooting Guide

### Common Issues:

1. **Out of Memory Error**
   - Enable GPU: Runtime → Change runtime type → GPU
   - Restart runtime if needed

2. **Package Installation Issues**
   - Restart runtime after installing packages
   - Use `!pip install -q` for quiet installation

3. **Model Loading Fails**
   - Check internet connection
   - Verify authentication tokens
   - Try CPU-only mode if GPU fails
