# WebLLM from Python (JupyterLite)

This notebook runs entirely in the browser with JupyterLite. It shows how to spin up a WebLLM worker and await responses from Python code running on Pyodide.

The first cell loads the WebLLM library and registers a helper on the page.
The helper hides the worker boilerplate so the rest of the notebook can focus on agent logic.

In [None]:
import js

# Bootstrap cell: evaluate an async JavaScript IIFE in the page and await the
# promise it returns, so the cell only finishes once the helpers exist.
#
# What the JavaScript does:
#   * The `if (!globalThis.webllmHelpers)` guard makes re-running this cell a
#     no-op once the helpers have been registered.
#   * `workerSource` is the body of a module worker that forwards every message
#     to a `WebWorkerMLCEngineHandler`; it is wrapped in a Blob and exposed via
#     an object URL (`workerURL`).
#   * `createEngine(model, options)` starts a new module worker from that URL
#     and returns `module.CreateWebWorkerMLCEngine(worker, model, options)`.
#   * `chat(engine, request)` returns `engine.chat.completions.create(request)`.
#   * `stream(engine, request, callbacks)` forces `stream: true`, calls
#     `callbacks.onChunk` (if present) for every chunk, and finally returns
#     `engine.getMessage()`.
#
# The `\"` sequences inside the (non-raw) Python string are plain `"` by the
# time the JavaScript is evaluated.
#
# NOTE(review): `options` and `callbacks` are read with ordinary JS property
# access. A Python dict handed over as-is arrives as a PyProxy, whose keys are
# presumably not visible as properties -- callers should pass real JS objects
# (e.g. via `to_js(..., dict_converter=js.Object.fromEntries)`); confirm
# against Pyodide's type-translation docs.
await js.eval("""
(async () => {
  if (!globalThis.webllmHelpers) {
    const workerSource = `
      import { WebWorkerMLCEngineHandler } from \"https://esm.run/@mlc-ai/web-llm\";
      const handler = new WebWorkerMLCEngineHandler();
      self.onmessage = (msg) => handler.onmessage(msg);
    `;
    const module = await import("https://esm.run/@mlc-ai/web-llm");
    const workerURL = URL.createObjectURL(new Blob([workerSource], { type: "application/javascript" }));
    globalThis.webllmHelpers = {
      module,
      workerURL,
      async createEngine(model, options = {}) {
        const worker = new Worker(workerURL, { type: "module" });
        return await module.CreateWebWorkerMLCEngine(worker, model, options);
      },
      async chat(engine, request) {
        return await engine.chat.completions.create(request);
      },
      async stream(engine, request, callbacks = {}) {
        const stream = await engine.chat.completions.create({ ...request, stream: true });
        for await (const chunk of stream) {
          callbacks.onChunk?.(chunk);
        }
        return await engine.getMessage();
      }
    };
    console.log("WebLLM helpers initialised.");
  }
})();
""")


Next we wrap the helper in Python-friendly functions.
`load_engine` pulls a model into a dedicated worker, `ask` runs a standard completion, and `stream` demonstrates token streaming.

In [None]:
import js
from pyodide.ffi import create_proxy, to_js

# Progress-callback proxies created by load_engine(). They are kept here on
# purpose: the callback is handed to JavaScript, so the proxy has to stay
# valid while the engine may still report progress. Call `.destroy()` on an
# entry once its engine is no longer needed.
progress_proxies = []

async def load_engine(model="SmolLM2-360M-Instruct-q0f16-MLC"):
    """Create a WebLLM engine for *model* via ``js.webllmHelpers.createEngine``.

    Loading progress reported by the engine is printed line by line.

    Args:
        model: Model identifier passed through to the JavaScript helper.

    Returns:
        The engine object returned by the JavaScript helper (a JS proxy).
    """
    helpers = js.webllmHelpers

    def handle_progress(report):
        # The report arrives as a JS object; convert it so it can be read
        # like a dict and print only non-empty status text.
        data = report.to_py()
        text = data.get("text", "")
        if text:
            print(text)

    progress_proxy = create_proxy(handle_progress)
    progress_proxies.append(progress_proxy)

    # Convert the options into a plain JS object. A Python dict passed as-is
    # reaches JavaScript as a PyProxy, and `options.initProgressCallback`
    # (property access) does not look up dict keys -- the callback would
    # silently never be registered.
    options = to_js(
        {"initProgressCallback": progress_proxy},
        dict_converter=js.Object.fromEntries,
    )
    engine = await helpers.createEngine(model, options)
    return engine

async def ask(engine, prompt):
    """Run one non-streaming chat completion and return the reply text.

    Args:
        engine: Engine object as returned by ``load_engine``.
        prompt: The user message to send.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    system_turn = {
        "role": "system",
        "content": "You are a concise assistant running inside a JupyterLite notebook.",
    }
    user_turn = {"role": "user", "content": prompt}

    # Nested dicts are converted with Object.fromEntries so the JS side
    # receives plain objects.
    payload = to_js(
        {
            "messages": [system_turn, user_turn],
            "temperature": 0.2,
            "max_tokens": 128,
        },
        dict_converter=js.Object.fromEntries,
    )

    completion = (await js.webllmHelpers.chat(engine, payload)).to_py()
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

async def stream(engine, prompt):
    """Stream a chat completion, printing text as it arrives.

    Args:
        engine: Engine object as returned by ``load_engine``.
        prompt: The user message to send.

    Returns:
        The concatenation of all streamed text fragments.
    """
    chunks = []

    def on_chunk(chunk):
        payload = chunk.to_py()
        # With `include_usage` the stream ends with a usage-only chunk whose
        # `choices` list is empty; skip it instead of indexing into it.
        choices = payload.get("choices") or []
        if not choices:
            return
        # `delta` / `content` may be missing or null (e.g. role-only deltas).
        delta = choices[0].get("delta") or {}
        text = delta.get("content") or ""
        if text:
            chunks.append(text)
            print(text, end="")

    chunk_proxy = create_proxy(on_chunk)
    try:
        request = {
            "messages": [
                {"role": "system", "content": "Stream tokens like a story narrator."},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.6,
            "max_tokens": 128,
            "stream": True,
            "stream_options": {"include_usage": True}
        }
        js_request = to_js(request, dict_converter=js.Object.fromEntries)
        # The callbacks must be a plain JS object: the helper reads
        # `callbacks.onChunk` as a property, which does not see the keys of a
        # Python dict passed as a PyProxy -- no chunk would ever be delivered.
        js_callbacks = to_js(
            {"onChunk": chunk_proxy},
            dict_converter=js.Object.fromEntries,
        )
        await js.webllmHelpers.stream(engine, js_request, js_callbacks)
    finally:
        # Release the proxy even when the JS call raises.
        chunk_proxy.destroy()
    print()
    return "".join(chunks)


Load a small chat model. The first run will download model weights into the browser cache.

In [None]:
# Load the default model; `engine` is reused by the cells below.
engine = await load_engine()
# Bare expression so the notebook shows the engine object as the cell result.
engine

Ask a question and display the assistant's reply.

In [None]:
# The awaited return value (the reply text) is displayed as the cell result.
await ask(engine, "Summarize why running language models directly in the browser is exciting.")

Streaming also works from Python:
TODO: Streaming does not actually work from Python yet, so fix it.

In [None]:
# stream() is meant to print text fragments as they arrive; its return value
# (the joined text) is displayed as the cell result.
await stream(engine, "Compose a short haiku about notebooks that talk back.")

In [None]:
# Same prompt through the non-streaming ask() path.
# NOTE(review): presumably kept as a comparison/fallback for the streaming
# cell above -- confirm whether it is still needed.
print(await ask(engine, "Compose a short haiku about notebooks that talk back."))