<a href="https://colab.research.google.com/github/giuseppefutia/cdl2025/blob/master/vLLM_%2B_Gemma3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation, Importing, Configuration

In [None]:
%%capture
# Install the packages this notebook relies on; -qq keeps pip output minimal.
!pip install -qq fastapi uvicorn
!pip install -qq vllm
!pip install -qq pyngrok

In [None]:
from __future__ import annotations

# Standard library
import os
import socket
import subprocess
import sys
import textwrap
import time
from contextlib import contextmanager
from dataclasses import dataclass
from getpass import getpass
from typing import Iterable, Generator, Optional, Sequence

# Third-party
import torch
from pyngrok import ngrok
from fastapi import FastAPI
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from openai import OpenAI
from huggingface_hub import login
from google.colab import userdata
import uvicorn

In [None]:
# Both tokens must already be saved in the Colab "Secrets" section.
HF_TOKEN, NGROK_TOKEN = (
    userdata.get(secret_name) for secret_name in ("HF_TOKEN", "NGROK_TOKEN")
)

# Expose the tokens through the process environment as well.
os.environ.update({"HF_TOKEN": HF_TOKEN, "NGROK_TOKEN": NGROK_TOKEN})

# Authenticate against the Hugging Face Hub.
login(HF_TOKEN)

In [None]:
# Register the ngrok auth token with the ngrok CLI before any tunnel is opened.
!ngrok config add-authtoken "$NGROK_TOKEN"

In [None]:
# Model identifier handed to `vllm serve` and used as the `model` of chat requests.
MODEL = 'google/medgemma-4b-it' # @param {type:'string'}
# Host and port passed to `vllm serve` via --host / --port.
VLLM_HOST = '0.0.0.0' # @param {type:'string'}
VLLM_PORT = 8000 # @param {type:'integer'}
# Host and port the FastAPI proxy (uvicorn) listens on.
API_HOST = '127.0.0.1' # @param {type:'string'}
API_PORT = 8001 # @param {type:'integer'}
# Value passed to `vllm serve` via --max-model-len.
MAX_MODEL_LEN = 131072 # @param {type:'integer'}
# Smaller alternative kept for reference: MAX_MODEL_LEN = 8192
# Value passed to `vllm serve` via --tensor-parallel-size.
TENSOR_PARALLEL_SIZE = 1 # @param {type:'integer'}

# Model Deployment with vLLM

In [None]:
# ----------------------------
# Config (simple, safe defaults for Colab)
# ----------------------------
from dataclasses import dataclass
from typing import Optional, Sequence, Generator, Tuple, List
import subprocess, time, socket, torch
from contextlib import contextmanager
from threading import Thread

@dataclass
class VLLMConfig:
    """Settings for launching a vLLM server and for reaching it afterwards.

    Defaults come from the notebook-level parameters (MODEL, VLLM_HOST, ...).
    """

    # Model identifier given to `vllm serve`.
    model: str = MODEL
    # Address the server binds to; may be the wildcard "0.0.0.0".
    host: str = VLLM_HOST
    # TCP port the server listens on.
    port: int = VLLM_PORT
    # Forwarded as --max-model-len.
    max_model_len: int = MAX_MODEL_LEN
    # Forwarded as --tensor-parallel-size.
    tensor_parallel_size: int = TENSOR_PARALLEL_SIZE
    # When true, --trust-remote-code is added to the command line.
    # NOTE(review): this lets model repositories run their own code; confirm
    # it is really required for the chosen model.
    trust_remote_code: bool = True
    # Optional --download-dir value; omitted from the command when None/empty.
    download_dir: Optional[str] = None

    @property
    def base_url(self) -> str:
        """Return an HTTP URL a client can connect to.

        "0.0.0.0" is a bind address ("all interfaces"), not a portable
        destination, so it is replaced by the loopback address here.
        """
        reachable_host = "127.0.0.1" if self.host == "0.0.0.0" else self.host
        return f"http://{reachable_host}:{self.port}"


# ----------------------------
# Small helpers
# ----------------------------

def select_dtype() -> str:
    """Choose the dtype string for vLLM based on the first CUDA device.

    Returns:
        "float32" when CUDA is unavailable; otherwise "bfloat16" if the major
        compute capability of device 0 is at least 8, else "float16".
    """
    if not torch.cuda.is_available():
        return "float32"

    capability_major = torch.cuda.get_device_capability(0)[0]
    if capability_major >= 8:
        return "bfloat16"
    return "float16"

def wait_for_port(host: str, port: int, timeout: float = 120.0, interval: float = 0.5) -> bool:
    """Poll a TCP port until it accepts connections or the timeout elapses.

    Args:
        host: IPv4 address or hostname to connect to.
        port: TCP port to probe.
        timeout: Maximum number of seconds to keep trying. The port is always
            probed at least once, even when this is zero or negative.
        interval: Pause in seconds between two probes.

    Returns:
        True as soon as a connection succeeds, False once the timeout is up.

    Raises:
        OSError: for failures that `connect_ex` does not report as an error
            code (for example an unresolvable host name).
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1.0)
            if probe.connect_ex((host, port)) == 0:
                return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(interval, remaining))

def build_args(cfg: VLLMConfig, dtype: str) -> Sequence[str]:
    """Assemble the `vllm serve` command line for the given configuration.

    Args:
        cfg: Configuration providing model, host, port and the optional flags.
        dtype: Value for the --dtype option.

    Returns:
        The argument list, starting with "vllm", "serve" and the model name.
    """
    command = ["vllm", "serve", cfg.model]

    # Options that are always present, in the order they appear on the command line.
    valued_options = {
        "--dtype": dtype,
        "--max-model-len": str(cfg.max_model_len),
        "--tensor-parallel-size": str(cfg.tensor_parallel_size),
        "--host": cfg.host,
        "--port": str(cfg.port),
    }
    for flag, value in valued_options.items():
        command.extend((flag, value))

    # Optional extras.
    if cfg.trust_remote_code:
        command.append("--trust-remote-code")
    if cfg.download_dir:
        command.extend(("--download-dir", cfg.download_dir))
    return command

# ----------------------------
# Log streaming helpers
# ----------------------------

def _pump(pipe, tag: str, sink: Optional[List[str]] = None):
    """Forward every line of a subprocess pipe to stdout until end of stream.

    Args:
        pipe: Text stream exposing `readline()` and `close()`.
        tag: Label inserted into each printed/stored line (e.g. "OUT", "ERR").
        sink: Optional list that additionally receives "[tag] line" entries.

    Any error raised while reading is reported on stdout rather than raised,
    and the pipe is closed on the way out in all cases.
    """
    try:
        while True:
            chunk = pipe.readline()
            if not chunk:
                # An empty read marks the end of the stream.
                break
            print(f"[vLLM {tag}] {chunk}", end="")   # stream to cell output
            if sink is not None:
                sink.append(f"[{tag}] {chunk}")
    except Exception as e:
        print(f"[vLLM LOG ERROR {tag}] {e}")
    finally:
        # Closing is best-effort; a failure here is deliberately ignored.
        try:
            pipe.close()
        except Exception:
            pass

class VLLMProcess:
    """Handle on the vLLM subprocess together with the log lines it has emitted.

    On construction, one daemon thread per pipe (stdout and stderr) is started;
    each runs `_pump`, which prints the lines and appends them to `logs`.
    """

    def __init__(self, proc: subprocess.Popen):
        self.proc = proc
        self.logs: List[str] = []

        # Create both pump threads first, then start them.
        self._t_out, self._t_err = (
            Thread(target=_pump, args=(stream, label, self.logs), daemon=True)
            for stream, label in ((proc.stdout, "OUT"), (proc.stderr, "ERR"))
        )
        for worker in (self._t_out, self._t_err):
            worker.start()

    @property
    def pid(self) -> int:
        """Process id of the wrapped subprocess."""
        return self.proc.pid

    def wait(self, timeout: Optional[float] = None) -> int:
        """Wait for the subprocess to exit and return its exit code.

        Delegates to `Popen.wait`, including its behaviour on timeout.
        """
        exit_code = self.proc.wait(timeout=timeout)
        return exit_code

# ----------------------------
# Start/stop + context manager
# ----------------------------

def start_vllm(cfg: VLLMConfig) -> VLLMProcess:
    """Launch `vllm serve` as a child process and wrap it in a VLLMProcess.

    The dtype comes from `select_dtype()` and the command line from
    `build_args()`. The call returns right after the process is spawned; it
    does not wait for the server to accept connections.
    """
    command = build_args(cfg, select_dtype())

    popen_options = {
        # Both pipes are captured in text mode.
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
        "text": True,
        "encoding": "utf-8",
        "errors": "replace",
        # Run the child in a session of its own.
        "start_new_session": True,
        # Line-buffered for timely streaming.
        "bufsize": 1,
    }
    child = subprocess.Popen(command, **popen_options)
    return VLLMProcess(child)

def _force_kill(proc: subprocess.Popen) -> None:
    """SIGKILL the process, and its whole process group when it leads one."""
    import os
    import signal

    if hasattr(os, "killpg"):
        try:
            pgid = os.getpgid(proc.pid)
        except OSError:
            pgid = None
        # Signal the group only when the child leads a group of its own (as it
        # does when spawned with start_new_session=True) and that group is not
        # ours, so the notebook kernel can never be hit.
        if pgid is not None and pgid == proc.pid and pgid != os.getpgrp():
            os.killpg(pgid, signal.SIGKILL)
            return
    proc.kill()


def stop_vllm(p: Optional[VLLMProcess]) -> None:
    """Stop the vLLM subprocess; a no-op when `p` is None.

    Sends a terminate request and waits up to 10 seconds. If the process is
    still alive it is force-killed (with its process group when it leads one,
    so worker processes are not left behind) and then reaped.

    Stopping stays best-effort: failures are printed, never raised, so this is
    safe to call from a `finally` block.
    """
    if not p:
        return
    proc = p.proc
    try:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            _force_kill(proc)
            # Reap the child so it does not linger as a zombie.
            proc.wait(timeout=10)
    except Exception as exc:
        print(f"[vLLM] could not stop pid={proc.pid}: {exc}")

@contextmanager
def run_vllm(
    cfg: VLLMConfig,
    startup_timeout: float = 10000.0,
) -> Generator[tuple[str, VLLMProcess], None, None]:
    """Start vLLM, wait until its port is reachable, and stop it on exit.

    Args:
        cfg: Server configuration.
        startup_timeout: Maximum number of seconds to wait for the port to open.

    Yields:
        A `(base_url, process)` pair, where `process` is the `VLLMProcess`
        wrapper around the running server.

    Raises:
        RuntimeError: if the server process exits before the port opens, or
            if the port is still closed after `startup_timeout` seconds.
    """
    proc = start_vllm(cfg)
    try:
        # Wait in short slices so a crashed server is noticed promptly
        # instead of polling a dead port until the full timeout.
        poll_slice = 5.0
        deadline = time.monotonic() + startup_timeout
        ready = False
        while not ready:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            ready = wait_for_port(cfg.host, cfg.port, timeout=min(poll_slice, remaining))
            if not ready and proc.proc.poll() is not None:
                raise RuntimeError(
                    f"vLLM exited during startup (exit code {proc.proc.returncode})"
                )
        if not ready:
            raise RuntimeError("vLLM didn't become ready on time. (Port check failed)")
        print(f"vLLM up at {cfg.base_url} (pid={proc.pid})")
        yield (cfg.base_url, proc)
    finally:
        # Single cleanup point for both the failure and the normal path.
        stop_vllm(proc)

In [None]:
# Smoke test: start vLLM, send one chat completion, then shut the server down.
cfg = VLLMConfig(model=MODEL, port=VLLM_PORT)
with run_vllm(cfg) as (base_url, proc):
    client = OpenAI(
        api_key="EMPTY",                              # vLLM ignores auth by default
        base_url=f"http://localhost:{cfg.port}/v1",   # follow the configured port
    )
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "user", "content": "Who are you?"}
        ],
        temperature=0,
    )
    print("Completion result:", resp.choices[0].message.content)

[vLLM OUT] INFO 11-16 15:46:22 [__init__.py:216] Automatically detected platform cuda.
[vLLM ERR] 2025-11-16 15:46:24.405135: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[vLLM ERR] 2025-11-16 15:46:24.422473: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[vLLM ERR] E0000 00:00:1763307984.444106    2419 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[vLLM ERR] E0000 00:00:1763307984.450608    2419 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[vLLM ERR] W

In [None]:
# Expose the vLLM OpenAI-compatible endpoint publicly through an ngrok tunnel.
cfg = VLLMConfig()
proc = start_vllm(cfg)
try:
    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if not wait_for_port(cfg.host, cfg.port, timeout=300):
        raise RuntimeError("vLLM did not open the port")
    print(f"vLLM up at {cfg.base_url}")

    HOSTNAME = "chatcompletion-uncognizable-nilda.ngrok-free.dev"
    public_url = ngrok.connect(cfg.port, "http", domain=HOSTNAME).public_url
    public_v1 = public_url.rstrip("/") + "/v1"
    print("Public endpoint:", public_v1)

    # Quick in-notebook sanity check
    import requests
    r = requests.get(cfg.base_url + "/v1/models", timeout=10)
    print("Local /v1/models:", r.status_code)

    print("Serving... (leave this cell running)")
    # Block on the server process rather than spinning in a busy loop: this
    # uses no CPU and ends the cell if vLLM exits by itself. Interrupting the
    # cell still reaches the cleanup below.
    exit_code = proc.wait()
    print(f"vLLM exited with code {exit_code}")

finally:
    # Best-effort teardown of the tunnel, then of the server.
    try:
        ngrok.kill()
    except Exception:
        pass
    stop_vllm(proc)

# Model Functions

In [None]:
def make_openai_client(base_url: str, api_key: str = "EMPTY") -> OpenAI:
    """
    Build an OpenAI SDK client pointed at `base_url`.

    vLLM serves an OpenAI-compatible API; unless it was started with --api-key
    it accepts any token, so the placeholder 'EMPTY' is used by default.
    """
    sdk_client = OpenAI(base_url=base_url, api_key=api_key)
    return sdk_client


def ask_model_sync(
    client: OpenAI,
    model: str,
    messages: Iterable[dict],
    **kwargs,
) -> str:
    """
    Send one chat completion request and return the reply text.

    The call blocks until the full response is available. Extra keyword
    arguments (temperature, max_tokens, ...) are forwarded unchanged.
    An empty string is returned when the first choice has no content.
    """
    request = {"model": model, "messages": list(messages), **kwargs}
    completion = client.chat.completions.create(**request)

    # Only the first OpenAI-style choice is used.
    reply_text = completion.choices[0].message.content
    return reply_text if reply_text else ""


def stream_model(
    client: OpenAI,
    model: str,
    messages: Iterable[dict],
    **kwargs,
) -> Generator[str, None, None]:
    """
    Stream a chat completion and yield each non-empty text fragment.

    The request is sent with stream=True; extra keyword arguments
    (temperature, max_tokens, ...) are forwarded unchanged. Chunks that carry
    no choices, no delta, or no content are skipped. The generator can be
    wrapped in FastAPI's StreamingResponse for SSE-style delivery.
    """
    stream = client.chat.completions.create(
        model=model,
        messages=list(messages),
        stream=True,
        **kwargs,
    )
    for event in stream:
        # A chunk may have a missing or empty `choices` list; indexing it
        # blindly would raise IndexError, so such chunks are skipped.
        choices = getattr(event, "choices", None)
        if not choices:
            continue
        delta = getattr(choices[0], "delta", None)
        content = getattr(delta, "content", None)
        if content:
            yield content
    # No end-of-stream marker is emitted here; SSE callers add their own.

In [None]:
cfg = VLLMConfig(model=MODEL, port=VLLM_PORT)
with run_vllm(cfg) as (base_url, proc):
    # Build the client here so this cell does not depend on a `client`
    # variable left behind by another cell.
    client = make_openai_client(f"http://localhost:{cfg.port}/v1")

    # Test
    answer = ask_model_sync(
        client,
        model=cfg.model,
        messages=[{"role": "user", "content": "Who are you? Are you ready?"}],
        temperature=0,
        max_tokens=128,
    )
    print("Answer:", answer)

    # Test streaming
    print("\nStreaming:")
    for tok in stream_model(
        client,
        model=cfg.model,
        messages=[{"role": "user", "content": "Who are you? Give me details. Are you ready?"}],
        temperature=0,
    ):
        print(tok, end="", flush=True)
    print("\n[stream done]")

vLLM up at http://0.0.0.0:8000 (pid=2532)
Answer: I am Gemma, an open-weights AI assistant. I am a large language model trained by Google DeepMind. I am ready to assist you.


Streaming:
I am Gemma, an open-weights AI assistant. I am a large language model trained by Google DeepMind. I am available as an open-weights model, which means my underlying code and parameters are accessible to the public. I am ready to assist you.

[stream done]


# FastAPI Service

In [None]:
# JSON request body accepted by the /ask and /stream endpoints.
class QuestionRequest(BaseModel):
    # Text forwarded to the model as the single user message.
    question: str

def _sse_event(payload: str) -> str:
    """Frame `payload` as one SSE event, with one `data:` line per text line."""
    # Model output can contain newlines. A raw newline inside a `data:` line
    # would end the field early, so every line gets its own `data:` prefix;
    # SSE clients join them with "\n" again.
    normalized = payload.replace("\r\n", "\n").replace("\r", "\n")
    return "".join(f"data: {line}\n" for line in normalized.split("\n")) + "\n"


def build_app(
    client: OpenAI,
    model_name: str,
    allow_origins: Sequence[str] = ("*",),
) -> FastAPI:
    """Create the FastAPI proxy in front of the vLLM OpenAI-compatible API.

    Args:
        client: OpenAI SDK client pointed at the vLLM server.
        model_name: Model identifier sent with every request.
        allow_origins: Origins allowed by CORS. The default "*" admits any
            origin; pass explicit origins to tighten this for production.

    Returns:
        An app exposing POST /ask (JSON answer) and POST /stream (SSE stream
        terminated by a `[DONE]` event).
    """
    origins = list(allow_origins)
    app = FastAPI(title="vLLM (OpenAI SDK) proxy")
    app.add_middleware(
        CORSMiddleware,
        allow_origins=origins,
        # A wildcard origin must not be combined with credentials; they are
        # enabled only when explicit origins are configured.
        allow_credentials="*" not in origins,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    @app.post("/ask")
    def ask(req: QuestionRequest):
        answer = ask_model_sync(
            client,
            model=model_name,
            messages=[{"role": "user", "content": req.question}],
            temperature=0,
        )
        return JSONResponse({"answer": answer})

    @app.post("/stream")
    def stream(req: QuestionRequest):
        gen = stream_model(
            client,
            model=model_name,
            messages=[{"role": "user", "content": req.question}],
            temperature=0,
        )

        def sse():
            for chunk in gen:
                yield _sse_event(chunk)
            # Explicit end-of-stream marker for clients.
            yield _sse_event("[DONE]")

        return StreamingResponse(sse(), media_type="text/event-stream")

    return app


In [None]:
cfg = VLLMConfig(model=MODEL, port=VLLM_PORT)
with run_vllm(cfg) as (base_url, proc):
  public_url = ngrok.connect(API_PORT).public_url
  print(f" * ngrok tunnel \"{public_url}\" -> \"{API_HOST}:{API_PORT}\"")
  app = build_app(client, cfg.model)
  config = uvicorn.Config(app, host=API_HOST, port=API_PORT, log_level="info")
  server = uvicorn.Server(config)

  await server.serve()

vLLM up at http://0.0.0.0:8000 (pid=3155)


INFO:     Started server process [324]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8001 (Press CTRL+C to quit)


 * ngrok tunnel "https://b65cb04e9d05.ngrok.app" -> "127.0.0.1:8001"
INFO:     2a01:e11:5401:8500:2587:3369:aa11:7af8:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     2a01:e11:5401:8500:2587:3369:aa11:7af8:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     2a01:e11:5401:8500:2587:3369:aa11:7af8:0 - "POST /ask HTTP/1.1" 200 OK
