<a href="https://colab.research.google.com/github/fabiopauli/DDL/blob/main/Qwen3_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚀 Qwen3.5-27B API Server (llama.cpp + FastAPI + ngrok)

Run **Qwen3.5-27B** (Dynamic 4-bit GGUF) on a **Google Colab L4 GPU** and expose it
as an OpenAI-compatible API via **ngrok**.

---

## ⚙️ Prerequisites — Colab Secrets

Before running, add **two secrets** in the Colab sidebar (🔑 icon → "Secrets"):

| Secret Name     | Where to get it                                | What it does                          |
|-----------------|------------------------------------------------|---------------------------------------|
| `NGROK_TOKEN`   | https://dashboard.ngrok.com/get-started/your-authtoken | Creates a public tunnel to your API  |
| `HF_TOKEN`      | https://huggingface.co/settings/tokens         | Downloads gated/private models (optional for this model, but good practice) |

> **Tip:** Toggle "Notebook access" ON for each secret after adding it.

---

## 📋 How to use

1. Select **Runtime → Change runtime type → L4 GPU**
2. Add your secrets (see above)
3. Run **Cell 1** — builds llama.cpp and downloads the model (~16 GB, takes ~5 min)
4. Run **Cell 2** — starts the FastAPI + ngrok server
5. Copy the **ngrok URL** printed in the output
6. Send requests to `<ngrok_url>/v1/chat/completions` (OpenAI-compatible)

### Example request (Python)
```python
import requests

URL = "https://<your-ngrok-url>/v1/chat/completions"

response = requests.post(URL, json={
    "messages": [{"role": "user", "content": "Explain transformers in 3 sentences."}],
    "temperature": 0.7,
    "max_tokens": 512
})
print(response.json()["choices"][0]["message"]["content"])
```

### Example request (curl)
```bash
curl <ngrok_url>/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages":[{"role":"user","content":"Hello!"}],"temperature":0.7}'
```

---

## 📝 Notes
- **Model:** `unsloth/Qwen3.5-27B-GGUF` (UD-Q4_K_XL — Dynamic 4-bit)
- **Context:** 16,384 tokens (model supports up to 256K, but VRAM-limited on L4)
- **Mode:** Non-thinking (no `<think>` tags — direct responses only)
- **VRAM:** ~18-20 GB — fits tightly on L4 (22.5 GB)
"""


In [None]:
# =============================================================================
# CELL 1 — Build llama.cpp + Download Model
# =============================================================================
# Compiles the `llama-server` binary with the CUDA backend and fetches the
# quantized GGUF weights. Lines starting with `!` are IPython shell escapes.

# --- Build llama.cpp with CUDA ---
# Install the build toolchain; all apt output is discarded.
!apt-get update -qq && apt-get install -qq -y pciutils build-essential cmake curl libcurl4-openssl-dev > /dev/null 2>&1
# Shallow clone of the repository.
# NOTE(review): stderr is discarded and ANY clone failure (not only an existing
# directory) prints "Already cloned" — confirm that masking is intended.
!git clone --depth 1 https://github.com/ggml-org/llama.cpp 2>/dev/null || echo "Already cloned"

# Configure a build without shared libraries and with CUDA enabled; the
# configure output is discarded.
!cmake llama.cpp -B llama.cpp/build \
    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON > /dev/null 2>&1

# Build only the llama-server target from a clean tree, using all cores, and
# show just the last three lines of build output.
!cmake --build llama.cpp/build --config Release -j$(nproc) --clean-first \
    --target llama-server 2>&1 | tail -3

# Copy the built llama-* binaries into llama.cpp/ (Cell 2 launches
# ./llama.cpp/llama-server from there).
!cp llama.cpp/build/bin/llama-* llama.cpp/

# --- Download the GGUF model ---
!pip install -q huggingface_hub hf_transfer

import os
# NOTE(review): presumably switches huggingface_hub to the hf_transfer
# downloader; the download below uses wget, so this appears unused here.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Use wget for reliability (huggingface-cli can hang in Colab)
# `-c` asks wget to continue a partially downloaded file when the cell is re-run.
!mkdir -p unsloth/Qwen3.5-27B-GGUF
!wget -c -q --show-progress \
    "https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/Qwen3.5-27B-UD-Q4_K_XL.gguf" \
    -O unsloth/Qwen3.5-27B-GGUF/Qwen3.5-27B-UD-Q4_K_XL.gguf

print("\n‚úÖ Build and download complete!")


In [None]:
# =============================================================================
# CELL 2 — Start llama-server + FastAPI proxy + ngrok tunnel (non-blocking)
# =============================================================================

!pip install -q fastapi uvicorn pyngrok httpx

import subprocess, time, threading, json, os
from google.colab import userdata
from pyngrok import ngrok
import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse

# ── Config ──
# Secret lookup plus the fixed paths, ports and context size used by this cell.
NGROK_AUTH_TOKEN = userdata.get("NGROK_TOKEN")
MODEL_PATH = "unsloth/Qwen3.5-27B-GGUF/Qwen3.5-27B-UD-Q4_K_XL.gguf"
LLAMA_PORT, API_PORT = 8081, 8080
CTX_SIZE = 16384

# ── 1. Start llama-server in background ──
# Flag/value pairs, listed in the order they appear on the command line.
_llama_flags = (
    ("--model", MODEL_PATH),
    ("--ctx-size", str(CTX_SIZE)),
    ("--n-gpu-layers", "99"),
    ("--port", str(LLAMA_PORT)),
    ("--host", "0.0.0.0"),
    ("--temp", "0.7"),
    ("--top-p", "0.8"),
    ("--top-k", "20"),
    ("--min-p", "0.0"),
    ("--chat-template-kwargs", '{"enable_thinking": false}'),
)
llama_cmd = ["./llama.cpp/llama-server"]
for _flag, _value in _llama_flags:
    llama_cmd.extend((_flag, _value))

print("üîÑ Starting llama-server...")
# stderr is merged into stdout so one pipe carries all of the server's output.
llama_proc = subprocess.Popen(llama_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

def stream_logs(proc):
    """Echo a subprocess's output line by line with a ``[llama]`` prefix.

    Reads ``proc.stdout`` until EOF, so it is meant to run on a background
    thread.

    Args:
        proc: Object exposing a binary ``stdout`` stream, e.g. a
            ``subprocess.Popen`` created with ``stdout=subprocess.PIPE``.
    """
    for raw_line in iter(proc.stdout.readline, b""):
        # Replace undecodable bytes instead of raising: a UnicodeDecodeError
        # here would end the reader thread and stop the pipe being drained.
        text = raw_line.decode("utf-8", errors="replace").rstrip()
        print(f"  [llama] {text}")

# Echo llama-server output from a daemon thread while the cell keeps running.
log_thread = threading.Thread(target=stream_logs, args=(llama_proc,), daemon=True)
log_thread.start()

# Wait for llama-server to be ready (up to 120 one-second attempts).
LLAMA_BASE = f"http://127.0.0.1:{LLAMA_PORT}"
llama_ready = False
for i in range(120):
    if llama_proc.poll() is not None:
        # The server process has already exited; further polling cannot succeed.
        break
    try:
        r = httpx.get(f"{LLAMA_BASE}/health", timeout=2)
    except httpx.HTTPError:
        # Not accepting connections yet. Only HTTP/transport errors are
        # swallowed, so Ctrl-C / kernel interrupt still stops the wait.
        pass
    else:
        if r.status_code == 200:
            llama_ready = True
            break
    time.sleep(1)

if llama_ready:
    print("‚úÖ llama-server is ready!")
else:
    print("‚ùå llama-server failed to start. Check logs above.")

# ── 2. FastAPI proxy ──
app = FastAPI(title="Qwen3.5-27B API")


@app.get("/health")
async def health():
    """Return a fixed status payload showing that the proxy itself is up."""
    status_payload = {"status": "ok", "model": "Qwen3.5-27B-UD-Q4_K_XL"}
    return status_payload


@app.get("/v1/models")
async def models():
    """Return an OpenAI-style model list containing the single served model."""
    model_entry = {"id": "qwen3.5-27b", "object": "model", "owned_by": "unsloth"}
    return {"object": "list", "data": [model_entry]}

# Optional OpenAI-style request fields that are forwarded to llama-server only
# when the client actually supplies them.
_PASSTHROUGH_FIELDS = (
    "stop",
    "seed",
    "min_p",
    "presence_penalty",
    "frequency_penalty",
    "response_format",
    "tools",
    "tool_choice",
)


def _error_response(status_code, message, error_type):
    """Build an OpenAI-style ``{"error": {...}}`` body with the given HTTP status."""
    return JSONResponse(
        status_code=status_code,
        content={"error": {"message": message, "type": error_type}},
    )


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Proxy an OpenAI-style chat completion request to the local llama-server.

    The incoming JSON object is reduced to the supported fields (with the same
    sampling defaults as before) and POSTed to
    ``{LLAMA_BASE}/v1/chat/completions``.

    Args:
        request: Incoming FastAPI request whose body is a JSON object.

    Returns:
        A ``StreamingResponse`` relaying upstream bytes when ``stream`` is
        truthy; otherwise a ``JSONResponse`` carrying the upstream JSON body
        and upstream HTTP status. Malformed requests yield 400; upstream
        transport failures or non-JSON upstream replies yield 502.
    """
    try:
        body = await request.json()
    except ValueError:
        # JSON and UTF-8 decode errors are both ValueError subclasses.
        return _error_response(400, "Request body must be valid JSON.", "invalid_request_error")
    if not isinstance(body, dict):
        return _error_response(400, "Request body must be a JSON object.", "invalid_request_error")

    payload = {
        "messages": body.get("messages", []),
        "temperature": body.get("temperature", 0.7),
        "top_p": body.get("top_p", 0.8),
        "top_k": body.get("top_k", 20),
        "max_tokens": body.get("max_tokens", 2048),
        "stream": body.get("stream", False),
    }
    payload.update({key: body[key] for key in _PASSTHROUGH_FIELDS if key in body})
    upstream_url = f"{LLAMA_BASE}/v1/chat/completions"

    if payload["stream"]:
        async def event_stream():
            try:
                async with httpx.AsyncClient() as client:
                    async with client.stream(
                        "POST", upstream_url, json=payload, timeout=300,
                    ) as resp:
                        async for chunk in resp.aiter_bytes():
                            yield chunk
            except httpx.RequestError as exc:
                # The response has already started, so the status can no longer
                # change; report the failure as a final SSE data event instead.
                err = json.dumps({
                    "error": {
                        "message": f"llama-server request failed: {exc}",
                        "type": "upstream_error",
                    }
                })
                yield f"data: {err}\n\n".encode()
        return StreamingResponse(event_stream(), media_type="text/event-stream")

    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(upstream_url, json=payload, timeout=300)
    except httpx.RequestError as exc:
        return _error_response(502, f"llama-server request failed: {exc}", "upstream_error")

    try:
        content = resp.json()
    except ValueError:
        return _error_response(502, "llama-server returned a non-JSON response.", "upstream_error")
    # Propagate the upstream status so llama-server errors are not reported as 200.
    return JSONResponse(content=content, status_code=resp.status_code)

# ── 3. Start ngrok tunnel ──
# Open a public tunnel that forwards to the FastAPI proxy port.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
tunnel = ngrok.connect(API_PORT)
public_url = tunnel.public_url

# ── 4. Run FastAPI in background thread ──
# A daemon thread keeps the cell non-blocking.
server_thread = threading.Thread(
    target=uvicorn.run,
    args=(app,),
    kwargs={"host": "0.0.0.0", "port": API_PORT, "log_level": "warning"},
    daemon=True,
)
server_thread.start()

# Verify it's up: poll for up to ~10 s instead of relying on one fixed sleep.
api_ok = "‚ö†Ô∏è  (may need a moment)"
for _ in range(20):
    time.sleep(0.5)
    try:
        r = httpx.get(f"http://127.0.0.1:{API_PORT}/health", timeout=5)
    except httpx.HTTPError:
        continue  # server thread not listening yet
    if r.status_code == 200:
        api_ok = "‚úÖ"
        break

# Probe llama-server as well so the summary reflects its real state rather
# than a hard-coded check mark.
try:
    llama_up = httpx.get(f"http://127.0.0.1:{LLAMA_PORT}/health", timeout=5).status_code == 200
except httpx.HTTPError:
    llama_up = False
llama_ok = "‚úÖ" if llama_up else "‚ùå"

print("\n" + "=" * 60)
print(f"üåê PUBLIC API URL: {public_url}")
print(f"üè† LOCAL API URL:  http://127.0.0.1:{API_PORT}")
print(f"   FastAPI: {api_ok}  |  llama-server: {llama_ok}")
print("=" * 60)
print(f"\n  POST {public_url}/v1/chat/completions")
print("\n‚úÖ Server is running in background ‚Äî proceed to Cell 3!")

In [None]:
# =============================================================================
# CELL 3 — Interactive Chat (run after Cell 2 has started the server)
# =============================================================================


import requests, json

# ── Config ──
# Endpoint used by chat(). Points at the local proxy port by default; swap in
# the ngrok URL printed by Cell 2 when calling from outside this notebook.
API_URL = "http://127.0.0.1:8080/v1/chat/completions"  # change to ngrok URL if external
# API_URL = "https://xxxx-xx-xx.ngrok-free.app/v1/chat/completions"

SYSTEM_PROMPT = "You are a helpful assistant. Respond concisely and clearly."

# Running transcript sent with every request; starts with the system message only.
history = [{"role": "system", "content": SYSTEM_PROMPT}]


def chat(user_msg, temperature=0.7, max_tokens=2048):
    """Send one user turn to the API and return the assistant's reply text.

    The user message is appended to the module-level ``history`` before the
    request is made. On success the assistant reply is appended as well. On
    any failure the user message is removed again and an error string is
    returned instead of raising.

    Args:
        user_msg: Text of the user's message.
        temperature: Sampling temperature sent with the request.
        max_tokens: Generation limit sent with the request.

    Returns:
        The assistant message content, or an error description string.
    """
    history.append({"role": "user", "content": user_msg})
    request_body = {
        "messages": history,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    try:
        response = requests.post(API_URL, json=request_body, timeout=300)
        response.raise_for_status()
        reply_text = response.json()["choices"][0]["message"]["content"]
    except Exception as exc:
        history.pop()  # remove failed user msg
        return f"‚ùå Error: {exc}"
    history.append({"role": "assistant", "content": reply_text})
    return reply_text

# ── Interactive loop ──
# 'quit' exits, 'clear' resets the transcript, blank input is ignored, and
# anything else is sent to the model via chat().
banner_rule = "=" * 60
print(banner_rule)
print("üí¨ Qwen3.5-27B Chat ‚Äî type 'quit' to exit, 'clear' to reset")
print(banner_rule)

while True:
    try:
        user_input = input("\nüë§ You: ").strip()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C or end of input ends the session.
        print("\nüëã Bye!")
        break

    command = user_input.lower()
    if command == "quit":
        print("üëã Bye!")
        break
    elif command == "clear":
        # Start over with only the system message.
        history = [{"role": "system", "content": SYSTEM_PROMPT}]
        print("üóëÔ∏è  History cleared.")
    elif user_input:
        print("\nü§ñ Qwen: ", end="", flush=True)
        reply = chat(user_input)
        print(reply)