# llama.cpp Server on Google Colab

This notebook installs and runs `llama.cpp` (llama-server) on a Google Colab instance, stores the model on your Google Drive for persistence, and exposes a public endpoint using `ngrok`. It also provides optimization tips for Colab.

**Notes:** Use the GPU runtime if you intend to use GPU features or large models, but for GGUF Q4 models CPU-only instances work well in Colab. The notebook downloads a model from Hugging Face — choose a quantized variant (Q4_K, Q3_K_S, Q2_K) depending on your speed/accuracy needs.

In [None]:
# 1) Mount Google Drive (persistent model storage)
#
# Outcome of this cell: DRIVE_MODEL_DIR exists on disk and points either at a
# folder on Google Drive (persistent) or at /content/models (lost when the
# session ends). Later cells read DRIVE_MODEL_DIR and `os` from here.
from google.colab import drive
import os

USE_DRIVE = True  # Set to False to skip Drive and use local storage

if USE_DRIVE:
    try:
        # Try to mount with force_remount to handle common issues
        drive.mount("/content/drive", force_remount=True)
        print("✅ Google Drive mounted successfully!")
    except Exception as e:
        # Deliberately broad: the fallback below must run whatever the mount
        # raised. NOTE(review): a declined/failed authorisation appears to
        # surface as a Colab-specific error rather than ValueError — confirm.
        print(f"❌ Drive mount failed: {e}")
        print("\n🔧 Troubleshooting steps:")
        print("1. Click the authentication link above")
        print("2. Sign in with your Google account")
        print("3. Click 'Allow' to grant access")
        print("4. Copy the authorization code back here")
        print("5. If it still fails, try:")
        print("   - Restart the runtime (Runtime → Restart runtime)")
        print("   - Clear browser cache and try again")
        print("   - Use a different Google account")
        print("\n💡 Falling back to local storage (/content/models/)")
        USE_DRIVE = False

# Set up model directory.
# The name DRIVE_MODEL_DIR is kept even for the local fallback because later
# cells refer to it.
if USE_DRIVE:
    DRIVE_MODEL_DIR = "/content/drive/MyDrive/llama_models"
else:
    DRIVE_MODEL_DIR = "/content/models"

os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)
print(f"Model directory ready: {DRIVE_MODEL_DIR}")

if not USE_DRIVE:
    print("⚠️  Note: Models will be lost when the Colab session ends")
    print("   To persist models, fix the Drive mount issue above")


: 

In [None]:
# 2) Install system packages and clone/build llama.cpp with the server target
!apt-get update -y
!apt-get install -y build-essential cmake git wget unzip pwgen

# Build llama.cpp server
!git clone https://github.com/ggerganov/llama.cpp.git --depth 1
%cd llama.cpp
!make -j$(nproc)
%cd ..
print(
    "llama.cpp built (server binary available in llama.cpp/build or llama.cpp/bin depending on build)"
)


In [None]:
# 3) Install ngrok and forwarding helper (for public testing)
# Step 1: store ngrok's apt signing key under /etc/apt/trusted.gpg.d/.
# Step 2: register the ngrok apt repository in /etc/apt/sources.list.d/.
# Step 3: refresh the package index and install the `ngrok` package.
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo 'deb https://ngrok-agent.s3.amazonaws.com buster main' | sudo tee /etc/apt/sources.list.d/ngrok.list
!apt-get update -y && apt-get install -y ngrok

# If you have an ngrok auth token, set it via: ngrok config add-authtoken <YOUR-NGROK-AUTHTOKEN>
# (This cell only installs the agent; it does not configure a token itself.)
print(
    "ngrok installed; run `!ngrok config add-authtoken <YOUR_TOKEN>` if you have one."
)


In [None]:
# 4) (Optional) Install huggingface-cli to download private models if needed
# Upgrades pip and huggingface-hub in the kernel's environment (%pip rather
# than !pip). Nothing is downloaded here; the model download cell uses wget.
%pip install --upgrade pip huggingface-hub
print(
    "huggingface-hub installed; if you need private models, run `!huggingface-cli login` and paste your token."
)


In [None]:
# 5) Download the model into your Google Drive folder
# Example: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF (choose a quantization Q2/Q3/Q4 variant)
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
dest_path = f"{DRIVE_MODEL_DIR}/{MODEL_FILENAME}"
print("Destination:", dest_path)

# Check if model already exists
if os.path.exists(dest_path):
    print(f"‚úÖ Model already exists at {dest_path}")
else:
    print(f"üì• Downloading model to {dest_path}")
    # Use huggingface-hub or wget to download the raw model.
    # If model is public, wget works; for private models use huggingface-cli with a token.
    !wget -O "{dest_path}" "https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    print(f"‚úÖ Model downloaded to {dest_path}")

# 6) Start the llama.cpp server
import os
import subprocess
import threading
import time
import urllib.request

SERVER_PORT = 8080
MODEL_PATH = dest_path
SERVER_LOG_PATH = "llama-server.log"  # combined stdout/stderr of the server


def _tail(path, max_lines=40):
    """Return the last ``max_lines`` lines of the text file at ``path``."""
    try:
        with open(path, "r", errors="replace") as log_file:
            return "".join(log_file.readlines()[-max_lines:])
    except OSError as exc:
        return f"<could not read {path}: {exc}>"


def _wait_until_ready(process, port, timeout_s):
    """Poll the server's /health endpoint until it answers 200.

    Returns True once the endpoint responds with HTTP 200, and False if the
    process exits first or ``timeout_s`` seconds pass without success.
    """
    health_url = f"http://127.0.0.1:{port}/health"
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if process.poll() is not None:
            return False  # server exited during startup
        try:
            with urllib.request.urlopen(health_url, timeout=2) as response:
                if response.status == 200:
                    return True
        except OSError:
            # Connection refused (not listening yet) or an HTTP error status
            # (urllib raises for non-2xx); either way, keep waiting.
            pass
        time.sleep(1)
    return False


def start_llama_server(startup_timeout_s=180):
    """Start llama.cpp server in background.

    Args:
        startup_timeout_s: Maximum number of seconds to wait for the server's
            /health endpoint to report ready.

    Returns:
        The ``subprocess.Popen`` handle of the running server, or ``None`` if
        the binary could not be launched, exited during startup, or did not
        become ready in time (in which case it is terminated).
    """
    cmd = [
        "./llama.cpp/build/bin/llama-server",
        "--model", MODEL_PATH,
        "--host", "127.0.0.1",
        "--port", str(SERVER_PORT),
        # Use the CPUs this runtime actually has instead of a fixed count.
        "--threads", str(os.cpu_count() or 1),
        "--ctx-size", "2048",  # Context window
        "--n-gpu-layers", "0",  # Use 0 for CPU-only, or higher for GPU
        # No --api-key flag: omitting it is how key checking is left off.
        # NOTE(review): passing an empty key appears to register "" as a
        # required key and reject header-less requests — confirm against the
        # server documentation.
    ]

    print(f"Starting llama.cpp server on port {SERVER_PORT}...")
    print(f"Command: {' '.join(cmd)}")

    # Send server output to a log file. Unread subprocess.PIPE handles would
    # eventually fill up and block the server on its next write.
    try:
        with open(SERVER_LOG_PATH, "wb") as log_file:
            process = subprocess.Popen(
                cmd, stdout=log_file, stderr=subprocess.STDOUT
            )
    except OSError as exc:
        print("❌ Server failed to start")
        print(f"Could not launch {cmd[0]}: {exc}")
        return None

    if _wait_until_ready(process, SERVER_PORT, startup_timeout_s):
        print(f"✅ Server started successfully on http://127.0.0.1:{SERVER_PORT}")
        print(f"Server log: {SERVER_LOG_PATH}")
        return process

    # Not ready: either the process died or it is still not answering.
    if process.poll() is None:
        print(f"Server did not become ready within {startup_timeout_s}s; stopping it")
        process.terminate()
        try:
            process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
    print("❌ Server failed to start")
    print(f"Last lines of {SERVER_LOG_PATH}:")
    print(_tail(SERVER_LOG_PATH))
    return None


# Start the server
server_process = start_llama_server()

if server_process is None:
    raise RuntimeError("Failed to start llama.cpp server")

# 9) Quick perf benchmark: run several prompts and measure latency/tokens/sec
import time, requests

# Target the port the server was started on rather than a second hard-coded 8080.
url = f"http://127.0.0.1:{SERVER_PORT}/completions"
prompt = "Benchmark: Provide a short helpful reply."
N = 5
times = []        # elapsed seconds of successful runs only
token_rates = []  # tokens/sec for runs where the server reported a token count
for i in range(N):
    payload = {"prompt": prompt, "max_tokens": 64}
    t0 = time.perf_counter()
    try:
        r = requests.post(url, json=payload, timeout=30)
    except requests.RequestException as exc:
        print(f"Run {i + 1} failed after {time.perf_counter() - t0:.2f}s: {exc}")
        continue
    dt = time.perf_counter() - t0
    print(f"Run {i + 1} status={r.status_code} elapsed={dt:.2f}s")
    if not r.ok:
        # An error response is not a completion; keep it out of the average.
        continue
    times.append(dt)

    # NOTE(review): the generated-token count is looked up under
    # "tokens_predicted" or "usage.completion_tokens"; which one (if either)
    # is present depends on the server build — confirm. Runs without a count
    # are simply left out of the tokens/sec figure.
    try:
        body = r.json()
    except ValueError:
        body = {}
    n_tokens = None
    if isinstance(body, dict):
        n_tokens = body.get("tokens_predicted")
        if n_tokens is None and isinstance(body.get("usage"), dict):
            n_tokens = body["usage"].get("completion_tokens")
    if isinstance(n_tokens, int) and n_tokens > 0 and dt > 0:
        token_rates.append(n_tokens / dt)

if times:
    print("Average elapsed", sum(times) / len(times))
else:
    print("No successful runs; check that the server is up and reachable")
if token_rates:
    print(f"Average tokens/sec {sum(token_rates) / len(token_rates):.2f}")
print(
    "Tip: Tune --threads, try smaller quantizations (Q2/Q3), or increase --cache-ram to reduce latency."
)


In [None]:
# 7) Setup ngrok tunnel for public access
%pip install pyngrok

from pyngrok import ngrok
import time

# Kill any existing tunnels
ngrok.kill()

# Start tunnel to llama.cpp server
tunnel = ngrok.connect(8080, "http")
public_url = tunnel.public_url

print(f"üåê Public URL: {public_url}")
print("üìù Copy this URL for your Goblin Assistant config")

# Keep tunnel alive
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Shutting down tunnel...")
    ngrok.kill()


In [None]:
# 11) Generate Shareable Colab Link
import urllib.parse
import json
import base64


def generate_colab_link(ngrok_token="", server_port=8080, model_repo=MODEL_REPO):
    """Generate a shareable Colab link with embedded configuration.

    The configuration is serialised to compact JSON, URL-safe base64 encoded,
    and appended to a placeholder notebook URL as the ``config`` query
    parameter. The link and usage instructions are printed.

    Args:
        ngrok_token: ngrok authtoken to embed; empty string embeds none.
        server_port: Port number recorded in the configuration.
        model_repo: Model repository id recorded in the configuration.

    Returns:
        The generated URL as a string.

    Note:
        base64 is an encoding, not encryption: a token passed here can be
        read by anyone who receives the link. Nothing in the visible part of
        this notebook reads the ``config`` parameter back.
    """

    # Get current notebook URL (this will be replaced when uploaded to Colab)
    notebook_url = "https://colab.research.google.com/drive/YOUR_NOTEBOOK_ID"  # Replace with actual URL

    # Create configuration
    config = {
        "ngrok_token": ngrok_token,
        "server_port": server_port,
        "model_repo": model_repo,
        "auto_start": True,
    }

    # Encode config (compact separators keep the URL short)
    config_json = json.dumps(config, separators=(",", ":"))
    config_b64 = base64.urlsafe_b64encode(config_json.encode()).decode()

    # Build shareable URL
    shareable_url = f"{notebook_url}?config={config_b64}"

    print("🔗 Shareable Colab Link:")
    print(shareable_url)
    print("\n📋 Instructions:")
    print(
        "1. Replace YOUR_NOTEBOOK_ID in the URL above with this notebook's actual Drive ID"
    )
    print("2. Share this link with others")
    print("3. Recipients can open it directly in Colab with pre-configured settings")

    if ngrok_token:
        print("\n✅ ngrok token included - tunnel will be created automatically")
        print(
            "⚠️  The token is only base64-encoded, not encrypted: anyone with this link can read it"
        )
    else:
        print("\n⚠️  No ngrok token - users will need to add their own")

    return shareable_url


# Build the shareable link. The token stays empty unless you fill it in
# (the same value you would pass to: ngrok config add-authtoken YOUR_TOKEN).
YOUR_NGROK_TOKEN = ""
shareable_link = generate_colab_link(
    model_repo=MODEL_REPO,
    server_port=8080,
    ngrok_token=YOUR_NGROK_TOKEN,
)
