# llama.cpp Server on Google Colab

This notebook installs and runs `llama.cpp` (llama-server) on a Google Colab instance, stores the model on your Google Drive for persistence, and exposes a public endpoint using `ngrok`. It also provides optimization tips for Colab.

**Notes:** Use the GPU runtime if you intend to use GPU features or large models, but for GGUF Q4 models CPU-only instances work well in Colab. The notebook downloads a model from Hugging Face — choose a quantized variant (Q4_K, Q3_K_S, Q2_K) depending on your speed/accuracy needs.

In [None]:
# Helper: Unmount Drive (useful to re-auth and switch accounts)
from google.colab import drive


def unmount_drive():
    """Call ``drive.flush_and_unmount()`` and print re-authentication guidance."""
    drive.flush_and_unmount()
    guidance = "Drive unmounted. Re-run the mount cell and sign in with the desired Google account."
    print(guidance)


# Usage: run `unmount_drive()` then re-run the mount cell to authenticate as a different Google user


In [None]:
# 1) Mount Google Drive (persistent model storage)
from google.colab import drive
import os
import time

# Storage toggle: True mounts Google Drive, False skips Drive and uses local storage.
USE_DRIVE = True
# Account the mount cell compares the signed-in user against.
# NOTE(review): this is a hard-coded personal address. Anyone else running the notebook
# must change it; a confirmed mismatch makes mount_google_drive() return False and the
# notebook falls back to local storage.
EXPECTED_DRIVE_EMAIL = "fuaadabdullah@gmail.com"

# Both are assigned by mount_google_drive() once the mount succeeds.
DRIVE_ROOT = DRIVE_MODEL_DIR = None


def _detect_drive_root():
    """Return the directory under /content/drive that should hold the model folder.

    Prefers ``/content/drive/MyDrive``. If only ``/content/drive/My Drive`` exists,
    a ``MyDrive`` symlink is attempted for compatibility and the spaced path is
    returned. If neither exists, the mount point itself is returned.
    """
    if os.path.exists("/content/drive/MyDrive"):
        return "/content/drive/MyDrive"
    if os.path.exists("/content/drive/My Drive"):
        # MyDrive is known to be absent on this branch, so no second existence
        # check is needed; a failed symlink is reported but is not fatal.
        try:
            os.symlink("/content/drive/My Drive", "/content/drive/MyDrive")
            print(
                "üîó Created symlink /content/drive/MyDrive -> /content/drive/My Drive"
            )
        except Exception as e:
            print(f"‚ö†Ô∏è Could not create symlink: {e}")
        return "/content/drive/My Drive"
    # mounted but expected subpaths missing
    print("‚ö†Ô∏è Drive mounted but MyDrive/My Drive not found under /content/drive")
    return "/content/drive"


def _drive_account_matches(expected_email):
    """Best-effort check that the signed-in Google account equals ``expected_email``.

    Returns:
        False only when the account was looked up successfully and differs
        (case-insensitively) from a non-empty ``expected_email``. Lookup failures
        are printed and treated as "cannot tell", i.e. True. An empty or None
        ``expected_email`` disables the comparison instead of always mismatching.
    """
    try:
        from google.colab import auth

        auth.authenticate_user()
        from googleapiclient.discovery import build

        oauth2 = build("oauth2", "v2")
        userinfo = oauth2.userinfo().get().execute()
        signed_in_email = userinfo.get("email")
        print(f"--- Signed-in account: {signed_in_email}")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not verify Google account: {e}")
        return True

    if not expected_email or not signed_in_email:
        return True
    if signed_in_email.lower() == expected_email.lower():
        return True

    print("‚ö†Ô∏è Signed-in account does not match expected account")
    print(f"    Expected: {expected_email}")
    print(f"    Found:    {signed_in_email}")
    print(
        "To fix: run drive.flush_and_unmount() then re-run this cell and sign in with the correct account."
    )
    return False


def mount_google_drive(max_retries=3):
    """Mount Google Drive with retry logic and set DRIVE_ROOT / DRIVE_MODEL_DIR.

    Args:
        max_retries: Number of mount attempts; 3 seconds are slept between attempts.

    Returns:
        True when Drive is mounted and the account check did not find a mismatch.
        False when every attempt failed, when ``max_retries`` is not positive, or
        when the signed-in account differs from EXPECTED_DRIVE_EMAIL. In the
        mismatch case the globals have already been assigned.
    """
    global DRIVE_ROOT, DRIVE_MODEL_DIR
    for attempt in range(max_retries):
        try:
            print(
                f"üîÑ Attempting to mount Google Drive (attempt {attempt + 1}/{max_retries})..."
            )
            drive.mount("/content/drive", force_remount=True)
            print("‚úÖ Google Drive mounted successfully!")

            # _detect_drive_root() always returns a non-empty path, so the model
            # directory can be derived from it unconditionally.
            DRIVE_ROOT = _detect_drive_root()
            DRIVE_MODEL_DIR = os.path.join(DRIVE_ROOT, "llama_models")
            os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)
            print(f"üìÅ Using model directory: {DRIVE_MODEL_DIR}")

            return _drive_account_matches(EXPECTED_DRIVE_EMAIL)
        except Exception as e:
            print(f"‚ùå Drive mount attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print("‚è≥ Waiting 3 seconds before retry...")
                time.sleep(3)
            else:
                print("\nüîß Troubleshooting steps:")
                print("1. Click the authentication link above and sign in")
                print(
                    "2. Use an incognito browser window and sign into the desired account"
                )
                print(
                    "3. If needed, run: drive.flush_and_unmount() then re-run this cell"
                )
                return False
    return False


if not USE_DRIVE:
    # Drive explicitly disabled: keep models under /content/models.
    DRIVE_MODEL_DIR = "/content/models"
    os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)
    print("‚ÑπÔ∏è  Using local storage for models (won't persist across sessions)")
else:
    drive_mounted = mount_google_drive()
    if not drive_mounted:
        # Mount failed or the account check rejected it: switch to local storage.
        print("\nüí° Falling back to local storage (/content/models/)")
        USE_DRIVE = False
        DRIVE_MODEL_DIR = "/content/models"
        os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)

# Summary of the storage decision used by the cells that follow.
print(f"FINAL: USE_DRIVE={USE_DRIVE}, DRIVE_MODEL_DIR={DRIVE_MODEL_DIR}")


In [None]:
# 2) Install system packages and clone/build llama.cpp with the server target
!apt-get update -y
!apt-get install -y build-essential cmake git wget unzip pwgen

# Clone and build llama.cpp
!git clone https://github.com/ggerganov/llama.cpp.git --depth 1
%cd llama.cpp

# Build with verbose output to see what's happening
!make -j$(nproc) 2>&1 | head -50

# Check if build was successful
!ls -la build/bin/ 2>/dev/null || echo "build/bin/ not found"
!ls -la llama-server 2>/dev/null || echo "llama-server not found in root"
!ls -la build/llama-server 2>/dev/null || echo "build/llama-server not found"

%cd ..
print("llama.cpp build completed - check output above for any errors")
print(
    "If build failed, you may need to install additional dependencies or check the logs"
)


In [None]:
# 2.5) Verify llama.cpp build and troubleshoot if needed
import os
import subprocess


def check_llama_build():
    """Check whether a runnable llama-server binary exists under ./llama.cpp.

    Looks for the binary in the known output locations, prints troubleshooting
    hints when it is missing, and otherwise runs it with ``--version``.

    Returns:
        True when a binary was found and ``--version`` exited with status 0,
        False otherwise.
    """
    print("üîç Checking llama.cpp build status...")

    # Check for binary in various locations
    binary_locations = [
        "./llama.cpp/build/bin/llama-server",
        "./llama.cpp/bin/llama-server",
        "./llama.cpp/llama-server",
        "./llama.cpp/build/llama-server",
    ]

    found_binary = None
    for location in binary_locations:
        if os.path.exists(location):
            found_binary = location
            print(f"‚úÖ Found llama-server binary at: {location}")
            break

    if not found_binary:
        print("‚ùå llama-server binary not found!")
        print("\nüîß Troubleshooting steps:")

        # Check if we're in the right directory
        if not os.path.exists("./llama.cpp"):
            print("1. llama.cpp directory not found - cell 2 may have failed")
            print("   ‚Üí Re-run cell 2")
            return False

        # Check build directory
        if not os.path.exists("./llama.cpp/build"):
            print("2. Build directory doesn't exist - build may have failed")
            print("   ‚Üí Check cell 2 output for errors")
            print(
                "   ‚Üí Try: cmake -S llama.cpp -B llama.cpp/build && cmake --build llama.cpp/build --target llama-server -j $(nproc)"
            )
            return False

        # Check for common build issues
        print("3. Checking for common build issues...")

        # Check if cmake is available. Only launch failures and timeouts are
        # expected here; a bare except would also swallow KeyboardInterrupt.
        try:
            result = subprocess.run(
                ["cmake", "--version"], capture_output=True, text=True, timeout=5
            )
            if result.returncode != 0:
                print("   - CMake may not be available")
        except (OSError, subprocess.SubprocessError):
            print("   - CMake may not be available")

        # Check if git clone was complete (CMakeLists.txt drives the current build)
        if not os.path.exists("./llama.cpp/CMakeLists.txt"):
            print("   - CMakeLists.txt not found - git clone may have failed")
            print(
                "   ‚Üí Re-run: !git clone https://github.com/ggerganov/llama.cpp.git --depth 1"
            )

        print("4. Try manual build:")
        print(
            "   !cmake --build llama.cpp/build --target llama-server -j 2  # Use fewer cores if memory issues"
        )

        return False

    # Test if binary is executable
    try:
        result = subprocess.run(
            [found_binary, "--version"], capture_output=True, text=True, timeout=10
        )
        if result.returncode == 0:
            # Fall back to stderr so the version text is shown even when the
            # binary does not write it to stdout.
            version_text = (result.stdout or result.stderr).strip()
            print(f"‚úÖ Binary is executable: {version_text}")
            return True
        else:
            print(f"‚ùå Binary not executable: {result.stderr.strip()}")
            return False
    except Exception as e:
        print(f"‚ùå Error testing binary: {e}")
        return False


# Verify the build and tell the user whether it is safe to continue.
build_ok = check_llama_build()

if build_ok:
    print("\n‚úÖ Build looks good! Proceed to cell 6.")
else:
    print("\n‚ö†Ô∏è  Build issues detected. Please fix before proceeding to cell 6.")
    print("Once fixed, re-run this cell to verify.")


In [None]:
# 3) Install ngrok and forwarding helper (for public testing)
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo 'deb https://ngrok-agent.s3.amazonaws.com buster main' | sudo tee /etc/apt/sources.list.d/ngrok.list
!apt-get update -y && apt-get install -y ngrok

# Set ngrok auth token
!ngrok config add-authtoken 367SkOQHlBFw8AG1TsVNI0L9y46_3WJdDXhwNNLSJ1nn8JzCB
print("ngrok installed and authenticated successfully!")


In [None]:
# 4) (Optional) Install huggingface-cli to download private models if needed
!pip install --upgrade pip huggingface-hub==0.20.3
print(
    "huggingface-hub installed; if you need private models, run `!huggingface-cli login` and paste your token."
)


In [None]:
# 5) Download the model into your Google Drive folder
# Example: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF (choose a quantization Q2/Q3/Q4 variant)
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
dest_path = f"{DRIVE_MODEL_DIR}/{MODEL_FILENAME}"
print("Destination:", dest_path)

# Check if model already exists
if os.path.exists(dest_path):
    print(f"‚úÖ Model already exists at {dest_path}")
else:
    print(f"üì• Downloading model to {dest_path}")
    # Use huggingface-hub or wget to download the raw model.
    # If model is public, wget works; for private models use huggingface-cli with a token.
    !wget -O "{dest_path}" "https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    print(f"‚úÖ Model downloaded to {dest_path}")

# Verify the model file
if os.path.exists(dest_path):
    size_mb = os.path.getsize(dest_path) / (1024 * 1024)
    print(f"‚úÖ Model verified: {MODEL_FILENAME} ({size_mb:.1f} MB)")
else:
    print(f"‚ùå Model download failed: {dest_path}")
    raise FileNotFoundError(f"Model not found at {dest_path}")


In [None]:
# 6) Start the llama.cpp server
import subprocess
import threading
import time
import os

# TCP port passed to llama-server via --port by start_llama_server().
SERVER_PORT = 8080
# Model file passed to llama-server via --model. `dest_path` is not defined in this
# cell, so it must already exist in the kernel when this cell runs.
MODEL_PATH = dest_path


def find_llama_server():
    """Find the llama-server binary under ./llama.cpp.

    The known build output locations are tried first, then a recursive search of
    the checkout. Only regular files are accepted, so a directory that happens to
    be called ``llama-server`` is not returned as the binary.

    Returns:
        The path of the first match, or None (after printing the contents of the
        build directories as a debugging aid) when nothing was found.
    """
    import glob

    possible_paths = [
        "./llama.cpp/build/bin/llama-server",
        "./llama.cpp/bin/llama-server",
        "./llama.cpp/llama-server",
        "./llama.cpp/build/llama-server",
    ]
    # Fallback: anything named llama-server anywhere in the checkout.
    candidates = possible_paths + glob.glob(
        "./llama.cpp/**/llama-server", recursive=True
    )

    for path in candidates:
        if os.path.isfile(path):
            print(f"‚úÖ Found llama-server at: {path}")
            return path

    print("‚ùå llama-server binary not found in expected locations")
    print("Checking build directory contents...")

    # Debug: show what's in the build directories
    for directory, label in (
        ("./llama.cpp/build/", "build"),
        ("./llama.cpp/build/bin/", "build/bin"),
    ):
        if not os.path.exists(directory):
            continue
        print(f"Contents of {directory}:")
        try:
            result = subprocess.run(
                ["ls", "-la", directory], capture_output=True, text=True
            )
            print(result.stdout)
        except (OSError, subprocess.SubprocessError):
            # Listing is a debugging aid only; never let it mask the real problem.
            print(f"Could not list {label} directory")

    print(
        "\nüí° Build may have failed. Try running cell 2 again or check the build output above."
    )
    return None


def start_llama_server():
    """Start llama-server in the background and wait until it answers health checks.

    Output of the server goes to ``llama-server.log`` in the current directory
    rather than to pipes: nothing reads the pipes while the server runs, so a
    long-lived server could fill them and block.

    Returns:
        The ``subprocess.Popen`` handle while the process is alive (ready, or
        still loading after the wait budget), or None when the binary was not
        found or the process exited during startup.
    """
    import urllib.error
    import urllib.request

    # Find the binary first
    server_binary = find_llama_server()
    if not server_binary:
        return None

    cmd = [
        server_binary,
        "--model",
        MODEL_PATH,
        "--host",
        "127.0.0.1",
        "--port",
        str(SERVER_PORT),
        "--threads",
        str(os.cpu_count() or 4),  # Match the CPU cores of this runtime
        "--ctx-size",
        "2048",  # Context window
        "--n-gpu-layers",
        "0",  # Use 0 for CPU-only, or higher for GPU
        # No --api-key: passing an empty string would register "" as a key and
        # reject requests that carry no Authorization header.
    ]

    print(f"Starting llama.cpp server on port {SERVER_PORT}...")
    print(f"Command: {' '.join(cmd)}")

    # Start server in background; the child keeps its own handle on the log file.
    log_path = os.path.abspath("llama-server.log")
    with open(log_path, "wb") as log_file:
        process = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT)

    # Poll the health endpoint instead of sleeping a fixed time: a live process
    # is not necessarily done loading the model.
    health_url = f"http://127.0.0.1:{SERVER_PORT}/health"
    deadline = time.time() + 120
    ready = False
    while time.time() < deadline and process.poll() is None:
        try:
            with urllib.request.urlopen(health_url, timeout=2) as response:
                if response.status == 200:
                    ready = True
                    break
        except (urllib.error.URLError, OSError):
            pass  # not listening yet, or answering with an error while loading
        time.sleep(1)

    # Check if server is running
    if process.poll() is None:
        if ready:
            print(f"‚úÖ Server started successfully on http://127.0.0.1:{SERVER_PORT}")
        else:
            print(
                f"Server process is running but not ready yet; see {log_path} for progress"
            )
        return process

    print("‚ùå Server failed to start")
    try:
        with open(log_path, "rb") as log_file:
            log_tail = log_file.read()[-4000:].decode(errors="replace")
        print(f"Server log ({log_path}):\n{log_tail}")
    except OSError as e:
        print(f"Could not read server log {log_path}: {e}")
    return None


# Start the server
# Re-running this cell must not leave an older server holding the port: stop the
# process started by a previous run before launching a new one.
_previous_server = globals().get("server_process")
if _previous_server is not None and _previous_server.poll() is None:
    print("Stopping previously started llama.cpp server...")
    _previous_server.terminate()
    try:
        _previous_server.wait(timeout=10)
    except subprocess.TimeoutExpired:
        _previous_server.kill()
        _previous_server.wait()

server_process = start_llama_server()

if server_process is None:
    raise RuntimeError(
        "Failed to start llama.cpp server - check build output in cell 2"
    )


In [None]:
# 7) Set up ngrok tunnel for public access
import subprocess
import time
import requests


def _ngrok_public_url(api_url="http://127.0.0.1:4040/api/tunnels"):
    """Ask the local ngrok agent for its tunnels and return one public URL.

    An https URL is preferred when several tunnels are reported. Returns None
    when the agent answers but lists no tunnels; raises on connection problems
    or a non-200 reply.
    """
    response = requests.get(api_url, timeout=5)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to get tunnel info: {response.status_code}")
    urls = [tunnel.get("public_url") for tunnel in response.json().get("tunnels", [])]
    urls = [url for url in urls if url]
    secure_urls = [url for url in urls if url.startswith("https://")]
    chosen = secure_urls or urls
    return chosen[0] if chosen else None


def setup_ngrok_tunnel(port=SERVER_PORT):
    """Set up an ngrok tunnel to expose the local server.

    Args:
        port: Local port to forward.

    Returns:
        The public URL reported by the ngrok agent, or None when ngrok is not
        installed, exits during startup, or reports no tunnel within the wait budget.
    """

    # Check if ngrok is installed
    try:
        result = subprocess.run(["ngrok", "version"], capture_output=True, text=True)
        if result.returncode != 0:
            print("‚ùå ngrok not found. Please run cell 3 first to install ngrok.")
            return None
    except FileNotFoundError:
        print("‚ùå ngrok not found. Please run cell 3 first to install ngrok.")
        return None

    # Reuse a tunnel from an agent that is already running (e.g. this cell was
    # re-run) instead of starting a second agent.
    try:
        existing_url = _ngrok_public_url()
    except Exception:
        existing_url = None  # no agent is answering yet
    if existing_url:
        print(f"‚úÖ ngrok tunnel active: {existing_url}")
        print(f"üîó Use this URL in your Goblin Assistant configuration")
        return existing_url

    print("Starting ngrok tunnel...")

    # Start ngrok in background. Output goes to a log file rather than to pipes
    # that nothing would read.
    log_path = "ngrok.log"
    with open(log_path, "wb") as log_file:
        ngrok_process = subprocess.Popen(
            ["ngrok", "http", str(port)], stdout=log_file, stderr=subprocess.STDOUT
        )

    # Poll the agent API instead of sleeping a fixed time.
    last_error = None
    for _ in range(15):
        time.sleep(1)
        if ngrok_process.poll() is not None:
            # ngrok quit (for example because of a rejected authtoken): show why.
            print(f"‚ùå ngrok exited with code {ngrok_process.returncode}")
            try:
                with open(log_path, "rb") as log_file:
                    print(log_file.read()[-2000:].decode(errors="replace"))
            except OSError:
                pass
            return None
        try:
            public_url = _ngrok_public_url()
        except Exception as e:
            last_error = e
            continue
        last_error = None
        if public_url:
            print(f"‚úÖ ngrok tunnel active: {public_url}")
            print(f"üîó Use this URL in your Goblin Assistant configuration")
            return public_url

    if last_error is not None:
        print(f"‚ùå Error getting ngrok URL: {last_error}")
        print("üí° Make sure ngrok is running. Check the output above.")
    else:
        print("‚ùå No tunnels found")

    # Do not leave an agent without a usable tunnel running in the background.
    ngrok_process.terminate()
    return None


# Open the tunnel, then print either the integration instructions or troubleshooting help.
ngrok_url = setup_ngrok_tunnel(SERVER_PORT)

if not ngrok_url:
    print(
        "\n‚ùå Failed to set up ngrok tunnel",
        "üí° Troubleshooting:",
        "1. Make sure you have an ngrok account and auth token",
        "2. Run: !ngrok config add-authtoken YOUR_TOKEN",
        "3. Restart this cell",
        sep="\n",
    )
else:
    print(
        "\n" + "=" * 50,
        "üéâ INTEGRATION COMPLETE!",
        "=" * 50,
        f"Public endpoint: {ngrok_url}",
        "\nüìã Next steps:",
        "1. Copy the ngrok URL above",
        "2. Run this command on your local machine:",
        f"   python3 setup_colab_integration.py --provider llamacpp --colab-url {ngrok_url} --auto-test",
        "3. Test the integration:",
        "   python3 scripts/test_goblin_colab_integration.py --backend-url http://localhost:8000",
        sep="\n",
    )


In [None]:
# 8) Quick perf benchmark: run several prompts and measure latency/tokens/sec
import time
import requests

# Follow the port configured in the server cell; fall back to 8080 when that
# cell has not been run in this kernel.
url = f"http://127.0.0.1:{globals().get('SERVER_PORT', 8080)}/completions"
prompt = "Benchmark: Provide a short helpful reply."
N = 5
times = []  # elapsed seconds of successful runs only
token_rates = []  # tokens/sec of runs whose response reported a token count
for i in range(N):
    payload = {"prompt": prompt, "max_tokens": 64}
    t0 = time.perf_counter()  # monotonic clock, suited to measuring intervals
    r = requests.post(url, json=payload, timeout=30)
    dt = time.perf_counter() - t0
    summary = f"Run {i + 1} status={r.status_code} elapsed={dt:.2f}s"
    if r.ok:
        times.append(dt)
        try:
            body = r.json()
        except ValueError:
            body = {}
        # NOTE(review): assumes the server reports the generated token count as
        # "tokens_predicted" — confirm against the server version in use.
        n_tokens = body.get("tokens_predicted") if isinstance(body, dict) else None
        if n_tokens and dt > 0:
            token_rates.append(n_tokens / dt)
            summary += f" tokens={n_tokens} tokens/sec={n_tokens / dt:.1f}"
    print(summary)

if times:
    print("Average elapsed", sum(times) / len(times))
else:
    print("No successful runs - check that the server cell is running and reachable")
if token_rates:
    print(f"Average tokens/sec {sum(token_rates) / len(token_rates):.1f}")
print(
    "Tip: Tune --threads, try smaller quantizations (Q2/Q3), or increase --cache-ram to reduce latency."
)


# Quick re-run steps

1. Run the **Mount Google Drive** cell above.
2. If the signed-in account isn't the desired one, run the `unmount_drive()` helper then re-run the mount cell.
3. Run the **Diagnostic: Drive & account verification** cell and paste the output here.