<a href="https://colab.research.google.com/github/davidj4tech/sacred-brain/blob/main/llama_cpp_python_ngrok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepSeek via llama-cpp-python (CUDA) + ngrok — Colab (updated)

This notebook:
1) Verifies you have a GPU (T4 etc.)
2) Installs **llama-cpp-python with CUDA + server extras** (no compiling)
3) Downloads the DeepSeek GGUF model
4) Starts an **OpenAI-compatible** server on a fixed local port (default **8081**, chosen to avoid conflicts; anything already listening on that port is killed first)
5) Exposes it via **ngrok**

## Colab Secrets
- `NGROK_AUTH_TOKEN` (required)
- `HF_TOKEN` (optional; only for gated/private HF repos)


In [None]:
# 1) Confirm GPU + CUDA
!nvidia-smi -L
!nvcc --version || true
!free -h


In [None]:
# 2) Install llama-cpp-python CUDA wheel + server extras
# CUDA drivers in Colab may be 12.5; cu121 wheels generally work fine.

!pip -q uninstall -y llama-cpp-python || true
!pip -q install "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121

# Basic import sanity check
import llama_cpp
print('llama_cpp import OK')


In [None]:
# 3) (Optional) Hugging Face login (only needed for gated/private models)
!pip -q install huggingface_hub

from google.colab import userdata
from huggingface_hub import login

HF_TOKEN = userdata.get('HF_TOKEN')
if HF_TOKEN:
    login(token=HF_TOKEN)
    print('‚úÖ Logged into Hugging Face')
else:
    print('‚ÑπÔ∏è No HF_TOKEN set (fine for public repos)')


In [None]:
# 4) Download model GGUF
!mkdir -p /content/models

MODEL_URL = (
  'https://huggingface.co/Triangle104/DeepSeek-R1-Distill-Qwen-7B-uncensored-Q5_K_S-GGUF/'
  'resolve/main/deepseek-r1-distill-qwen-7b-uncensored-q5_k_s.gguf'
)
MODEL_PATH = '/content/models/deepseek.gguf'

!wget -q --show-progress -O "$MODEL_PATH" "$MODEL_URL"
!ls -lh /content/models


In [None]:
# 5) Free the port (default 8081) and start server in BACKGROUND
import subprocess, time
import requests

PORT = 8081
HOST = '127.0.0.1'
LOG = '/content/llama_server.log'
STARTUP_TIMEOUT_S = 120  # how long to wait for the server to answer
# Follow MODEL_PATH from the download cell when it exists in this kernel;
# otherwise fall back to the same default location.
MODEL_FILE = globals().get('MODEL_PATH', '/content/models/deepseek.gguf')

# Kill anything listening on our port + any previous llama_cpp.server
subprocess.run(f"fuser -k {PORT}/tcp", shell=True)
subprocess.run("pkill -f 'python -m llama_cpp.server'", shell=True)
time.sleep(1)

cmd = [
    'python', '-m', 'llama_cpp.server',
    '--model', MODEL_FILE,
    '--host', HOST,
    '--port', str(PORT),
    '--n_gpu_layers', '99',
    '--n_ctx', '2048',
]

# The child inherits the log file descriptor, so closing our handle right
# after Popen does not interrupt its logging.
with open(LOG, 'w') as f:
    p = subprocess.Popen(cmd, stdout=f, stderr=subprocess.STDOUT)

print('✅ Server starting (PID):', p.pid)
print('Log:', LOG)

# Readiness probe: poll the OpenAI-compatible model listing route rather than
# /health, which the llama_cpp.server app is not known to expose.
health_url = f'http://{HOST}:{PORT}/v1/models'
healthy = False
for _ in range(STARTUP_TIMEOUT_S):
    if p.poll() is not None:
        # The process already exited; waiting longer cannot help.
        break
    try:
        if requests.get(health_url, timeout=1).status_code == 200:
            healthy = True
            break
    except requests.exceptions.RequestException:
        pass  # not accepting connections yet; retry
    time.sleep(1)

if healthy:
    print('✅ Server healthy:', health_url)
else:
    if p.poll() is not None:
        print(f'⚠️ Server exited early (code {p.returncode}). Tail log:')
    else:
        print('⚠️ Server did not become healthy in time. Tail log:')
    # Print through Python so the text lands in the cell output however the
    # kernel treats child-process stdout.
    with open(LOG, errors='replace') as log_file:
        print(''.join(log_file.readlines()[-200:]))

print('\nGPU snapshot:')
gpu_snapshot = subprocess.run(
    'nvidia-smi | head -n 30', shell=True, capture_output=True, text=True
)
print(gpu_snapshot.stdout)


In [None]:
# 6) Expose via ngrok (matches PORT above)
!pip -q install pyngrok

from google.colab import userdata
from pyngrok import ngrok

NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
if not NGROK_AUTH_TOKEN:
    raise RuntimeError('Set NGROK_AUTH_TOKEN in Colab Secrets as NGROK_AUTH_TOKEN')

ngrok.set_auth_token(NGROK_AUTH_TOKEN)
public = ngrok.connect(8081, 'http')

print('üåç Public URL:', public.public_url)
print('OpenAI base:', public.public_url + '/v1')


In [None]:
# 7) Test locally (OpenAI-style)
import requests

# Built from HOST/PORT of the server cell so the test follows a port change.
base = f'http://{HOST}:{PORT}/v1'
payload = {
  'model': 'local',
  'messages': [
    {'role': 'user', 'content': 'Say hi in one sentence, then give 3 Linux backup tips.'}
  ],
  'temperature': 0.6
}

r = requests.post(base + '/chat/completions', json=payload, timeout=300)
print('HTTP', r.status_code)
if r.ok:
    print(r.json()['choices'][0]['message']['content'])
else:
    # Show the server's own error body before raising; indexing into the
    # JSON of a failed response would only produce an unrelated KeyError.
    print(r.text)
    r.raise_for_status()


In [None]:
# 8) Useful: show what's listening + tail logs
!lsof -iTCP:8081 -sTCP:LISTEN -n -P || true
!tail -n 200 /content/llama_server.log
