<a href="https://colab.research.google.com/github/gitleon8301/MY-AI-Gizmo-working/blob/main/Colab-TextGen-GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# oobabooga/text-generation-webui

After running both cells, a public Gradio URL (ending in `.gradio.live`) will appear at the bottom of the second cell's output in around 10 minutes. You can optionally generate an API link.

* Project page: https://github.com/oobabooga/text-generation-webui
* Gradio server status: https://status.gradio.app/

In [None]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls></audio>

In [None]:
# ========================================
# PASTE THIS ENTIRE SCRIPT IN GOOGLE COLAB
# ========================================

import os, subprocess
from pathlib import Path

# Which GGUF checkpoint to fetch from the Hugging Face Hub (edit to taste).
MODEL_REPO = "bartowski/Llama-3.2-3B-Instruct-GGUF"
SPECIFIC_FILE = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# One directory per repository under /content/models, with the "/" in the
# repository id flattened to "_"; the model file lives directly inside it.
models_dir = Path("/content/models", MODEL_REPO.replace("/", "_"))
os.makedirs(models_dir, exist_ok=True)
model_path = models_dir.joinpath(SPECIFIC_FILE)

# Empty device list: this script is meant to run in CPU mode (see the banner
# printed below), and the setting is inherited by every child process.
os.environ.update(CUDA_VISIBLE_DEVICES="")

def run(cmd, check=False):
    """Echo a shell command, execute it, and return its exit status.

    Args:
        cmd: Command line passed to the system shell. Callers in this script
            only pass hard-coded strings; do not build it from untrusted input
            because it is executed with ``shell=True``.
        check: When true, raise ``subprocess.CalledProcessError`` if the
            command exits with a non-zero status. Defaults to ``False`` so
            existing callers keep their best-effort behaviour.

    Returns:
        The command's integer exit status (0 on success).

    Raises:
        subprocess.CalledProcessError: Only when ``check`` is true and the
            command fails.
    """
    print(f"\n>>> {cmd}\n")
    completed = subprocess.run(cmd, shell=True)
    if completed.returncode != 0:
        # Previously a failed command (e.g. a broken pip install) passed
        # silently; make it visible so later import errors can be traced back.
        print(f"!!! Command exited with status {completed.returncode}: {cmd}")
        if check:
            raise subprocess.CalledProcessError(completed.returncode, cmd)
    return completed.returncode

# Banner announcing that the chat runs in CPU mode.
print("=" * 70)
print("‚ö° LLAMA CHAT - CPU MODE")
print("=" * 70)

# Step 1: Download the model file unless a previous run already fetched it.
if not model_path.exists():
    print("üì• Step 1/4: Downloading model (2.3GB)...")
    run("pip install -q huggingface_hub")
    # Imported only after the install above so a fresh runtime can resolve it.
    from huggingface_hub import hf_hub_download
    # `resume_download` is deliberately not passed: huggingface_hub deprecated
    # it (interrupted downloads are resumed automatically), and because the
    # library is installed unpinned the argument may no longer be accepted.
    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=SPECIFIC_FILE,
        local_dir=str(models_dir),
    )
else:
    print("‚úì Model already downloaded")

# Steps 2 and 3: announce each dependency, then install it with pip.
# Each entry is (message shown to the user, shell command to execute).
install_steps = (
    (
        "\n‚ö° Step 2/4: Installing llama-cpp-python (pre-built, no compiling)...",
        # The extra index hosts pre-built CPU wheels, so nothing is compiled.
        "pip install -q llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu",
    ),
    (
        "\nüì¶ Step 3/4: Installing Gradio...",
        "pip install -q gradio",
    ),
)
for announcement, pip_command in install_steps:
    print(announcement)
    run(pip_command)

# Step 4: the chat app itself is generated and started further down.
print("\n‚úÖ Step 4/4: Creating chat interface...")

# Source code of the standalone chat app, written to /content/chat.py below.
# This is an f-string template: the model path is substituted now (as a
# properly quoted Python literal via repr), while doubled braces become
# literal braces in the generated file.
chat_code = f'''
import gradio as gr
from llama_cpp import Llama

print("üîÑ Loading model into memory...")
llm = Llama(
    model_path={str(model_path)!r},
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=0,
    verbose=False
)
print("‚úÖ Model loaded! Starting chat...")


def as_text(content):
    """Reduce a chat payload (str, content block, or list of blocks) to text."""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return as_text(content.get("text"))
    if isinstance(content, (list, tuple)):
        return "".join(as_text(part) for part in content)
    return ""


def history_to_messages(history):
    """Convert Gradio chat history into role/content messages for the model.

    Handles both layouts Gradio may pass: [user, assistant] pairs and
    role/content dicts. Entries without text are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            entries = [(turn.get("role"), turn.get("content"))]
        else:
            entries = zip(("user", "assistant"), turn)
        for role, content in entries:
            text = as_text(content)
            if role in ("user", "assistant", "system") and text:
                messages.append({{"role": role, "content": text}})
    return messages


def chat(message, history):
    """Stream the assistant reply, yielding the text accumulated so far."""
    messages = history_to_messages(history)
    messages.append({{"role": "user", "content": as_text(message)}})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        stream=True
    )

    partial = ""
    for chunk in response:
        choices = chunk.get("choices") or []
        if not choices:
            continue
        # Some chunks carry only a role or an empty delta; ignore those.
        piece = choices[0].get("delta", {{}}).get("content")
        if piece:
            partial += piece
            yield partial


interface_options = dict(
    fn=chat,
    title="ü¶ô Llama 3.2 3B Chat (CPU Mode)",
    description="Running on Colab CPU - Expect 2-4 tokens/second",
    examples=[
        "Hello! Tell me a joke",
        "Write a Python function to sort a list",
        "What is machine learning?"
    ],
)
try:
    # Button-label arguments are only accepted by older Gradio releases.
    demo = gr.ChatInterface(
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
        **interface_options
    )
except TypeError:
    # Newer Gradio rejects those keyword arguments; use its default buttons.
    demo = gr.ChatInterface(**interface_options)

print("\\n" + "=" * 70)
print("üåê CHAT IS READY!")
print("Look for the public URL below (ends with .gradio.live)")
print("=" * 70 + "\\n")

demo.launch(share=True)
'''

# Write explicitly as UTF-8: the generated source contains non-ASCII text.
Path("/content/chat.py").write_text(chat_code, encoding="utf-8")

print("\nüöÄ Starting chat interface...\n")
print("=" * 70)
os.system("python /content/chat.py")