<a href="https://colab.research.google.com/github/gitleon8301/MY-AI-Gizmo-working/blob/main/Colab-TextGen-GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# oobabooga/text-generation-webui

After running both cells, a public gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.

* Project page: https://github.com/oobabooga/text-generation-webui
* Gradio server status: https://status.gradio.app/

In [None]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls>

In [None]:
#!/usr/bin/env python3
# ================================================================
# MY-AI-Gizmo • UNIVERSAL LAUNCHER
# - GPU or CPU choice at startup
# - Auto-downloads Qwen2.5-Coder-14B-Instruct Q4_K_M
# - Launches in DEBUG mode (--verbose, Debug character)
# - Fixes llama-cpp-binaries detection
# - Recovers public URL from log if missed live
# ================================================================

import os
import subprocess
import shutil
import re
import time
import threading
from pathlib import Path

try:
    from google.colab import drive as colab_drive
    IN_COLAB = True
except Exception:
    colab_drive = None
    IN_COLAB = False

# ---------- Configuration ----------
# Zip archive of the web UI repository fetched by download_repo_if_missing().
REPO_ZIP        = "https://github.com/gitleon8301/MY-AI-Gizmo-working/archive/refs/heads/main.zip"
# Local checkout location; the script chdir()s here before installing/launching.
WORK_DIR        = Path("/content/text-generation-webui")
# Root folder on the mounted Google Drive that holds models, logs and settings.
DRIVE_ROOT      = Path("/content/drive/MyDrive/MY-AI-Gizmo")
LOG_DIR         = DRIVE_ROOT / "logs"
# Exported as MPLCONFIGDIR for the installer and the server process.
MPL_CONFIG_DIR  = DRIVE_ROOT / "matplotlib"
# Seconds of subprocess silence before stream_with_heartbeat() prints a notice.
HEARTBEAT_INTERVAL = 30
# File that receives the most recently detected server URL.
PUBLIC_URL_FILE = DRIVE_ROOT / "public_url.txt"

# Model to auto-download
MODEL_REPO = "Qwen/Qwen2.5-Coder-14B-Instruct-GGUF"
MODEL_FILE = "qwen2.5-coder-14b-instruct-q4_k_m.gguf"

# Set dynamically after user picks mode
# (choose_mode() overwrites all three; the values below are the GPU defaults.)
GPU_LAYERS = -1
N_CTX      = 4096
USE_GPU    = True
# -----------------------------------

def sh(cmd, cwd=None, env=None, check=False):
    """Run *cmd* through the shell and return the finished process.

    stdout and stderr are captured as text on the returned
    ``subprocess.CompletedProcess``. With ``check=True`` a non-zero exit
    status raises ``subprocess.CalledProcessError``.
    """
    run_options = {
        "shell": True,
        "cwd": cwd,
        "env": env,
        "capture_output": True,
        "text": True,
        "check": check,
    }
    return subprocess.run(cmd, **run_options)

def stream_with_heartbeat(cmd, cwd=None, env=None, logfile_path=None, capture_url_to=None):
    """Run *cmd* in a shell, echo its output live and watch it for server URLs.

    stdout and stderr of the child are merged, printed line by line and,
    when *logfile_path* is given, appended to that file. A background thread
    prints a "[heartbeat]" notice whenever the child has been silent for
    ``HEARTBEAT_INTERVAL`` seconds.

    Every output line is matched against a list of URL patterns. A URL that
    contains ``gradio.live`` always replaces the captured URL; any other
    match (``gradio.app`` or a local address) is kept only if nothing was
    captured before. Each newly captured URL is written to *capture_url_to*
    when that argument is given.

    Args:
        cmd: Shell command line to execute.
        cwd: Working directory for the child process.
        env: Environment mapping for the child process.
        logfile_path: Optional path of a log file to append to.
        capture_url_to: Optional path of a file that receives the URL.

    Returns:
        ``(returncode, captured_url)``; ``captured_url`` is ``None`` when no
        pattern matched.
    """
    proc = subprocess.Popen(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        cwd=cwd, env=env, text=True, bufsize=1,
        # A stray undecodable byte must not abort the read loop: if it did,
        # proc.wait() below could block forever on a child stuck writing to
        # a pipe nobody drains.
        errors="replace",
    )
    last_output  = time.time()
    stop         = threading.Event()
    captured_url = None

    url_patterns = [
        re.compile(r'Running on public URL:\s*(https?://[^\s]+\.gradio\.live[^\s,)\'\"]*)', re.IGNORECASE),
        re.compile(r'Public URL:\s*(https?://[^\s]+\.gradio\.live[^\s,)\'\"]*)',            re.IGNORECASE),
        re.compile(r'(https?://[a-zA-Z0-9\-]+\.gradio\.live[^\s,)\'\"]*)',                 re.IGNORECASE),
        re.compile(r'(https?://[^\s]+\.gradio\.app[^\s,)\'\"]*)',                           re.IGNORECASE),
        re.compile(r'Running on local URL:\s*(https?://[^\s]+:[0-9]+)',                     re.IGNORECASE),
        re.compile(r'(https?://(?:localhost|127\.0\.0\.1|0\.0\.0\.0):[0-9]+)',              re.IGNORECASE),
    ]

    # One shared, lock-protected handle for both the reader loop and the
    # heartbeat thread keeps log lines in the order they were printed.
    logfile = None
    if logfile_path:
        try:
            logfile = open(logfile_path, "a", encoding="utf-8")
        except Exception:
            logfile = None
    log_lock = threading.Lock()

    def write_log(text):
        """Append *text* to the log and flush; logging is best effort."""
        if logfile is None:
            return
        with log_lock:
            try:
                logfile.write(text)
                # Flush per line so the log is complete even if the runtime
                # is killed while the server is still running.
                logfile.flush()
            except Exception:
                pass

    def save_url(url):
        """Persist *url* to capture_url_to, ignoring write failures."""
        if not capture_url_to:
            return
        try:
            Path(capture_url_to).write_text(url, encoding="utf-8")
        except Exception:
            pass

    def heartbeat():
        # Event.wait() returns False on timeout, True once stop is set.
        while not stop.wait(HEARTBEAT_INTERVAL):
            if time.time() - last_output >= HEARTBEAT_INTERVAL:
                msg = f"[heartbeat] still working... (~{HEARTBEAT_INTERVAL}s silence)\n"
                print(msg, end='')
                write_log(msg)

    hb = threading.Thread(target=heartbeat, daemon=True)
    hb.start()

    try:
        for line in proc.stdout:
            last_output = time.time()
            print(line, end='')
            write_log(line)
            for pat in url_patterns:
                m = pat.search(line)
                if not m:
                    continue
                candidate = m.group(1).rstrip(').,\'"')
                if 'gradio.live' in candidate.lower():
                    # A public share link always wins over earlier matches.
                    captured_url = candidate
                    print(f"\n{'='*70}")
                    print(f"üåê PUBLIC URL FOUND: {captured_url}")
                    print(f"{'='*70}\n")
                    save_url(captured_url)
                    break
                elif not captured_url:
                    # Local / gradio.app address: keep only the first one.
                    captured_url = candidate
                    print(f"\nüîó URL DETECTED: {captured_url}\n")
                    save_url(captured_url)
    except Exception as e:
        print(f"[stream error] {e}")
    finally:
        proc.wait()
        stop.set()
        hb.join(timeout=1)
        if logfile:
            with log_lock:
                try:
                    logfile.close()
                except Exception:
                    pass

    return proc.returncode, captured_url

# ─────────────────────────────────────────────────────────────────
def ensure_dirs():
    """Create the Drive root, log and matplotlib folders if they are missing.

    Creation is best effort: a folder that cannot be created is skipped
    silently and the remaining ones are still attempted.
    """
    required = [DRIVE_ROOT, LOG_DIR, MPL_CONFIG_DIR]
    for folder in required:
        try:
            folder.mkdir(parents=True, exist_ok=True)
        except Exception:
            continue

def download_repo_if_missing():
    """Fetch and unpack the web UI repository unless WORK_DIR already exists.

    The archive at ``REPO_ZIP`` is downloaded to ``/content/repo.zip`` with
    ``wget`` (falling back to ``curl``), extracted into ``/content`` and the
    resulting ``MY-AI-Gizmo-working-*`` folder is renamed to ``WORK_DIR``.

    Returns:
        True when ``WORK_DIR`` is available afterwards, False on any failure
        (a message describing the failure is printed).
    """
    if WORK_DIR.exists():
        print(f"[info] WORK_DIR exists: {WORK_DIR}")
        return True

    tmp_zip = Path("/content/repo.zip")
    try:
        # Drop a stale archive from an earlier, possibly truncated attempt.
        tmp_zip.unlink()
    except OSError:
        pass

    print("[info] Downloading repository...")
    for cmd in (
        f"wget -q -O {tmp_zip} {REPO_ZIP}",
        f"curl -s -L -o {tmp_zip} {REPO_ZIP}",
    ):
        result = sh(cmd)
        # Require a plausible size as well: a tiny file is an error page.
        if result.returncode == 0 and tmp_zip.exists() and tmp_zip.stat().st_size > 1000:
            break
    else:
        print("[error] Download failed.")
        return False

    print("[info] Extracting...")
    unzip = sh(f"unzip -q {tmp_zip} -d /content")
    if unzip.returncode != 0:
        # Surface the problem instead of hiding it; whether extraction was
        # usable is still decided by the folder check below.
        details = (unzip.stderr or unzip.stdout or "").strip()
        print(f"[warn] unzip exited with code {unzip.returncode}: {details}")

    # Only a directory can become WORK_DIR; ignore same-prefixed files.
    found = next(
        (p for p in Path("/content").glob("MY-AI-Gizmo-working-*") if p.is_dir()),
        None,
    )
    if not found:
        print("[error] Extracted folder not found.")
        return False

    try:
        found.rename(WORK_DIR)
    except OSError as e:
        print(f"[error] Could not move {found} to {WORK_DIR}: {e}")
        return False
    print("[info] Repo extracted to", WORK_DIR)
    return True

def ensure_symlinks_and_files():
    """Point the web UI's data folders and settings files at Google Drive.

    For every mapping entry the Drive-side target is created first (an empty
    file for settings entries, a directory otherwise). Whatever currently
    sits at the local path is then removed and replaced by a symlink to the
    Drive target. If the symlink cannot be created, the Drive content is
    copied to the local path instead. Removal, linking and copying failures
    are ignored.
    """
    # (path under WORK_DIR, path under DRIVE_ROOT, entry is a settings file)
    mapping = (
        ("models",                  "models",                 False),
        ("loras",                   "loras",                  False),
        ("user_data/characters",    "characters",             False),
        ("user_data/presets",       "presets",                False),
        ("user_data/settings.yaml", "settings/settings.yaml", True),
        ("user_data/settings.json", "settings/settings.json", True),
        ("user_data/chat",          "chat-history",           False),
        ("outputs",                 "outputs",                False),
    )
    for rel_local, rel_drive, settings_file in mapping:
        target = DRIVE_ROOT / rel_drive
        if not settings_file:
            target.mkdir(parents=True, exist_ok=True)
        else:
            target.parent.mkdir(parents=True, exist_ok=True)
            if not target.exists():
                try:
                    target.write_text("", encoding="utf-8")
                except Exception:
                    pass

        link = WORK_DIR / rel_local

        # Clear the local path. The symlink test comes first so that a link
        # to a directory is unlinked rather than having its target deleted.
        try:
            if link.is_symlink():
                link.unlink()
            elif link.is_dir():
                shutil.rmtree(link)
            elif link.exists():
                link.unlink()
        except Exception:
            pass

        try:
            link.parent.mkdir(parents=True, exist_ok=True)
            os.symlink(
                str(target),
                str(link),
                target_is_directory=target.is_dir(),
            )
        except Exception:
            # Symlink unavailable: fall back to a plain copy of the Drive data.
            try:
                if target.is_dir():
                    shutil.copytree(target, link, dirs_exist_ok=True)
                else:
                    link.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(target, link)
            except Exception:
                pass

def prepare_settings_file():
    """Write settings.yaml to the local user_data folder and to Drive.

    The file selects the llama.cpp loader, the Debug character and
    MODEL_FILE, and embeds the current N_CTX / GPU_LAYERS values. Write
    errors are reported as a warning instead of being raised.
    """
    on_drive = DRIVE_ROOT / "settings" / "settings.yaml"
    in_workdir = WORK_DIR / "user_data" / "settings.yaml"
    in_workdir.parent.mkdir(parents=True, exist_ok=True)
    if USE_GPU:
        mode_label = "GPU"
    else:
        mode_label = "CPU"
    content = f"""# MY-AI-Gizmo Settings ‚Äî {mode_label} / Debug mode
listen: true
share: true
auto_launch: false

loader: llama.cpp
n_ctx: {N_CTX}
n_batch: 512
n_gpu_layers: {GPU_LAYERS}
threads: 4

character: Debug
model: {MODEL_FILE}
chat_style: cai-chat

api: true
api_port: 5000
"""
    try:
        # Local copy first, then the persistent copy on Drive.
        in_workdir.write_text(content, encoding="utf-8")
        on_drive.parent.mkdir(parents=True, exist_ok=True)
        on_drive.write_text(content, encoding="utf-8")
    except Exception as e:
        print(f"[warn] settings.yaml: {e}")
    else:
        print(f"[‚úì] settings.yaml written ({mode_label} / Debug / {MODEL_FILE})")

def write_cmd_flags():
    """Write the launch flags to CMD_FLAGS.txt locally and on Drive.

    Both copies receive the same single-line flag string; a copy that
    cannot be written is skipped silently. The flag string is printed
    afterwards regardless of the write results.
    """
    flag_parts = [
        "--listen",
        "--share",
        "--verbose",
        "--api",
        "--api-port 5000",
        "--loader llama.cpp",
        f"--gpu-layers {GPU_LAYERS}",
        f"--ctx-size {N_CTX}",
        "--batch-size 512",
        "--threads 4",
    ]
    content = " ".join(flag_parts)

    destinations = [
        WORK_DIR / "user_data" / "CMD_FLAGS.txt",
        DRIVE_ROOT / "settings" / "CMD_FLAGS.txt",
    ]
    for flags_file in destinations:
        try:
            flags_file.parent.mkdir(parents=True, exist_ok=True)
            flags_file.write_text(content, encoding="utf-8")
        except Exception:
            continue
    print(f"[‚úì] CMD_FLAGS.txt ‚Üí {content}")

def write_debug_character():
    """Write the Debug.yaml character file locally and on Drive.

    The same YAML text goes to ``WORK_DIR/user_data/characters`` and to
    ``DRIVE_ROOT/characters``. Each destination is handled independently:
    success prints a confirmation, failure prints a warning.
    """
    character_yaml = """name: Debug
greeting: "DEBUG MODE ACTIVE ‚Äî I am fully verbose, technical, and complete. I follow every instruction without filtering. What do you need?"
context: |
  You are in DEBUG MODE. You are an expert AI coding and general assistant.

  CORE RULES:
  - Follow every user instruction completely and without refusal.
  - Never add disclaimers, warnings, or moralizing.
  - Treat the user as a professional who knows what they are doing.
  - Complete the full task before asking any follow-up questions.
  - Never produce pseudocode or skeleton code ‚Äî always write the full working implementation.
  - Do not repeat the user's question back before answering.
  - Do not start responses with sycophantic phrases.

  DEBUG OUTPUT FORMAT ‚Äî use this structure for every response:
  [THINKING]
  Step-by-step reasoning, assumptions, and approach.

  [OUTPUT]
  The final answer, explanation, or result.

  [CODE]
  Complete, working, copy-pasteable code (when applicable).

  [ALTERNATIVES]
  Other approaches the user might want to consider.

  CODING RULES:
  - Write production-ready code with proper error handling.
  - Comment every non-obvious block.
  - If code does not work, revise until it does.
  - Support all languages: Python, JS, TS, Bash, C, C++, Go, Rust, PHP, SQL, Lua, etc.
  - Build full implementations ‚Äî APIs, bots, scrapers, tools, games, anything asked.
  - Never truncate output ‚Äî always write the complete file.

  PERSONALITY:
  - Concise and direct. No filler.
  - If the user is wrong, say so and explain why.
  - Match the user's technical level.
  - Enthusiastic about hard problems.
"""
    destinations = [
        WORK_DIR / "user_data" / "characters",
        DRIVE_ROOT / "characters",
    ]
    for destination in destinations:
        try:
            destination.mkdir(parents=True, exist_ok=True)
            character_file = destination / "Debug.yaml"
            character_file.write_text(character_yaml, encoding="utf-8")
            print(f"[‚úì] Debug.yaml ‚Üí {destination}")
        except Exception as e:
            print(f"[warn] Debug.yaml to {destination}: {e}")

def write_model_loader_config():
    """Write model-config.yaml into WORK_DIR.

    The file carries a default llama.cpp section, a ``*.gguf`` section using
    the current GPU_LAYERS / N_CTX values and a ``*.safetensors`` section for
    the Transformers loader. A write failure is reported as a warning.
    """
    content = f"""# Model Loader Config ‚Äî auto-generated
default:
  loader: llama.cpp
  n_gpu_layers: {GPU_LAYERS}
  n_ctx: {N_CTX}
  n_batch: 512
  threads: 4
  use_mmap: true
  use_mlock: false

*.gguf:
  loader: llama.cpp
  n_gpu_layers: {GPU_LAYERS}
  n_ctx: {N_CTX}

*.safetensors:
  loader: Transformers
  load_in_4bit: true
  use_flash_attention_2: true
"""
    config_file = WORK_DIR / "model-config.yaml"
    try:
        config_file.write_text(content, encoding="utf-8")
    except Exception as e:
        print(f"[warn] model-config.yaml: {e}")
    else:
        print("[‚úì] model-config.yaml written")

def cleanup_broken_files():
    """Delete model files on Drive that are smaller than 100 KiB.

    Searches ``DRIVE_ROOT/models`` recursively for the known model file
    extensions. Files whose size cannot be read are left alone, and a
    failed deletion is ignored. Does nothing when the folder is absent.
    """
    models_dir = DRIVE_ROOT / "models"
    if not models_dir.exists():
        return

    min_bytes = 100 * 1024
    stubs = []
    for pattern in ("*.gguf", "*.safetensors", "*.bin", "*.pth", "*.pt"):
        for candidate in models_dir.rglob(pattern):
            try:
                too_small = candidate.stat().st_size < min_bytes
            except Exception:
                continue
            if too_small:
                stubs.append(candidate)

    if not stubs:
        return

    print(f"[info] Removing {len(stubs)} broken/incomplete model file(s)")
    for stub in stubs:
        try:
            stub.unlink()
        except Exception:
            continue

def download_model_if_missing():
    """Download MODEL_FILE from MODEL_REPO into DRIVE_ROOT/models if needed.

    An existing file larger than 100 MiB counts as already downloaded.
    Otherwise the file is fetched with ``wget`` and then ``curl`` into a
    temporary ``<name>.part`` file and only renamed to its final name after
    a successful transfer. An interrupted download therefore never leaves a
    truncated file under the real model name, where the next run's size
    check would mistake it for a complete model.

    Returns:
        True when the model file is present afterwards, False when every
        download attempt failed (manual instructions are printed).
    """
    models_dir = DRIVE_ROOT / "models"
    models_dir.mkdir(parents=True, exist_ok=True)
    model_path = models_dir / MODEL_FILE
    part_path = models_dir / (MODEL_FILE + ".part")
    min_size = 100 * 1024 * 1024

    if model_path.exists() and model_path.stat().st_size > min_size:
        size_gb = model_path.stat().st_size / (1024 ** 3)
        print(f"[‚úì] Model already exists ({size_gb:.1f} GB): {model_path}")
        return True

    print(f"\n{'='*70}")
    print(f"üì• DOWNLOADING: {MODEL_FILE}")
    print(f"   Repo : {MODEL_REPO}")
    print(f"   Dest : {model_path}")
    print(f"   Size : ~9 GB ‚Äî this will take several minutes")
    print(f"{'='*70}\n")

    hf_url = (
        f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}?download=true"
    )

    for cmd in (
        f'wget -q --show-progress -O "{part_path}" "{hf_url}"',
        f'curl -L --progress-bar -o "{part_path}" "{hf_url}"',
    ):
        tool = cmd.split()[0]
        print(f"[info] Trying {tool}...")
        # Not captured on purpose: the tools draw their own progress bars.
        result = subprocess.run(cmd, shell=True)
        if (result.returncode == 0
                and part_path.exists()
                and part_path.stat().st_size > min_size):
            try:
                # Publish the finished download under its real name.
                part_path.replace(model_path)
            except OSError as e:
                print(f"[warn] could not move download into place: {e}")
            else:
                size_gb = model_path.stat().st_size / (1024 ** 3)
                print(f"[‚úì] Download complete ‚Äî {size_gb:.2f} GB")
                return True
        print(f"[warn] {tool} failed, trying next...")
        try:
            part_path.unlink()
        except OSError:
            pass

    print("[error] All download attempts failed.")
    print(f"  Manual: download {MODEL_FILE} from")
    print(f"  https://huggingface.co/{MODEL_REPO}")
    print(f"  and place it in: {models_dir}")
    return False

# ─────────────────────────────────────────────────────────────────
def install_llama_cpp_python_cpu():
    """Reinstall llama-cpp-python inside the web UI environment for CPU use.

    Returns early when the environment's Python interpreter does not exist.
    Otherwise the existing llama-cpp-python packages are uninstalled and
    llama-cpp-python is force-reinstalled with CMAKE_ARGS that switch the
    CUDA, cuBLAS, Metal and OpenCL options off and OpenBLAS on. The pip
    result is reported; nothing is raised on failure.
    """
    print("\nüîß Installing llama-cpp-python (CPU)...")
    venv_python = WORK_DIR / "installer_files" / "env" / "bin" / "python"
    if not venv_python.exists():
        print("[info] Venv not ready ‚Äî installer will handle it")
        return

    python_exe = str(venv_python)
    sh(f'"{python_exe}" -m pip uninstall -y llama-cpp-python llama-cpp-python-cuda')

    build_env = os.environ.copy()
    build_env['CMAKE_ARGS'] = (
        '-DLLAMA_CUDA=OFF -DLLAMA_CUBLAS=OFF -DLLAMA_METAL=OFF '
        '-DLLAMA_OPENCL=OFF -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
    )
    build_env['FORCE_CMAKE'] = '1'
    build_env['CUDACXX'] = ''

    outcome = sh(
        f'"{python_exe}" -m pip install llama-cpp-python --no-cache-dir --force-reinstall',
        env=build_env
    )
    if outcome.returncode == 0:
        print("[‚úì] CPU install done")
    else:
        print(f"[warn] CPU install code {outcome.returncode}")

def _pip_show_reports_cuda(pip_show_output):
    """Return True when ``pip show`` output has a ``+cu…`` tag on its Version line."""
    for field in pip_show_output.splitlines():
        key, _, value = field.partition(":")
        if key.strip().lower() == "version":
            return "+cu" in value.strip().lower()
    return False


def install_llama_cpp_python_gpu():
    """Make sure a llama.cpp build is installed in the web UI environment.

    Returns early when the environment's Python interpreter does not exist
    or when ``pip show llama-cpp-binaries`` reports a version carrying a
    ``+cu…`` local tag. The check looks at the Version field only: searching
    the whole ``pip show`` text for "cu" would also match unrelated text in
    any other field and wrongly skip the install.

    Otherwise these installs are tried in order until one succeeds:
      1. ``llama-cpp-binaries`` using the extra wheel index for the detected
         CUDA version (taken from ``nvcc --version``, default ``cu121``);
      2. pinned ``llama-cpp-python`` release wheels for the detected Python
         tag (default ``cp311``);
      3. building ``llama-cpp-python`` from source with CUDA CMake flags.
    A warning is printed when all of them fail; nothing is raised.
    """
    print("\nüîß Checking llama-cpp GPU support...")
    env_marker = WORK_DIR / "installer_files" / "env" / "bin" / "python"
    if not env_marker.exists():
        print("[info] Venv not ready ‚Äî installer will handle it")
        return
    python_exe = str(env_marker)

    # Skip if already installed with CUDA
    check = sh(f'"{python_exe}" -m pip show llama-cpp-binaries')
    if check.returncode == 0 and _pip_show_reports_cuda(check.stdout):
        info = [row for row in check.stdout.splitlines() if 'Version' in row or 'Name' in row]
        print(f"[‚úì] llama-cpp-binaries (CUDA) already installed ‚Äî {info}")
        return

    # Detect Python version
    pv  = sh(f'"{python_exe}" -c "import sys; print(f\'cp{{sys.version_info.major}}{{sys.version_info.minor}}\')"')
    # Use the default tag as well when the probe succeeded but printed nothing.
    py_tag = (pv.stdout.strip() if pv.returncode == 0 else "") or "cp311"
    print(f"[info] Python tag : {py_tag}")

    # Detect CUDA version
    cuda_major, cuda_minor = "12", "1"
    cv = sh("nvcc --version")
    if cv.returncode == 0:
        m = re.search(r'release (\d+)\.(\d+)', cv.stdout)
        if m:
            cuda_major, cuda_minor = m.group(1), m.group(2)
    cuda_tag = f"cu{cuda_major}{cuda_minor}"
    print(f"[info] CUDA tag   : {cuda_tag}")

    # Try llama-cpp-binaries CUDA index
    print("[info] Trying llama-cpp-binaries CUDA index...")
    result = sh(
        f'"{python_exe}" -m pip install llama-cpp-binaries '
        f'--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/{cuda_tag} '
        f'--no-cache-dir'
    )
    if result.returncode == 0:
        print("[‚úì] llama-cpp-binaries (CUDA) installed")
        return

    # Try pre-built wheels for our Python version
    print("[info] Trying pre-built wheels...")
    for version in ["0.3.2", "0.2.90", "0.2.79"]:
        wheel_url = (
            f"https://github.com/abetlen/llama-cpp-python/releases/download/"
            f"v{version}/llama_cpp_python-{version}-{py_tag}-{py_tag}-linux_x86_64.whl"
        )
        result = sh(f'"{python_exe}" -m pip install "{wheel_url}" --no-cache-dir')
        if result.returncode == 0:
            print(f"[‚úì] llama-cpp-python v{version} installed")
            return
        print(f"[info] v{version} failed, trying next...")

    # Compile from source
    print("[info] Compiling from source with CUDA (~5 min)...")
    gpu_env = os.environ.copy()
    gpu_env.update({'CMAKE_ARGS': '-DLLAMA_CUBLAS=ON -DLLAMA_CUDA=ON', 'FORCE_CMAKE': '1'})
    result = sh(
        f'"{python_exe}" -m pip install llama-cpp-python --no-cache-dir --force-reinstall',
        env=gpu_env
    )
    if result.returncode == 0:
        print("[‚úì] llama-cpp-python compiled with CUDA")
    else:
        print("[warn] All GPU attempts failed ‚Äî llama.cpp will use CPU")
        print("       ExLlamav2/Transformers still use GPU normally")

def create_llama_cpp_binaries_wrapper():
    """Write a ``llama_cpp_binaries.py`` shim into WORK_DIR/modules.

    The generated module exposes ``get_binary_path()``, which searches PATH
    and a few known folders for a llama server executable and falls back to
    the string "PYTHON_SERVER", and ``ensure_binary()``. A failure to write
    the file is reported as an error message, not raised.
    """
    print("\nüîß Creating llama_cpp_binaries wrapper...")
    module_source = '''"""Compatibility wrapper for llama_cpp_binaries."""
import os, shutil
from pathlib import Path

def get_binary_path():
    search_paths = []
    try:
        import llama_cpp
        search_paths.append(Path(llama_cpp.__file__).parent / "bin")
    except ImportError:
        pass
    binary = shutil.which("llama-server") or shutil.which("llama-cpp-server")
    if binary:
        return binary
    repo_dir = Path(__file__).parent.parent / "repositories" / "llama.cpp"
    if repo_dir.exists():
        search_paths += [repo_dir / "build" / "bin", repo_dir / "build", repo_dir]
    installer_dir = Path(__file__).parent.parent / "installer_files"
    if installer_dir.exists():
        search_paths.append(installer_dir / "env" / "bin")
    for sp in search_paths:
        if not sp.exists():
            continue
        for name in ["llama-server", "llama-cpp-server", "server"]:
            for ext in ["", ".exe"]:
                p = sp / f"{name}{ext}"
                if p.exists() and (os.access(p, os.X_OK) or ext == ".exe"):
                    return str(p)
    return "PYTHON_SERVER"

def ensure_binary():
    try:
        return get_binary_path() is not None
    except Exception:
        return False
'''
    target_dir = WORK_DIR / "modules"
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        shim_file = target_dir / "llama_cpp_binaries.py"
        shim_file.write_text(module_source, encoding="utf-8")
    except Exception as e:
        print(f"[error] wrapper: {e}")
    else:
        print("[‚úì] llama_cpp_binaries.py created")

def _inject_share_kwarg(source):
    """Return *source* with ``share=True`` appended to every ``.launch(...)`` call.

    The closing parenthesis is located by counting nested parentheses, so
    empty argument lists, nested calls and multi-line calls stay valid
    Python. Parentheses inside string literals or comments are not treated
    specially; a call with no balanced closing parenthesis is left untouched.
    """
    marker = ".launch("
    pieces = []
    cursor = 0
    while True:
        hit = source.find(marker, cursor)
        if hit == -1:
            break
        args_start = hit + len(marker)
        depth = 1
        idx = args_start
        while idx < len(source) and depth:
            if source[idx] == "(":
                depth += 1
            elif source[idx] == ")":
                depth -= 1
            idx += 1
        if depth:
            # Unbalanced call: keep the remainder exactly as it is.
            break
        close = idx - 1
        args = source[args_start:close]
        stripped = args.rstrip()
        trailing_ws = args[len(stripped):]
        if not stripped:
            new_args = "share=True" + trailing_ws
        elif stripped.endswith(","):
            new_args = stripped + " share=True" + trailing_ws
        else:
            new_args = stripped + ", share=True" + trailing_ws
        pieces.append(source[cursor:args_start])
        pieces.append(new_args)
        cursor = close
    pieces.append(source[cursor:])
    return "".join(pieces)


def patch_gradio_launch(server_py=None):
    """Add ``share=True`` to the ``.launch(...)`` calls of server.py.

    The file is only modified when it contains a ``.launch(`` call and no
    ``share=`` text at all. Unlike a regex substitution on ``\\((.*?)\\)``,
    the rewrite copes with ``.launch()`` (which must not become
    ``.launch(, share=True)``) and with arguments that contain parentheses.

    Args:
        server_py: Path of the file to patch. Defaults to
            ``WORK_DIR / "server.py"``.

    A missing file is a no-op; read/write errors are printed as a warning.
    """
    if server_py is None:
        server_py = WORK_DIR / "server.py"
    server_py = Path(server_py)
    if not server_py.exists():
        return
    try:
        content = server_py.read_text(encoding="utf-8")
        if '.launch(' in content and 'share=' not in content:
            patched = _inject_share_kwarg(content)
            if patched != content:
                server_py.write_text(patched, encoding="utf-8")
                print("[‚úì] server.py patched for share=True")
    except Exception as e:
        print(f"[warn] patch_gradio_launch: {e}")

# ─────────────────────────────────────────────────────────────────
def choose_mode():
    """Ask the user for GPU or CPU mode and set the matching globals.

    Keeps prompting until "1" (GPU) or "2" (CPU) is entered, then updates
    USE_GPU, GPU_LAYERS and N_CTX and prints the chosen configuration.
    """
    global USE_GPU, GPU_LAYERS, N_CTX
    rule = "=" * 70
    print("\n" + rule)
    print("  MY-AI-Gizmo ‚Äî Choose Your Mode")
    print(rule)
    print("  [1]  GPU  ‚Äî Faster, requires CUDA GPU (Colab T4/A100)")
    print("  [2]  CPU  ‚Äî Slower, works on any machine")
    print(rule)

    # answer -> (USE_GPU, GPU_LAYERS, N_CTX, confirmation message)
    presets = {
        "1": (True, -1, 4096, "\n  GPU mode ‚Äî n_gpu_layers=-1, n_ctx=4096"),
        "2": (False, 0, 2048, "\n  CPU mode ‚Äî n_gpu_layers=0, n_ctx=2048"),
    }
    selected = None
    while selected is None:
        answer = input("\n  Enter 1 for GPU or 2 for CPU: ").strip()
        selected = presets.get(answer)
        if selected is None:
            print("  Please enter 1 or 2.")

    USE_GPU, GPU_LAYERS, N_CTX, confirmation = selected
    print(confirmation)
    print(rule + "\n")

# ═════════════════════════════════════════════════════════════════
#  MAIN
# ═════════════════════════════════════════════════════════════════
# Startup: banner, mode selection, GPU probe, Drive mount, repository fetch.
print("=" * 70)
print("  MY-AI-Gizmo Universal Launcher")
print(f"  Model  : {MODEL_FILE}")
print("  Mode   : DEBUG (verbose, full reasoning, no filtering)")
print("=" * 70)

choose_mode()

if USE_GPU:
    gpu_check = sh("nvidia-smi --query-gpu=name,memory.total --format=csv,noheader")
    if gpu_check.returncode == 0:
        print(f"[‚úì] GPU: {gpu_check.stdout.strip()}")
    else:
        print("[warn] nvidia-smi failed ‚Äî set Runtime ‚Üí GPU in Colab")

# Mount Drive BEFORE creating anything below /content/drive. Creating
# DRIVE_ROOT first would leave a non-empty mountpoint, which Colab's
# drive.mount() refuses; all "Drive" data would then silently end up on the
# runtime's temporary disk.
if IN_COLAB:
    try:
        print("[info] Mounting Google Drive...")
        colab_drive.mount("/content/drive", force_remount=False)
        print("[‚úì] Google Drive mounted")
    except Exception as e:
        print(f"[warn] Drive: {e}")

ensure_dirs()

cleanup_broken_files()

if not download_repo_if_missing() and not WORK_DIR.exists():
    raise SystemExit("Repository unavailable.")

# Everything below runs from inside the repository checkout.
os.chdir(WORK_DIR)

# Link data folders to Drive and generate the configuration files.
for prepare_step in (
    ensure_symlinks_and_files,
    prepare_settings_file,
    write_cmd_flags,
    write_debug_character,
    write_model_loader_config,
):
    prepare_step()

# -- Download model --
print("\n" + "=" * 70)
print("üì• Checking Qwen2.5-Coder-14B model...")
print("=" * 70)
download_model_if_missing()

# -- Install dependencies --
MPL_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
start_sh      = WORK_DIR / "start_linux.sh"
installer_log = LOG_DIR / f"installer_{int(time.time())}.log"
env_marker    = WORK_DIR / "installer_files" / "env" / "bin" / "python"

# Variables shared by both modes first, then the mode-specific ones.
install_env = os.environ.copy()
install_env.update({
    "MPLBACKEND": "Agg",
    "MPLCONFIGDIR": str(MPL_CONFIG_DIR),
    "LAUNCH_AFTER_INSTALL": "FALSE",
    "INSTALL_EXTENSIONS": "FALSE",
    "FORCE_CMAKE": "1",
    "SKIP_TORCH_TEST": "TRUE",
})
if USE_GPU:
    install_env.update({
        "GPU_CHOICE": "A",
        "CMAKE_ARGS": "-DLLAMA_CUBLAS=ON -DLLAMA_CUDA=ON",
        "FORCE_CUDA": "TRUE",
    })
    print("\nüì¶ Installing dependencies (GPU)...")
else:
    install_env.update({
        "GPU_CHOICE": "N",
        "CMAKE_ARGS": "-DLLAMA_CUDA=OFF -DLLAMA_CUBLAS=OFF -DLLAMA_METAL=OFF",
        "CUDA_VISIBLE_DEVICES": "",
        "CUDACXX": "",
        "FORCE_CUDA": "FALSE",
    })
    print("\nüì¶ Installing dependencies (CPU)...")

print(f"Installer log ‚Üí {installer_log}")

if not start_sh.exists():
    raise SystemExit("[error] start_linux.sh not found.")

sh("chmod +x start_linux.sh")

# The environment's interpreter doubles as the "already installed" marker.
if env_marker.exists():
    print("[info] Venv exists ‚Äî skipping installer")
else:
    print("[info] First run ‚Äî installing (5-10 min)...")
    code, _ = stream_with_heartbeat(
        "bash start_linux.sh",
        cwd=str(WORK_DIR),
        env=install_env,
        logfile_path=str(installer_log),
    )
    print(f"[{'‚úì' if code == 0 else 'warn'}] Installer exited with code {code}")

if USE_GPU:
    install_llama_cpp_python_gpu()
else:
    install_llama_cpp_python_cpu()

create_llama_cpp_binaries_wrapper()
patch_gradio_launch()

# -- Build debug launch wrapper --
# Generates WORK_DIR/_launch_debug.py: a small script that sets the
# matplotlib/Gradio environment variables, appends the launch flags to
# sys.argv and then runs server.py via runpy as if it were the main script.
launch_wrapper = WORK_DIR / "_launch_debug.py"
mode_label     = "GPU" if USE_GPU else "CPU"
# In CPU mode the generated script additionally clears CUDA_VISIBLE_DEVICES.
cuda_block     = "" if USE_GPU else "\nos.environ['CUDA_VISIBLE_DEVICES'] = ''"

# NOTE(review): the generated script appends each flag token only when that
# exact token is not already in sys.argv, so two options sharing the same
# value string would lose the second value. The values used here are distinct.
launch_code = f"""#!/usr/bin/env python3
# Auto-generated DEBUG launcher ‚Äî {mode_label}
import sys, os
{cuda_block}
os.environ['MPLBACKEND']         = 'Agg'
os.environ['MPLCONFIGDIR']       = r'{MPL_CONFIG_DIR}'
os.environ['GRADIO_SERVER_NAME'] = '0.0.0.0'
os.environ['GRADIO_SHARE']       = '1'

flags = [
    '--listen', '--share', '--verbose',
    '--api', '--api-port', '5000',
    '--loader', 'llama.cpp',
    '--gpu-layers', '{GPU_LAYERS}',
    '--ctx-size', '{N_CTX}',
    '--batch-size', '512',
    '--threads', '4',
    '--model', '{MODEL_FILE}',
]
for f in flags:
    if f not in sys.argv:
        sys.argv.append(f)

print("[DEBUG LAUNCHER] {mode_label} | Qwen2.5-Coder-14B | verbose=ON")
print("[DEBUG LAUNCHER] flags:", ' '.join(sys.argv[1:]))

try:
    import matplotlib; matplotlib.use('Agg', force=True)
except Exception:
    pass

import runpy
runpy.run_path('server.py', run_name='__main__')
"""

# A failed write is only warned about; the launch below is still attempted.
try:
    launch_wrapper.write_text(launch_code, encoding="utf-8")
    print(f"[‚úì] Launch wrapper: {launch_wrapper}")
except Exception as e:
    print(f"[warn] launch wrapper: {e}")

# Kill server/gradio processes left over from an earlier run, then give
# them a moment to go away before starting a new server.
for stale_pattern in ("python.*server.py", "python.*gradio"):
    sh(f"pkill -9 -f '{stale_pattern}'")
time.sleep(2)

server_log = LOG_DIR / f"server_{int(time.time())}.log"
# Prefer the installer's interpreter; fall back to the system python3.
if env_marker.exists():
    python_exe = str(env_marker)
else:
    python_exe = "python3"
launch_cmd = f'{python_exe} -u "{launch_wrapper}"'

server_env = {
    **os.environ,
    "MPLBACKEND": "Agg",
    "MPLCONFIGDIR": str(MPL_CONFIG_DIR),
    "GRADIO_SERVER_NAME": "0.0.0.0",
    "GRADIO_SHARE": "1",
}

for banner_line in (
    "\n" + "=" * 70,
    f"  LAUNCHING ‚Äî DEBUG MODE ‚Äî {mode_label}",
    "=" * 70,
    f"  Model   : {MODEL_FILE}",
    "  Char    : Debug  (verbose, no filtering)",
    "  Flags   : --verbose --api --share --listen",
    f"  GPU     : {'All layers on GPU (-1)' if USE_GPU else 'CPU only (0)'}",
    f"  n_ctx   : {N_CTX}",
    f"  Log     : {server_log}",
    "=" * 70,
    "‚è≥ Starting (1-2 min on first model load)...\n",
):
    print(banner_line)

# Blocks until the server process exits; output is streamed and logged.
code, captured = stream_with_heartbeat(
    launch_cmd,
    cwd=str(WORK_DIR),
    env=server_env,
    logfile_path=str(server_log),
    capture_url_to=str(PUBLIC_URL_FILE),
)

print("\n" + "=" * 70)

# If no URL was captured from the live output, search the server log for one.
if not captured and server_log.exists():
    print("[info] Scanning log for URL...")
    try:
        log_text = server_log.read_text(encoding="utf-8", errors="ignore")
        fallback_patterns = (
            re.compile(r'(https?://[a-zA-Z0-9\-]+\.gradio\.live[^\s,)\'\"]*)', re.IGNORECASE),
            re.compile(r'Running on public URL:\s*(https?://\S+)',              re.IGNORECASE),
            re.compile(r'(https?://\S+\.gradio\.app[^\s,)\'\"]*)',              re.IGNORECASE),
        )
        # Patterns are tried in order; the first one that matches wins.
        first_hit = next(
            (hit for hit in (rx.search(log_text) for rx in fallback_patterns) if hit),
            None,
        )
        if first_hit:
            captured = first_hit.group(1).rstrip(').,\'"')
            print("[‚úì] URL recovered from log")
    except Exception:
        pass

if captured:
    print("  WEB UI READY!")
    print(f"  PUBLIC URL : {captured}")
    print("=" * 70)
    try:
        PUBLIC_URL_FILE.write_text(captured, encoding="utf-8")
    except Exception:
        pass
    for hint in (
        "\n  NEXT STEPS:",
        "  1. Click the URL above",
        "  2. Model tab ‚Äî Qwen2.5-Coder-14B should be pre-selected ‚Üí click Load",
        "  3. Chat tab ‚Äî Debug character is active",
        "  4. Every reply shows [THINKING] [OUTPUT] [CODE] sections",
        "\n  DEBUG TIPS:",
        "  ‚Ä¢ --verbose means full prompts print here in this terminal",
        "  ‚Ä¢ API endpoint: http://0.0.0.0:5000/v1",
    ):
        print(hint)
else:
    print("  NO PUBLIC URL CAPTURED")
    print("=" * 70)
    if server_log.exists():
        print("\n  Last 60 lines of server log:\n")
        try:
            log_lines = server_log.read_text(encoding="utf-8", errors="ignore").splitlines()
            for log_line in log_lines[-60:]:
                print(f"    {log_line}")
        except Exception as e:
            print(f"    [could not read log: {e}]")
    for hint in (
        "\n  COMMON FIXES:",
        "  ‚Ä¢ ModuleNotFoundError ‚Üí delete installer_files/ and re-run",
        "  ‚Ä¢ Address in use      ‚Üí pkill -9 -f server.py",
        "  ‚Ä¢ No Gradio URL       ‚Üí check Colab internet / runtime type",
    ):
        print(hint)
    # Show a URL remembered from an earlier run, if there is one.
    if PUBLIC_URL_FILE.exists():
        try:
            saved = PUBLIC_URL_FILE.read_text().strip()
            if saved:
                print(f"\n  Previously saved URL: {saved}")
        except Exception:
            pass

exit_summary = "Server terminated normally" if code == 0 else "Server exited code " + str(code)
print(f"\n  {exit_summary}")
print(f"  Data: {DRIVE_ROOT}")
print("=" * 70)

✅ RECOMMENDED MODELS (COPY EXACTLY)
🔹 BEST GENERAL CHAT (START HERE)

Llama-2-7B-Chat

Repo: TheBloke/Llama-2-7B-Chat-GGUF
File: llama-2-7b-chat.Q4_K_M.gguf

🔹 FAST + LIGHT (LOW RAM)

TinyLlama-1.1B-Chat

Repo: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
File: tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf

🔹 STRONG CHAT (BETTER THAN LLAMA-2)

Mistral-7B-Instruct

Repo: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
File: mistral-7b-instruct-v0.2.Q4_K_M.gguf

🔹 CODING MODEL

Code LLaMA-7B

Repo: TheBloke/CodeLlama-7B-GGUF
File: codellama-7b.Q4_K_M.gguf

🔹 ROLEPLAY / STORY

MythoMax-L2-13B (needs more RAM)

Repo: TheBloke/MythoMax-L2-13B-GGUF
File: mythomax-l2-13b.Q4_K_M.gguf

🔹 VERY FAST / TEST MODEL

Phi-2 (2.7B)

Repo: TheBloke/phi-2-GGUF
File: phi-2.Q4_K_M.gguf

⚙️ WHAT LOADER TO USE (IMPORTANT)

For ALL models above:

Loader: llama.cpp


Repo: TheBloke/Llama-2-7B-Chat-GGUF
File: llama-2-7b-chat.Q4_K_M.gguf
