<a href="https://colab.research.google.com/github/gitleon8301/MY-AI-Gizmo-working/blob/main/Colab-TextGen-GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# oobabooga/text-generation-webui

After running both cells, a public Gradio URL will appear at the bottom of the output in about 10 minutes. You can optionally generate an API link as well.

* Project page: https://github.com/oobabooga/text-generation-webui
* Gradio server status: https://status.gradio.app/

In [None]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls>

In [None]:
#!/usr/bin/env python3
# ================================================================
# MY-AI-Gizmo • UNIVERSAL LAUNCHER (GPU + CPU)
# - Asks you at startup: GPU or CPU?
# - Installs correct llama-cpp-python build for your choice
# - Ensures public URL generation
# ================================================================

import os
import subprocess
import shutil
import re
import time
import threading
from pathlib import Path

try:
    from google.colab import drive as colab_drive
    IN_COLAB = True
except Exception:
    colab_drive = None
    IN_COLAB = False

# ---------- Configuration ----------
# Zip archive of the repository; fetched by download_repo_if_missing().
REPO_ZIP = "https://github.com/gitleon8301/MY-AI-Gizmo-working/archive/refs/heads/main.zip"
# Local checkout: the extracted archive is renamed to this path.
WORK_DIR = Path("/content/text-generation-webui")
# Root folder for everything this launcher keeps on Google Drive
# (models, settings, logs, saved URL).
DRIVE_ROOT = Path("/content/drive/MyDrive/MY-AI-Gizmo")
# Installer and server logs are written here.
LOG_DIR = DRIVE_ROOT / "logs"
# Exported to child processes as MPLCONFIGDIR.
MPL_CONFIG_DIR = DRIVE_ROOT / "matplotlib"
# How often the heartbeat thread checks for silence, and how long the
# silence must last before a "[heartbeat]" note is printed.
HEARTBEAT_INTERVAL = 30  # seconds
# The most recently captured web UI URL is saved to this file.
PUBLIC_URL_FILE = DRIVE_ROOT / "public_url.txt"
# The three values below are placeholders; choose_mode() overwrites them.
GPU_LAYERS = -1   # set dynamically after user picks mode
N_CTX = 2048      # set dynamically after user picks mode
USE_GPU = True    # set dynamically after user picks mode
# -----------------------------------

def sh(cmd, cwd=None, env=None, check=False):
    """Run ``cmd`` through the shell and return the completed process.

    stdout and stderr are captured as text on the returned
    ``subprocess.CompletedProcess``. With ``check=True`` a non-zero exit
    status raises ``subprocess.CalledProcessError``.
    """
    run_options = {
        "shell": True,
        "cwd": cwd,
        "env": env,
        "capture_output": True,
        "text": True,
        "check": check,
    }
    return subprocess.run(cmd, **run_options)

def stream_with_heartbeat(cmd, cwd=None, env=None, logfile_path=None, capture_url_to=None):
    """Run ``cmd`` in a shell, echo its output live, and watch it for URLs.

    Every line of the child's combined stdout/stderr is printed and, when
    ``logfile_path`` is given, appended to that file. A daemon thread wakes
    every HEARTBEAT_INTERVAL seconds and prints a "[heartbeat]" note if no
    output has arrived for at least that long.

    Each line is tested against a list of Gradio / local URL patterns. A
    match containing "gradio.live" always becomes the captured URL; any
    other match is kept only while nothing has been captured yet. Each
    captured URL is also written to ``capture_url_to`` when that is given.

    Returns:
        ``(returncode, captured_url)``; ``captured_url`` is None when no
        pattern ever matched.
    """
    child = subprocess.Popen(
        cmd,
        shell=True,
        cwd=cwd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )

    last_output = time.time()
    done = threading.Event()
    captured_url = None

    # Tried in this order for every output line.
    url_patterns = [
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r'Running on public URL:\s*(https?://[^\s]+\.gradio\.live[^\s,)\'\"]*)',
            r'Public URL:\s*(https?://[^\s]+\.gradio\.live[^\s,)\'\"]*)',
            r'(https?://[a-zA-Z0-9\-]+\.gradio\.live[^\s,)\'\"]*)',
            r'(https?://[^\s]+\.gradio\.app[^\s,)\'\"]*)',
            r'Running on local URL:\s*(https?://[^\s]+:[0-9]+)',
            r'(https?://(?:localhost|127\.0\.0\.1|0\.0\.0\.0):[0-9]+)',
        )
    ]

    def remember(url):
        # Persist the URL for later runs; a failed write is not fatal.
        if not capture_url_to:
            return
        try:
            Path(capture_url_to).write_text(url, encoding="utf-8")
        except Exception:
            pass

    def heartbeat():
        while True:
            # Event.wait() returns True as soon as the main thread is done.
            if done.wait(HEARTBEAT_INTERVAL):
                return
            if time.time() - last_output < HEARTBEAT_INTERVAL:
                continue
            note = f"[heartbeat] still working... (no output for ~{HEARTBEAT_INTERVAL}s)\n"
            print(note, end='')
            if not logfile_path:
                continue
            try:
                with open(logfile_path, "a", encoding="utf-8") as sink:
                    sink.write(note)
            except Exception:
                pass

    ticker = threading.Thread(target=heartbeat, daemon=True)
    ticker.start()

    # Logging is optional: if the file cannot be opened we simply skip it.
    logfile = None
    if logfile_path:
        try:
            logfile = open(logfile_path, "a", encoding="utf-8")
        except Exception:
            logfile = None

    try:
        for line in child.stdout:
            last_output = time.time()
            print(line, end='')
            if logfile:
                try:
                    logfile.write(line)
                except Exception:
                    pass

            for pattern in url_patterns:
                hit = pattern.search(line)
                if hit is None:
                    continue
                candidate = hit.group(1).rstrip(').,\'"')
                if 'gradio.live' in candidate.lower():
                    # A public share link wins over anything seen before.
                    captured_url = candidate
                    banner = '=' * 70
                    print(f"\n{banner}")
                    print(f"üåê PUBLIC URL FOUND: {captured_url}")
                    print(f"{banner}\n")
                    remember(captured_url)
                    break
                if captured_url:
                    # Non-public URLs never replace an earlier capture.
                    continue
                captured_url = candidate
                print(f"\nüîó URL DETECTED: {captured_url}\n")
                remember(captured_url)

    except Exception as e:
        print(f"[stream error] {e}")
    finally:
        child.wait()
        done.set()
        ticker.join(timeout=1)
        if logfile:
            try:
                logfile.close()
            except Exception:
                pass

    return child.returncode, captured_url

def ensure_dirs():
    """Create the Drive root, log and matplotlib-config folders if missing.

    Creation is best effort: any error for one folder is ignored and the
    remaining folders are still attempted.
    """
    for folder in (DRIVE_ROOT, LOG_DIR, MPL_CONFIG_DIR):
        try:
            os.makedirs(folder, exist_ok=True)
        except Exception:
            continue

def download_repo_if_missing():
    """Download and unpack the repository into WORK_DIR unless it exists.

    The archive at REPO_ZIP is fetched with ``wget`` (falling back to
    ``curl``) to /content/repo.zip, unpacked into /content, and the
    extracted ``MY-AI-Gizmo-working-*`` folder is renamed to WORK_DIR.

    Returns:
        True when WORK_DIR already existed or was created, False when the
        download or the extraction failed (a message is printed).
    """
    if WORK_DIR.exists():
        print(f"[info] WORK_DIR exists: {WORK_DIR}")
        return True

    tmp_zip = Path("/content/repo.zip")
    try:
        # Drop any stale archive so the size check below sees fresh data.
        tmp_zip.unlink()
    except OSError:
        pass

    print("[info] downloading repository...")
    downloaders = (
        f"wget -q -O {tmp_zip} {REPO_ZIP}",
        # -f makes curl fail on HTTP errors instead of saving the error page.
        f"curl -f -s -L -o {tmp_zip} {REPO_ZIP}",
    )
    downloaded = False
    for cmd in downloaders:
        try:
            result = sh(cmd)
            if result.returncode == 0 and tmp_zip.exists() and tmp_zip.stat().st_size > 1000:
                downloaded = True
                break
        except Exception:
            # Try the next downloader.
            pass
    if not downloaded:
        print("[error] download failed. Check network/URL.")
        return False

    print("[info] extracting...")
    try:
        # -o: overwrite leftovers from an earlier partial run instead of
        # stopping at an interactive "replace?" prompt.
        unzip = sh(f"unzip -q -o {tmp_zip} -d /content")
        # unzip exits with 1 for warnings where extraction still completed;
        # anything higher (or a missing unzip binary) is a real failure.
        if unzip.returncode > 1:
            detail = (unzip.stderr or unzip.stdout or "").strip()
            print("[error] extract failed:", detail or f"unzip exited with code {unzip.returncode}")
            return False
        found = next(Path("/content").glob("MY-AI-Gizmo-working-*"), None)
        if not found:
            print("[error] expected extracted folder not found")
            return False
        found.rename(WORK_DIR)
        print("[info] repo extracted to", WORK_DIR)
        return True
    except Exception as e:
        print("[error] extract failed:", e)
        return False

def ensure_symlinks_and_files():
    """Point the work dir's data folders and settings files at Drive.

    For each (local path, Drive path) pair the Drive side is created first
    (a folder, or an empty file for settings entries). Whatever currently
    sits at the local path is removed, then a symlink to the Drive path is
    created. If the symlink cannot be made, the Drive content is copied to
    the local path instead. Removal, linking and copying are best effort.
    """
    # (path inside WORK_DIR, path inside DRIVE_ROOT, is a single settings file)
    mappings = (
        ("models", "models", False),
        ("loras", "loras", False),
        ("user_data/characters", "characters", False),
        ("user_data/presets", "presets", False),
        ("user_data/settings.yaml", "settings/settings.yaml", True),
        ("user_data/settings.json", "settings/settings.json", True),
        ("user_data/chat", "chat-history", False),
        ("outputs", "outputs", False),
    )

    for local_rel, drive_rel, is_file in mappings:
        source = DRIVE_ROOT / drive_rel
        target = WORK_DIR / local_rel

        # 1) Make sure the Drive side exists.
        if not is_file:
            source.mkdir(parents=True, exist_ok=True)
        else:
            source.parent.mkdir(parents=True, exist_ok=True)
            if not source.exists():
                try:
                    source.write_text("", encoding="utf-8")
                except Exception:
                    pass

        # 2) Clear whatever occupies the local path.
        try:
            if target.is_symlink():
                target.unlink()
            elif target.is_dir():
                shutil.rmtree(target)
            elif target.exists():
                target.unlink()
        except Exception:
            pass

        # 3) Link; fall back to copying when linking fails.
        try:
            target.parent.mkdir(parents=True, exist_ok=True)
            os.symlink(str(source), str(target), target_is_directory=source.is_dir())
            continue
        except Exception:
            pass

        try:
            if source.is_dir():
                shutil.copytree(source, target, dirs_exist_ok=True)
            else:
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(source, target)
        except Exception:
            pass

def prepare_settings_file():
    """Write the auto-generated settings.yaml for the selected mode.

    The same content is written to ``WORK_DIR/user_data/settings.yaml`` and
    to ``DRIVE_ROOT/settings/settings.yaml``. It embeds the current N_CTX
    and GPU_LAYERS values and is labelled with the mode taken from USE_GPU
    (previously it always claimed "GPU mode", even for CPU runs).

    A failed write is reported as a warning and does not raise.
    """
    mode_label = "GPU" if USE_GPU else "CPU"

    drive_settings = DRIVE_ROOT / "settings" / "settings.yaml"
    local_settings = WORK_DIR / "user_data" / "settings.yaml"
    local_settings.parent.mkdir(parents=True, exist_ok=True)

    settings_content = f"""# MY-AI-Gizmo Settings - {mode_label} Mode (Auto-configured)
# Network settings
listen: true
share: true
auto_launch: false

# Loader
loader: llama.cpp

# Performance settings
n_ctx: {N_CTX}
n_batch: 512
n_gpu_layers: {GPU_LAYERS}
threads: 4

# UI settings
chat_style: cai-chat
character: Assistant

# API settings
api: true
api_port: 5000
"""

    try:
        local_settings.write_text(settings_content, encoding="utf-8")
        drive_settings.parent.mkdir(parents=True, exist_ok=True)
        drive_settings.write_text(settings_content, encoding="utf-8")
        print(f"[‚úì] Settings configured for {mode_label} mode with public sharing")
    except Exception as e:
        print(f"[warn] Could not update settings: {e}")

def cleanup_broken_files(drive_root: Path):
    """Delete suspiciously small model files under ``drive_root/models``.

    Files matching the known model extensions (searched recursively) whose
    size is below 100 KiB are treated as broken or incomplete downloads and
    removed. Files that cannot be inspected or deleted are left alone.
    Does nothing when the models folder does not exist.
    """
    models_dir = drive_root / "models"
    if not models_dir.exists():
        return

    size_floor = 100 * 1024

    def looks_incomplete(path):
        try:
            return path.stat().st_size < size_floor
        except Exception:
            return False

    broken = [
        candidate
        for pattern in ("*.gguf", "*.safetensors", "*.bin", "*.pth", "*.pt")
        for candidate in models_dir.rglob(pattern)
        if looks_incomplete(candidate)
    ]
    if not broken:
        return

    print(f"[info] Removing {len(broken)} broken/incomplete files from Drive models folder")
    for victim in broken:
        try:
            victim.unlink()
        except Exception:
            pass

def install_llama_cpp_python_cpu():
    """Reinstall llama-cpp-python in the installer's env as a CPU build.

    Returns immediately when the env's python does not exist yet. Otherwise
    any existing llama-cpp-python / llama-cpp-python-cuda package is
    uninstalled and llama-cpp-python is reinstalled with CMake flags that
    switch the GPU backends off. The outcome is printed; nothing is raised.
    """
    print("\nüîß Installing llama-cpp-python for CPU...")

    venv_python = WORK_DIR / "installer_files" / "env" / "bin" / "python"
    if not venv_python.exists():
        print("[info] Virtual environment not yet created, will be handled by installer")
        return
    python_exe = str(venv_python)

    print("[info] Removing any existing llama-cpp-python installations...")
    sh(f'"{python_exe}" -m pip uninstall -y llama-cpp-python llama-cpp-python-cuda', check=False)

    # Build environment: force a CMake build with the GPU backends disabled.
    build_env = dict(os.environ)
    build_env['CMAKE_ARGS'] = (
        '-DLLAMA_CUDA=OFF -DLLAMA_CUBLAS=OFF -DLLAMA_METAL=OFF '
        '-DLLAMA_OPENCL=OFF -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
    )
    build_env['FORCE_CMAKE'] = '1'
    build_env['CUDACXX'] = ''

    install_cmd = f'"{python_exe}" -m pip install llama-cpp-python --no-cache-dir --force-reinstall --upgrade'
    outcome = sh(install_cmd, env=build_env, check=False)

    if outcome.returncode != 0:
        print(f"[warn] CPU install returned code {outcome.returncode}")
        return
    print("[‚úì] llama-cpp-python (CPU) installed successfully")


def install_llama_cpp_python_gpu():
    """
    Install a CUDA-enabled llama.cpp backend into the installer's virtual env.

    Does nothing when the env's python is missing (the main installer has not
    run yet) or when ``pip show llama-cpp-binaries`` reports a version that
    carries a CUDA tag (e.g. ``+cu124``). Otherwise tries, in order:

      1. ``llama-cpp-binaries`` with the CUDA wheel index as an extra index,
      2. pre-built ``llama-cpp-python`` wheels from that same CUDA index,
      3. compiling ``llama-cpp-python`` from source with CUDA enabled.

    Every outcome is printed; a failure of all three steps only produces a
    warning and never raises.
    """
    print("\nüîß Checking llama-cpp GPU support...")

    env_marker = WORK_DIR / "installer_files" / "env" / "bin" / "python"
    if not env_marker.exists():
        print("[info] Virtual environment not yet created, will be handled by installer")
        return

    python_exe = str(env_marker)

    # -- Skip when a CUDA build of llama-cpp-binaries is already present --
    # Only the Version field is inspected: searching the whole `pip show`
    # output for "cu" could match unrelated words in other fields.
    check = sh(f'"{python_exe}" -m pip show llama-cpp-binaries', check=False)
    if check.returncode == 0:
        version_line = next(
            (l for l in check.stdout.splitlines() if l.lower().startswith("version:")),
            "",
        )
        if re.search(r'cu\d+', version_line, re.IGNORECASE):
            print("[‚úì] llama-cpp-binaries (CUDA) already installed ‚Äî skipping reinstall")
            print(f"    {[l for l in check.stdout.splitlines() if 'Version' in l or 'Name' in l]}")
            return

    # -- Detect the CUDA toolkit version (defaults to 12.1 without nvcc) --
    cuda_major, cuda_minor = "12", "1"
    cuda_ver_result = sh("nvcc --version")
    if cuda_ver_result.returncode == 0:
        m = re.search(r'release (\d+)\.(\d+)', cuda_ver_result.stdout)
        if m:
            cuda_major, cuda_minor = m.group(1), m.group(2)
    cuda_tag = f"cu{cuda_major}{cuda_minor}"
    print(f"[info] CUDA tag: {cuda_tag}")
    cuda_index = f"https://abetlen.github.io/llama-cpp-python/whl/{cuda_tag}"

    # -- 1) llama-cpp-binaries with the CUDA index (preferred) --
    print(f"[info] Trying llama-cpp-binaries (CUDA)...")
    result = sh(
        f'"{python_exe}" -m pip install llama-cpp-binaries --extra-index-url '
        f'{cuda_index} --no-cache-dir',
        check=False
    )
    if result.returncode == 0:
        print("[‚úì] llama-cpp-binaries (CUDA) installed successfully")
        return

    # -- 2) Pre-built llama-cpp-python wheel for this CUDA version --
    # The wheel is resolved through the CUDA wheel index so the detected
    # cuda_tag is actually honoured (the former direct release URL carried
    # no CUDA tag). --only-binary keeps this step from silently turning
    # into a source build; that is what step 3 is for.
    print("[info] Trying pre-built llama-cpp-python wheel...")
    for version in ["0.3.2", "0.2.90", "0.2.79"]:
        result = sh(
            f'"{python_exe}" -m pip install "llama-cpp-python=={version}" '
            f'--only-binary llama-cpp-python --extra-index-url {cuda_index} --no-cache-dir',
            check=False
        )
        if result.returncode == 0:
            print(f"[‚úì] llama-cpp-python {version} wheel installed")
            return
        print(f"[info] Wheel v{version} failed, trying next...")

    # -- 3) Compile from source with CUDA as a last resort --
    # GGML_CUDA is the CMake switch documented by llama-cpp-python for CUDA
    # builds; the LLAMA_CUBLAS / LLAMA_CUDA spellings are legacy names.
    print("[info] Compiling llama-cpp-python from source with CUDA (slow, ~5 min)...")
    gpu_env = os.environ.copy()
    gpu_env.update({'CMAKE_ARGS': '-DGGML_CUDA=on', 'FORCE_CMAKE': '1'})
    result = sh(
        f'"{python_exe}" -m pip install llama-cpp-python --no-cache-dir --force-reinstall',
        env=gpu_env, check=False
    )
    if result.returncode == 0:
        print("[‚úì] llama-cpp-python compiled with CUDA successfully")
    else:
        # Show the end of pip's error output so the failure can be diagnosed.
        for tail_line in (result.stderr or "").strip().splitlines()[-5:]:
            print(f"       {tail_line}")
        print("[warn] All GPU install attempts failed ‚Äî llama.cpp will run on CPU")
        print("       ExLlama2/Transformers loaders will still use GPU normally")

def create_llama_cpp_binaries_wrapper():
    """Write ``modules/llama_cpp_binaries.py`` inside WORK_DIR.

    The generated module exposes ``get_binary_path()``, which looks for a
    llama server executable (next to the ``llama_cpp`` package, on PATH, in
    ``repositories/llama.cpp`` and in the installer env) and returns the
    string "PYTHON_SERVER" when none is found, plus ``ensure_binary()``.

    Returns:
        True when the file was written, False when writing failed.
    """
    print("\nüîß Creating llama_cpp_binaries compatibility wrapper...")

    # Source text of the generated module; written out verbatim.
    wrapper_code = '''"""
Compatibility wrapper for llama_cpp_binaries.
Provides the get_binary_path() function needed by llama_cpp_server.py
"""
import os
import shutil
from pathlib import Path

def get_binary_path():
    search_paths = []
    try:
        import llama_cpp
        llama_cpp_path = Path(llama_cpp.__file__).parent
        search_paths.append(llama_cpp_path / "bin")
    except ImportError:
        pass

    system_binary = shutil.which("llama-server") or shutil.which("llama-cpp-server")
    if system_binary:
        return system_binary

    repo_dir = Path(__file__).parent.parent / "repositories" / "llama.cpp"
    if repo_dir.exists():
        search_paths += [repo_dir / "build" / "bin", repo_dir / "build", repo_dir]

    installer_dir = Path(__file__).parent.parent / "installer_files"
    if installer_dir.exists():
        search_paths.append(installer_dir / "env" / "bin")

    for search_path in search_paths:
        if not search_path.exists():
            continue
        for binary_name in ["llama-server", "llama-cpp-server", "server"]:
            for ext in ["", ".exe"]:
                binary_path = search_path / f"{binary_name}{ext}"
                if binary_path.exists() and (os.access(binary_path, os.X_OK) or ext == ".exe"):
                    return str(binary_path)

    return "PYTHON_SERVER"

def ensure_binary():
    try:
        return get_binary_path() is not None
    except Exception:
        return False
'''

    target_dir = WORK_DIR / "modules"
    target_file = target_dir / "llama_cpp_binaries.py"
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        target_file.write_text(wrapper_code, encoding="utf-8")
    except Exception as e:
        print(f"[error] Could not create wrapper: {e}")
        return False
    else:
        print(f"[‚úì] Created {target_file}")
        return True

def patch_gradio_launch():
    """Add ``share=True`` to ``.launch(...)`` calls in WORK_DIR/server.py.

    The file is only touched when it contains a ``.launch(`` call and no
    ``share=`` text at all. Calls whose argument list contains nested
    parentheses are left unchanged, because a regex cannot rewrite them
    safely. An empty argument list becomes ``.launch(share=True)`` (the
    previous substitution produced the invalid ``.launch(, share=True)``).

    Problems reading or writing the file are printed as warnings.
    """
    server_py = WORK_DIR / "server.py"
    if not server_py.exists():
        print("[warn] server.py not found, cannot patch")
        return

    def add_share(match):
        # Build the replacement call for one matched ".launch(<args>)".
        args = match.group(1)
        if not args.strip():
            return ".launch(share=True)"
        if args.rstrip().endswith(","):
            # Trailing comma already present: do not add a second one.
            return f".launch({args.rstrip()} share=True)"
        return f".launch({args}, share=True)"

    try:
        content = server_py.read_text(encoding="utf-8")
        if '.launch(' not in content or 'share=' in content:
            return
        # [^()]* stops at the first parenthesis, so only flat argument
        # lists (possibly spanning several lines) are rewritten.
        patched = re.sub(r'\.launch\(([^()]*)\)', add_share, content)
        if patched != content:
            server_py.write_text(patched, encoding="utf-8")
            print("[‚úì] Patched server.py for public URL sharing")
    except Exception as e:
        print(f"[warn] Could not patch server.py: {e}")

def create_model_loader_config():
    """Write ``model-config.yaml`` into WORK_DIR for the selected mode.

    The file carries loader defaults built from the current GPU_LAYERS and
    N_CTX values. Its header and the status message name the mode taken
    from USE_GPU (previously both always said "GPU", even for CPU runs).

    A failed write is reported as a warning and does not raise.
    """
    mode_label = "GPU" if USE_GPU else "CPU"
    config_file = WORK_DIR / "model-config.yaml"
    config_content = f"""# {mode_label}-Optimized Model Loader Configuration

default:
  loader: llama.cpp
  n_gpu_layers: {GPU_LAYERS}
  n_ctx: {N_CTX}
  n_batch: 512
  threads: 4
  use_mmap: true
  use_mlock: false

*.gguf:
  loader: llama.cpp
  n_gpu_layers: {GPU_LAYERS}
  n_ctx: {N_CTX}

*.safetensors:
  loader: Transformers
  load_in_4bit: true
  use_flash_attention_2: true
"""
    try:
        config_file.write_text(config_content, encoding="utf-8")
        print(f"[‚úì] Created {mode_label} model loader config")
    except Exception as e:
        print(f"[warn] Could not create model config: {e}")

def choose_mode():
    """Prompt for GPU or CPU mode and store the choice in module globals.

    Keeps asking until the user enters "1" (GPU) or "2" (CPU), then sets
    USE_GPU, GPU_LAYERS and N_CTX accordingly and prints a short summary
    of the selected settings.
    """
    global USE_GPU, GPU_LAYERS, N_CTX

    rule = "=" * 70
    print("\n" + rule)
    print("  üñ•Ô∏è  MY-AI-Gizmo ‚Äî Choose Your Mode")
    print(rule)
    print("  [1]  GPU  ‚Äî Faster, requires CUDA-enabled GPU (Colab T4/A100)")
    print("  [2]  CPU  ‚Äî Slower, works on any machine")
    print(rule)

    # answer -> (USE_GPU, GPU_LAYERS, N_CTX, summary lines)
    presets = {
        "1": (
            True,
            -1,      # all layers on GPU
            4096,    # GPU can handle larger context
            (
                "\n  ‚úÖ GPU mode selected",
                "     ‚Ä¢ n_gpu_layers : -1  (all layers on GPU)",
                "     ‚Ä¢ n_ctx        : 4096",
                "     ‚Ä¢ llama-cpp-python will be built with CUDA",
            ),
        ),
        "2": (
            False,
            0,       # no layers on GPU
            2048,    # keep context smaller for CPU
            (
                "\n  ‚úÖ CPU mode selected",
                "     ‚Ä¢ n_gpu_layers : 0",
                "     ‚Ä¢ n_ctx        : 2048",
                "     ‚Ä¢ llama-cpp-python will be built for CPU-only",
            ),
        ),
    }

    selected = None
    while selected is None:
        answer = input("\n  Enter 1 for GPU or 2 for CPU: ").strip()
        selected = presets.get(answer)
        if selected is None:
            print("  ‚ö†Ô∏è  Please enter 1 or 2.")

    USE_GPU, GPU_LAYERS, N_CTX, summary = selected
    for summary_line in summary:
        print(summary_line)

    print(rule + "\n")

# ---------- Main flow ----------
print("=" * 70)
print("MY-AI-Gizmo Universal Launcher")
print("=" * 70)

# Ask user: GPU or CPU? (sets USE_GPU, GPU_LAYERS and N_CTX)
choose_mode()

# Check GPU only if user picked it
if USE_GPU:
    gpu_check = sh("nvidia-smi --query-gpu=name,memory.total --format=csv,noheader")
    if gpu_check.returncode == 0:
        print(f"[‚úì] GPU detected: {gpu_check.stdout.strip()}")
    else:
        print("[warn] nvidia-smi failed ‚Äî make sure you selected a GPU runtime in Colab!")
        print("       Runtime ‚Üí Change runtime type ‚Üí GPU")

# Mount Drive BEFORE creating anything below /content/drive. The folders
# made by ensure_dirs() live inside the mountpoint; creating them first
# leaves a populated /content/drive/MyDrive behind, so the mount is not
# performed and all "Drive" data ends up on the runtime's local disk.
if IN_COLAB:
    try:
        print("[info] Mounting Google Drive...")
        colab_drive.mount("/content/drive", force_remount=False)
        print("[‚úì] Google Drive mounted")
    except Exception as e:
        print(f"[warn] Could not mount Drive: {e}")

ensure_dirs()

cleanup_broken_files(DRIVE_ROOT)

# Abort only when there is no usable checkout at all.
if not download_repo_if_missing() and not WORK_DIR.exists():
    raise SystemExit("Repository unavailable. Fix network/REPO_ZIP and retry.")

os.chdir(WORK_DIR)

ensure_symlinks_and_files()
prepare_settings_file()
create_model_loader_config()

# ---------- Install step ----------
MPL_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
start_sh = WORK_DIR / "start_linux.sh"
installer_log = LOG_DIR / f"installer_{int(time.time())}.log"
# The env's python doubles as the "installer already ran" marker.
env_marker = WORK_DIR / "installer_files" / "env" / "bin" / "python"

# Environment handed to start_linux.sh: settings shared by both modes first,
# then the mode-specific ones.
# NOTE(review): these variables are presumably read by the installer script;
# confirm against start_linux.sh / its one-click installer.
install_env = os.environ.copy()
install_env.update({
    "MPLBACKEND": "Agg",
    "MPLCONFIGDIR": str(MPL_CONFIG_DIR),
    "LAUNCH_AFTER_INSTALL": "FALSE",
    "INSTALL_EXTENSIONS": "FALSE",
    "FORCE_CMAKE": "1",
    "SKIP_TORCH_TEST": "TRUE",
})
if USE_GPU:
    install_env.update({
        "GPU_CHOICE": "A",
        "CMAKE_ARGS": "-DLLAMA_CUBLAS=ON -DLLAMA_CUDA=ON",
        "FORCE_CUDA": "TRUE",
    })
    print("\nüì¶ Installing dependencies (GPU mode)...")
else:
    install_env.update({
        "GPU_CHOICE": "N",
        "CMAKE_ARGS": "-DLLAMA_CUDA=OFF -DLLAMA_CUBLAS=OFF -DLLAMA_METAL=OFF -DLLAMA_OPENCL=OFF",
        "CUDA_VISIBLE_DEVICES": "",
        "CUDACXX": "",
        "FORCE_CUDA": "FALSE",
    })
    print("\nüì¶ Installing dependencies (CPU mode)...")
print(f"Installer log -> {installer_log}")

# Without the installer script there is nothing to install or launch.
if not start_sh.exists():
    print("[error] start_linux.sh not found!")
    raise SystemExit("Cannot proceed without installation script")

sh("chmod +x start_linux.sh")

if env_marker.exists():
    print("[info] Virtual environment exists, skipping full install")
else:
    print("[info] Running installer...")
    code, url = stream_with_heartbeat(
        "bash start_linux.sh",
        cwd=str(WORK_DIR),
        env=install_env,
        logfile_path=str(installer_log),
        capture_url_to=str(PUBLIC_URL_FILE)
    )
    if code == 0:
        print(f"[‚úì] Installer completed")
    else:
        # A failed installer is reported but the script carries on.
        print(f"[warn] Installer exited with code {code}. See {installer_log}")

# Post-install fix-ups; these run whether or not the installer just ran.
if USE_GPU:
    install_llama_cpp_python_gpu()
else:
    install_llama_cpp_python_cpu()
create_llama_cpp_binaries_wrapper()
patch_gradio_launch()

# ---------- Launch step ----------
# A small generated script forces the sharing flags, then runs server.py.
launch_wrapper = WORK_DIR / "_launch_with_share.py"
if USE_GPU:
    mode_label = "GPU"
    cuda_disable = ""
    cuda_info = "[INFO] GPU mode enabled"
else:
    mode_label = "CPU"
    # In CPU mode the generated script hides all CUDA devices.
    cuda_disable = "\nos.environ['CUDA_VISIBLE_DEVICES'] = ''"
    cuda_info = "[INFO] CPU-only mode enabled"

launch_wrapper_code = f"""# {mode_label} launch wrapper with public sharing
import sys
import os
{cuda_disable}
os.environ['MPLBACKEND'] = 'Agg'
os.environ['MPLCONFIGDIR'] = r'{MPL_CONFIG_DIR}'
os.environ['GRADIO_SERVER_NAME'] = '0.0.0.0'
os.environ['GRADIO_SHARE'] = '1'

if '--listen' not in sys.argv:
    sys.argv.append('--listen')
if '--share' not in sys.argv:
    sys.argv.append('--share')
if '--auto-launch' not in sys.argv:
    sys.argv.append('--auto-launch')

print("[INFO] Launch flags:", ' '.join(sys.argv))
print("{cuda_info}")

try:
    import matplotlib
    matplotlib.use('Agg', force=True)
except:
    pass

import runpy
runpy.run_path('server.py', run_name='__main__')
"""

try:
    launch_wrapper.write_text(launch_wrapper_code, encoding="utf-8")
    print("[‚úì] Created launch wrapper")
except Exception as e:
    print(f"[warn] Could not create wrapper: {e}")

# Stop leftover server/gradio processes from an earlier run, then give
# them a moment to exit.
try:
    for stale_pattern in ("python.*server.py", "python.*gradio"):
        sh(f"pkill -9 -f '{stale_pattern}'")
except Exception:
    pass
time.sleep(2)

server_log = LOG_DIR / f"server_{int(time.time())}.log"
# Prefer the installer env's interpreter; fall back to the system python3.
python_exe = str(env_marker) if env_marker.exists() else "python3"
# Use full absolute path to wrapper to avoid cwd issues
launch_cmd = f'{python_exe} -u "{str(launch_wrapper)}"'

server_env = os.environ.copy()
server_env["MPLBACKEND"] = "Agg"
server_env["MPLCONFIGDIR"] = str(MPL_CONFIG_DIR)
server_env["GRADIO_SERVER_NAME"] = "0.0.0.0"
server_env["GRADIO_SHARE"] = "1"

rule = "=" * 70
print("\n" + rule)
print(f"üöÄ LAUNCHING WEB UI ({mode_label} MODE)")
print(rule)
print(f"Server log -> {server_log}")
if not USE_GPU:
    print(f"\n‚öôÔ∏è  CPU SETTINGS:")
    print(f"  ‚Ä¢ n_gpu_layers : 0")
    print(f"  ‚Ä¢ n_ctx        : {N_CTX}")
    print(f"  ‚Ä¢ Loader       : llama.cpp (CPU build)")
    print(f"  ‚Ä¢ Tip: Use small quantized GGUF models (Q4_K_M) for best speed")
else:
    print(f"\n‚öôÔ∏è  GPU SETTINGS:")
    print(f"  ‚Ä¢ n_gpu_layers : {GPU_LAYERS}  (-1 = all layers on GPU)")
    print(f"  ‚Ä¢ n_ctx        : {N_CTX}")
    print(f"  ‚Ä¢ Loader       : llama.cpp (CUDA build)")
    print(f"  ‚Ä¢ Tip: If you get OOM errors, re-run and choose CPU or reduce layers")
print(rule + "\n")
print("‚è≥ Starting server (may take 1-2 minutes)...\n")

# Blocks until the server process exits.
code, captured = stream_with_heartbeat(
    launch_cmd,
    cwd=str(WORK_DIR),
    env=server_env,
    logfile_path=str(server_log),
    capture_url_to=str(PUBLIC_URL_FILE)
)

# ---------- Final report (runs once the server process has exited) ----------
print("\n" + "=" * 70)

# -- Try to recover the URL from the log file if live capture missed it --
if not captured and server_log.exists():
    print("[info] Scanning server log for URL...")
    try:
        log_text = server_log.read_text(encoding="utf-8", errors="ignore")
        for pat in [
            re.compile(r'(https?://[a-zA-Z0-9\-]+\.gradio\.live[^\s,)\'\"]*)', re.IGNORECASE),
            re.compile(r'Running on public URL:\s*(https?://\S+)', re.IGNORECASE),
            re.compile(r'(https?://\S+\.gradio\.app[^\s,)\'\"]*)', re.IGNORECASE),
        ]:
            m = pat.search(log_text)
            if m:
                captured = m.group(1).rstrip(').,\'"')
                print(f"[‚úì] URL recovered from log file")
                break
    except Exception:
        pass

if captured:
    print(f"‚úÖ WEB UI READY!")
    print(f"üåê PUBLIC URL: {captured}")
    print("=" * 70)
    try:
        PUBLIC_URL_FILE.write_text(captured, encoding="utf-8")
        print(f"[‚úì] URL saved to: {PUBLIC_URL_FILE}")
    except Exception:
        pass
    print("\nüìã NEXT STEPS:")
    print("  1. Click the URL above to open the Web UI")
    print("  2. Go to the 'Model' tab")
    # Recommend the value chosen for this run (-1 in GPU mode, 0 in CPU
    # mode) instead of always telling the user to offload every layer.
    print(f"  3. Select 'llama.cpp' loader  ‚Üí  set n_gpu_layers to {GPU_LAYERS}")
    print("  4. Load a GGUF model and start chatting!")
else:
    print("‚ö†Ô∏è  NO PUBLIC URL CAPTURED")
    print("=" * 70)

    # -- Show the last 60 lines of the server log so the error is visible --
    if server_log.exists():
        print(f"\nüìã Last 60 lines of server log ({server_log}):\n")
        try:
            lines = server_log.read_text(encoding="utf-8", errors="ignore").splitlines()
            for line in lines[-60:]:
                print(f"  {line}")
        except Exception as e:
            print(f"  [could not read log: {e}]")
    else:
        print(f"\n[warn] Server log not found: {server_log}")

    print("\nüîß COMMON FIXES:")
    print("  ‚Ä¢ If you see 'ModuleNotFoundError' ‚Üí the venv is broken, delete")
    print(f"    {WORK_DIR / 'installer_files'} and re-run the script")
    print("  ‚Ä¢ If you see 'Address already in use' ‚Üí run: pkill -9 -f server.py")
    print("  ‚Ä¢ If Gradio failed to get a public URL ‚Üí check Colab's internet access")

    if PUBLIC_URL_FILE.exists():
        try:
            # Read with the same encoding the file is written with.
            saved_url = PUBLIC_URL_FILE.read_text(encoding="utf-8").strip()
            if saved_url:
                print(f"\nüîó Previously saved URL (may still be live): {saved_url}")
        except Exception:
            pass

if code != 0:
    print(f"\n[warn] Server exited with code {code}")
else:
    print("\n[info] Server terminated normally")

print("\n‚úÖ Data location:", DRIVE_ROOT)
print("=" * 70)

✅ RECOMMENDED MODELS (COPY EXACTLY)
🔹 BEST GENERAL CHAT (START HERE)

Llama-2-7B-Chat

Repo: TheBloke/Llama-2-7B-Chat-GGUF
File: llama-2-7b-chat.Q4_K_M.gguf

🔹 FAST + LIGHT (LOW RAM)

TinyLlama-1.1B-Chat

Repo: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
File: tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf

🔹 STRONG CHAT (BETTER THAN LLAMA-2)

Mistral-7B-Instruct

Repo: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
File: mistral-7b-instruct-v0.2.Q4_K_M.gguf

🔹 CODING MODEL

Code LLaMA-7B

Repo: TheBloke/CodeLlama-7B-GGUF
File: codellama-7b.Q4_K_M.gguf

🔹 ROLEPLAY / STORY

MythoMax-L2-13B (needs more RAM)

Repo: TheBloke/MythoMax-L2-13B-GGUF
File: mythomax-l2-13b.Q4_K_M.gguf

🔹 VERY FAST / TEST MODEL

Phi-2 (2.7B)

Repo: TheBloke/phi-2-GGUF
File: phi-2.Q4_K_M.gguf

⚙️ WHAT LOADER TO USE (IMPORTANT)

For ALL models above:

Loader: llama.cpp


Repo: TheBloke/Llama-2-7B-Chat-GGUF
File: llama-2-7b-chat.Q4_K_M.gguf
