# LocalForge AI - Privacy-First AI Coding Assistant

---

## **Main Features**
- **DeepSeek Coder 1.3B Model**  
  *Lightweight (3GB) and **2–3× faster** than 7B models.*
- **T4 GPU Optimization**  
  *Includes **8-bit quantization** for maximum efficiency.*
- **RAG System**  
  *Powered by **Sentence Transformers** and **Cosine Similarity Search** for intelligent retrieval.*
- **Multi-format Support**  
  *Handles **PDF, DOCX, TXT, CSV, and code files** seamlessly.*
- **FastAPI Backend**  
  *Full **REST API** for easy integration.*
- **Thread Management**  
  *Persistent sessions with **SQLite** storage.*
- **File Upload & Processing**  
  *Background **embedding generation** for smooth performance.*

---



In [None]:
# ============================================================
# CELL A: Install Dependencies 
# ============================================================

# Announce the start of installation in the cell output.
print("Installing LocalForge AI dependencies...")

# NOTE: lines beginning with `!` are notebook shell escapes; this cell is
# meant to run inside a notebook kernel, not as a plain Python script.

# Core ML & AI libraries
!pip install -q transformers accelerate bitsandbytes sentence-transformers

# Web framework & utilities
!pip install -q fastapi uvicorn "python-multipart" pyngrok

# RAG Document Processing Libraries
!pip install -q PyPDF2 python-docx pandas openpyxl

# System utilities
!pip install -q numpy torch

# Printed unconditionally once the commands above have been issued.
print("All dependencies installed successfully!")

In [None]:
# ============================================================
# CELL B: Configuration & Backend Setup
# ============================================================
import os

# Publish the runtime settings as environment variables so that later cells
# (and any child process started from this kernel) can read them.
os.environ.update({
    "BACKEND_API_KEY": "key123",  # ⚠️ SECURITY: Change this API key to your own secure key!
    "HF_MODEL": "deepseek-ai/deepseek-coder-1.3b-instruct",  # ⚠️ Configuration: Change this model name according to your needs!
    "LOCALFORGE_BASE": "/kaggle/working/LocalForgeAI",
    "PORT": "8000",
})

# Echo the effective configuration in a single write.
print(" Configuration:")
print(
    f"   Model: {os.environ['HF_MODEL']}\n"
    f"   API Key: {os.environ['BACKEND_API_KEY']}\n"
    f"   Storage: {os.environ['LOCALFORGE_BASE']}\n"
    f"   Port: {os.environ['PORT']}\n"
)

# Directory that will receive the generated backend module.
BACKEND_DIR = "/kaggle/working/localforge_backend"
os.makedirs(BACKEND_DIR, exist_ok=True)

# Write main.py backend code
backend_code = r'''
# BackendCode.py
import os
import json
import time
import base64
import threading
import logging
import sqlite3
import mimetypes
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Dict, Any

from fastapi import FastAPI, UploadFile, File, Form, Header, HTTPException
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# ============================================================
# CONFIG & ENV
# ============================================================
API_KEY = os.environ.get("BACKEND_API_KEY", "test123") # ⚠️ SECURITY: Change this API key to match your own secure key!
# The model name is read from the HF_MODEL environment variable so that the
# value configured by the launcher is actually the model that gets loaded.
# An unset or empty variable falls back to the built-in default below.
MODEL_NAME = os.environ.get("HF_MODEL") or "deepseek-ai/deepseek-coder-1.3b-instruct" # ⚠️ Configuration: Change this model name according to your needs!

# Number of stored chunks retrieved per query, and the cap on chunks that are
# placed into a single prompt after merging with attached-file chunks.
TOP_K = 2
MAX_CONTEXT_CHUNKS = 8

# Both paths are relative to the process working directory.
DB_PATH = Path("localforge.db")
UPLOAD_DIR = Path("uploads")
UPLOAD_DIR.mkdir(exist_ok=True)
# ============================================================
# SYSTEM PROMPT
# ============================================================
# Fixed instruction preamble; build_prompt() places it first in every prompt.
# The backslash right after the opening quotes keeps the text from starting
# with a blank line.
SYSTEM_PROMPT = """\
You are LocalForge AI, a helpful coding and content assistant.

CORE PRINCIPLES:
1) Provide comprehensive, helpful answers that address the user's needs thoroughly
2) Be creative and flexible in your approach - suggest multiple solutions or perspectives
3) Use examples, analogies, or additional context when helpful
4) If unsure, explain what you do know and your limitations
5) Be conversational and friendly while remaining informative
6) Share related knowledge or insights that might be valuable
7) When explaining code, include comments and reasoning
8) Explore creative solutions and think outside the box
9) Provide practical, actionable advice
10) Be encouraging and supportive of the user's goals

RESPONSE STYLE:
- Conversational, natural tone
- Clear explanations with examples
- Use markdown formatting for readability
- Include relevant details, tips, or alternatives
"""

# ============================================================
# LOGGER
# ============================================================
# Root logging is configured at INFO; this module logs through the
# "localforge" logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("localforge")

# ============================================================
# DEPENDENCIES CHECK
# ============================================================
# Each optional dependency is imported defensively. The matching *_AVAILABLE
# flag records whether the import succeeded, so features can be switched off
# at runtime instead of failing at import time.

# Sentence embeddings used for retrieval.
try:
    from sentence_transformers import SentenceTransformer
    S2T_AVAILABLE = True
except ImportError:
    S2T_AVAILABLE = False

# Causal language model and tokenizer used for generation (needs torch too).
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False

# PDF text extraction.
try:
    import PyPDF2
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False

# DOCX text extraction.
try:
    import docx
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

# pandas availability is recorded in PANDAS_AVAILABLE.
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

# ============================================================
# DATABASE
# ============================================================
def get_conn():
    """Open a fresh SQLite connection to DB_PATH.

    Rows returned through the connection are ``sqlite3.Row`` objects, so
    columns can be read by name. The caller is responsible for closing it.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection

def init_db():
    """Create the backend's tables, skipping any that already exist.

    The tables are threads, messages, files, chunks and embeddings. All
    statements run on one connection and are committed together.
    """
    ddl_statements = (
        """
    CREATE TABLE IF NOT EXISTS threads (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        created_at REAL
    )""",
        """
    CREATE TABLE IF NOT EXISTS messages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        thread_id INTEGER,
        role TEXT,
        content TEXT,
        created_at REAL
    )""",
        """
    CREATE TABLE IF NOT EXISTS files (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        filename TEXT,
        filepath TEXT,
        created_at REAL
    )""",
        """
    CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_id INTEGER,
        thread_id INTEGER,
        doc_name TEXT,
        chunk_index INTEGER,
        text TEXT,
        vector TEXT,
        created_at REAL
    )""",
        """
    CREATE TABLE IF NOT EXISTS embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        doc_name TEXT,
        content TEXT,
        vector TEXT,
        created_at REAL
    )""",
    )
    db = get_conn()
    cursor = db.cursor()
    for ddl in ddl_statements:
        cursor.execute(ddl)
    db.commit()
    db.close()

init_db()

# ============================================================
# UTILS
# ============================================================
def require_api_key(x_api_key: Optional[str]):
    """Reject the request unless the supplied key matches API_KEY.

    The comparison uses ``hmac.compare_digest`` so the time taken does not
    reveal how many leading characters of a guess were correct.

    Args:
        x_api_key: Value of the ``x-api-key`` request header, or None when the
            header is absent.

    Raises:
        HTTPException: 401 when the key is missing or does not match.
    """
    import hmac  # local import: hmac is not among this module's top-level imports

    # Compare as bytes: compare_digest rejects non-ASCII str arguments.
    supplied = (x_api_key or "").encode("utf-8")
    expected = API_KEY.encode("utf-8")
    if x_api_key is None or not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Invalid API Key")

def cosine_sim(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of two vectors.

    An empty (or None) vector on either side yields 0.0. A small epsilon is
    added to the denominator so all-zero vectors do not divide by zero.
    """
    import numpy as np
    if not (a and b):
        return 0.0
    vec_a, vec_b = np.array(a), np.array(b)
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-8
    return float(np.dot(vec_a, vec_b) / denominator)

def is_clean_text(s: str) -> bool:
    """Heuristically decide whether *s* is readable text worth indexing.

    Returns False when the text is empty or shorter than 20 characters after
    stripping, when it contains any control character other than newline,
    tab or carriage return, when it has fewer than 30 letters, or when
    letters make up less than 15% of its characters.
    """
    if not s or len(s.strip()) < 20:
        return False
    permitted_controls = "\n\t\r"
    if any(ord(ch) < 32 and ch not in permitted_controls for ch in s):
        return False
    letter_count = sum(1 for ch in s if ch.isalpha())
    if letter_count < 30:
        return False
    return letter_count / max(len(s), 1) >= 0.15

def clean_text(s: str) -> str:
    """Strip each line and drop lines that look like raw PDF structure.

    A line is dropped when it contains a PDF object marker (``12 0 obj``,
    ``endobj``, ``xref``, ``startxref``) as a whole word, or one of the PDF
    name tokens ``/XObject``, ``/Subtype``, ``/Image``.

    Markers are matched on word boundaries. A plain substring test for
    "obj" would also discard every line mentioning "object" or
    "objective", which removes ordinary prose and source code.

    Args:
        s: Text to clean.

    Returns:
        The surviving lines, each stripped, joined with newlines.
    """
    import re  # local import: re is not among this module's top-level imports

    marker_re = re.compile(r"\b(?:\d+\s+\d+\s+obj|endobj|(?:start)?xref)\b")
    name_tokens = ("/XObject", "/Subtype", "/Image")
    kept = []
    for raw_line in s.splitlines():
        line = raw_line.strip()
        if marker_re.search(line) or any(tok in line for tok in name_tokens):
            continue
        kept.append(line)
    return "\n".join(kept)

# ============================================================
# EMBEDDINGS
# ============================================================
# Cached embedding model, created on first use; the lock guards that creation.
_embed_model = None
_embed_lock = threading.Lock()

def get_embed_model():
    """Return the shared SentenceTransformer, creating it on first call.

    Returns None when sentence-transformers could not be imported.
    """
    global _embed_model
    if S2T_AVAILABLE:
        with _embed_lock:
            if _embed_model is None:
                _embed_model = SentenceTransformer('all-MiniLM-L6-v2')
        return _embed_model
    return None

def embed_text(texts: List[str]) -> List[List[float]]:
    """Embed each text and return one vector (list of floats) per input.

    When no embedding model is available, every input gets a vector of 384
    zeros (presumably the width of the configured model; confirm before
    changing the model).
    """
    encoder = get_embed_model()
    if not encoder:
        return [[0.0] * 384 for _ in texts]
    return encoder.encode(texts).tolist()

# ============================================================
# MODEL LOAD & GENERATION
# ============================================================
# Generation model and tokenizer; both stay None until loading succeeds.
_model = None
_tokenizer = None
_model_lock = threading.Lock()

def load_hf_model_async():
    """Load the tokenizer and causal LM named by MODEL_NAME into module globals.

    Does nothing beyond logging when transformers is unavailable, and skips
    loading when a model is already present. Load failures are logged and
    leave ``_model`` unset.
    """
    global _model, _tokenizer
    if not HF_AVAILABLE:
        logger.info("Transformers not available; generation disabled")
        return
    with _model_lock:
        if _model is not None:
            return
        try:
            _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
            _model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, device_map="auto")
            logger.info("HF model loaded")
        except Exception as e:
            logger.exception("Failed to load HF model: %s", e)

# Start loading in a daemon thread as soon as the module is imported.
threading.Thread(
    target=load_hf_model_async,
    daemon=True,
).start()

def extract_after_response_marker(text: str) -> str:
    """Return the stripped text following the first '### Response:' marker.

    When the marker is absent, the whole input is returned stripped.
    """
    _, found, tail = text.partition("### Response:")
    return (tail if found else text).strip()

def model_generate(prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
    """Generate a completion for *prompt* with the loaded model.

    Only the newly generated tokens are decoded. Decoding the full sequence
    and splitting on a text marker echoes the prompt whenever the marker was
    cut off by input truncation or also appears inside the context.

    Args:
        prompt: Full prompt text; tokenized with truncation at 2048 tokens.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature. Values <= 0 select greedy decoding.

    Returns:
        The stripped completion, a fixed notice when no model is loaded, or
        a "Generation error: ..." string if generation raised.
    """
    if _model is None or _tokenizer is None:
        # Even if model missing, return a graceful message
        return "Model unavailable on this server. Try Demo Mode or enable Transformers."
    try:
        inputs = _tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        device = next(_model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "pad_token_id": _tokenizer.eos_token_id,
        }
        if temperature > 0.0:
            gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
        else:
            # Sampling-only parameters are left out for greedy decoding.
            gen_kwargs["do_sample"] = False

        outputs = _model.generate(**inputs, **gen_kwargs)
        # The returned sequence starts with the prompt tokens; skip them.
        prompt_len = inputs["input_ids"].shape[-1]
        completion_ids = outputs[0][prompt_len:]
        return _tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
    except Exception as e:
        logger.exception("Generation error")
        return f"Generation error: {e}"

# ============================================================
# DOCUMENT PROCESSING
# ============================================================
def process_document_content(raw: bytes, filename: str) -> str:
    """Extract cleaned text from an uploaded document.

    The parser is chosen from the filename extension: PyPDF2 for ``.pdf`` and
    python-docx for ``.docx`` (each only when the library imported), and a
    lenient text decode for everything else. Text that fails the
    ``is_clean_text`` check, and any parsing error, yields an empty string.
    """
    lowered = filename.lower()

    if lowered.endswith(".pdf") and PDF_AVAILABLE:
        try:
            kept_pages = []
            for page in PyPDF2.PdfReader(BytesIO(raw)).pages:
                page_text = clean_text(page.extract_text() or "")
                if is_clean_text(page_text):
                    kept_pages.append(page_text)
            return "\n\n".join(kept_pages)
        except Exception:
            # An unreadable PDF contributes no text rather than raw bytes.
            return ""

    if lowered.endswith(".docx") and DOCX_AVAILABLE:
        try:
            paragraphs = docx.Document(BytesIO(raw)).paragraphs
            body = clean_text("\n".join(p.text for p in paragraphs))
            return body if is_clean_text(body) else ""
        except Exception:
            return ""

    # Anything else is treated as text; undecodable bytes are skipped.
    try:
        decoded = clean_text(raw.decode(errors="ignore"))
        if is_clean_text(decoded):
            return decoded
        return ""
    except Exception:
        return ""

def chunk_text(text: str, chunk_size: int = 1300, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping windows and keep the readable ones.

    Windows are ``chunk_size`` characters long and start every
    ``chunk_size - overlap`` characters (at least 1). Each window is passed
    through ``clean_text`` and kept only if ``is_clean_text`` accepts it.

    Iteration stops once a window reaches the end of the text. Without that
    stop, one extra window is produced that lies entirely inside the
    previous one and only duplicates its tail.

    Args:
        text: Source text; empty input yields an empty list.
        chunk_size: Window length in characters.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The cleaned chunks in document order.
    """
    if not text:
        return []
    step = max(chunk_size - overlap, 1)
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        piece = clean_text(text[start:start + chunk_size])
        if is_clean_text(piece):
            chunks.append(piece)
        if start + chunk_size >= total:
            # This window already covered the end of the text.
            break
    return chunks

# ============================================================
# RAG RETRIEVAL
# ============================================================
def get_top_k_chunks_for_query(query: str, k: int = TOP_K, file_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
    """Return the *k* stored chunks most similar to *query*.

    Every chunk with a stored vector (optionally limited to ``file_ids``) is
    scored by cosine similarity against the query embedding. Chunks whose
    text fails ``is_clean_text`` are skipped; a chunk whose vector cannot be
    parsed or scored is kept with score 0.0.

    Args:
        query: Text to search for.
        k: Maximum number of chunks to return.
        file_ids: When non-empty, only chunks of these files are considered.

    Returns:
        Dicts with id, file_id, doc_name, chunk_index, text and score, sorted
        by descending score. Empty when embeddings are unavailable or the
        query cannot be embedded.
    """
    if not S2T_AVAILABLE:
        return []
    try:
        qvec = embed_text([query])[0]
    except Exception:
        return []

    if file_ids:
        placeholders = ",".join("?" for _ in file_ids)
        sql = f"SELECT id, file_id, doc_name, chunk_index, text, vector FROM chunks WHERE file_id IN ({placeholders}) AND vector IS NOT NULL"
        params = tuple(file_ids)
    else:
        sql = "SELECT id, file_id, doc_name, chunk_index, text, vector FROM chunks WHERE vector IS NOT NULL"
        params = ()

    conn = get_conn()
    try:
        rows = conn.execute(sql, params).fetchall()
    finally:
        # Close even if the query raises so the connection is not leaked.
        conn.close()

    candidates = []
    for r in rows:
        txt = r["text"] or ""
        if not is_clean_text(txt):
            continue
        try:
            score = cosine_sim(qvec, json.loads(r["vector"]))
        except Exception:
            score = 0.0
        candidates.append({
            "id": r["id"],
            "file_id": r["file_id"],
            "doc_name": r["doc_name"],
            "chunk_index": r["chunk_index"],
            "text": txt,
            "score": score
        })

    # Sort & cap
    candidates.sort(key=lambda c: c["score"], reverse=True)
    return candidates[:k]

def build_context_from_chunks(chunks: List[Dict[str, Any]]) -> str:
    """Render chunks as prompt context.

    Each chunk becomes a header line (file name, chunk index, score with four
    decimals) followed by its text; all parts are separated by blank lines.
    """
    sections = []
    for chunk in chunks:
        doc_name = chunk.get('doc_name', 'unknown')
        idx = chunk.get('chunk_index')
        score = chunk.get('score', 0)
        sections.extend((
            f"--- Chunk (file: {doc_name}, idx: {idx}, score: {score:.4f}) ---",
            chunk.get('text', ''),
        ))
    return "\n\n".join(sections)

def conversation_history_for_thread(cur, thread_id: int, limit: int = 12) -> str:
    """Return the latest *limit* messages of a thread as "role: content" lines.

    The newest messages are selected, then listed oldest-first. *cur* must
    yield rows that can be read by column name.
    """
    cur.execute("SELECT role, content FROM messages WHERE thread_id=? ORDER BY created_at DESC LIMIT ?", (thread_id, limit))
    newest_first = cur.fetchall()
    return "\n".join(
        f"{row['role']}: {row['content']}" for row in reversed(newest_first)
    )

def build_prompt(user_prompt: str, context: Optional[str] = None) -> str:
    """Assemble the model prompt.

    Order: system prompt, optional context, then the instruction block that
    ends with the '### Response:' marker. Sections are separated by blank
    lines; a falsy context is omitted.
    """
    instruction_block = f"### Instruction:\n{user_prompt}\n\n### Response:"
    if context:
        sections = [SYSTEM_PROMPT, context, instruction_block]
    else:
        sections = [SYSTEM_PROMPT, instruction_block]
    return "\n\n".join(sections)

# ============================================================
# FASTAPI APP
# ============================================================
# The ASGI application object; endpoints are registered on it below.
app = FastAPI(title="LocalForge AI Backend - RAG One-file", version="5.3.0")
# CORS is fully open: any origin, any method and any header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ============================================================
# MODELS
# ============================================================
# Request body for POST /api/chat.
class ChatRequest(BaseModel):
    # Existing thread to continue; a new thread is created when this is falsy.
    thread_id: Optional[int] = None
    # The user's message.
    prompt: str
    # Upper bound on generated tokens.
    max_tokens: Optional[int] = 512
    # Sampling temperature; 0.0 selects greedy decoding.
    temperature: Optional[float] = 0.0
    # Retrieval toggle sent by the client (presumably enables/disables RAG;
    # confirm against the handler).
    use_rag: Optional[bool] = True
    # Inline attachments; the handler reads the "name" and base64 "content" keys.
    attached_files: Optional[List[Dict[str, Any]]] = []
    # When given, retrieval is limited to chunks of these uploaded files.
    file_ids: Optional[List[int]] = None

# Request body for POST /api/filechat.
class FileChatRequest(BaseModel):
    # Existing thread to continue; a new thread is created when this is falsy.
    thread_id: Optional[int] = None
    # The user's message.
    prompt: str
    # Required list of uploaded-file ids (presumably the files the question
    # refers to; confirm how the handler consumes it).
    file_ids: List[int]
    # Upper bound on generated tokens.
    max_tokens: Optional[int] = 512
    # Sampling temperature; 0.0 selects greedy decoding.
    temperature: Optional[float] = 0.0
    # Retrieval toggle sent by the client (confirm against the handler).
    use_rag: Optional[bool] = True
    # Inline attachments; the handler reads the "name" and base64 "content" keys.
    attached_files: Optional[List[Dict[str, Any]]] = []

# ============================================================
# ENDPOINTS
# ============================================================
@app.get("/")
def root():
    """Unauthenticated status endpoint: backend identity, model and dependency flags."""
    dependency_status = {
        "transformers": HF_AVAILABLE,
        "sentence-transformers": S2T_AVAILABLE,
        "pyPDF2": PDF_AVAILABLE,
        "python-docx": DOCX_AVAILABLE,
        "pandas": PANDAS_AVAILABLE
    }
    feature_list = ["embedding-rag", "chunking", "background-embeddings", "filechat", "top-k-retrieval"]
    return {
        "status": "online",
        "backend": "LocalForge AI One-file RAG",
        "model": MODEL_NAME,
        "version": "5.3.0",
        "features": feature_list,
        "dependencies": dependency_status
    }

@app.get("/api/health")
def health(x_api_key: Optional[str] = Header(None)):
    """Authenticated health probe.

    Reports the stored chunk count, whether embeddings are available, and
    whether the generation model is loaded. When transformers is not
    installed the model is reported as loaded.
    """
    require_api_key(x_api_key)
    conn = get_conn()
    chunk_count = conn.execute("SELECT COUNT(*) as cnt FROM chunks").fetchone()["cnt"]
    conn.close()
    if HF_AVAILABLE:
        hf_loaded = _model is not None and _tokenizer is not None
    else:
        hf_loaded = True
    return {
        "status": "ok",
        "timestamp": time.time(),
        "hf_model_loaded": hf_loaded,
        "embeddings": S2T_AVAILABLE,
        "chunk_count": chunk_count
    }

# --- Threads ---
@app.post("/api/threads")
def create_thread(title: str = Form("New Conversation"), x_api_key: Optional[str] = Header(None)):
    """Create a conversation thread and return its id, title and creation time.

    The timestamp is taken once, so the value returned to the client is
    exactly the one stored in the database.

    Raises:
        HTTPException: 401 when the API key is invalid.
    """
    require_api_key(x_api_key)
    created_at = time.time()
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("INSERT INTO threads (title, created_at) VALUES (?,?)", (title, created_at))
        tid = cur.lastrowid
        conn.commit()
    finally:
        # Release the connection even if the insert fails.
        conn.close()
    return {"id": tid, "title": title, "created_at": created_at}

@app.get("/api/threads")
def list_threads(x_api_key: Optional[str] = Header(None)):
    """Return every thread as a dict, newest first."""
    require_api_key(x_api_key)
    conn = get_conn()
    threads = [
        dict(record)
        for record in conn.execute("SELECT * FROM threads ORDER BY created_at DESC").fetchall()
    ]
    conn.close()
    return threads

@app.get("/api/threads/{thread_id}")
def get_thread(thread_id: int, x_api_key: Optional[str] = Header(None)):
    """Return one thread together with its messages in chronological order.

    Raises:
        HTTPException: 401 when the API key is invalid, 404 when the thread
            does not exist.
    """
    require_api_key(x_api_key)
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("SELECT * FROM threads WHERE id=?", (thread_id,))
        thread = cur.fetchone()
        if not thread:
            raise HTTPException(status_code=404, detail="Thread not found")
        cur.execute("SELECT * FROM messages WHERE thread_id=? ORDER BY created_at", (thread_id,))
        messages = [dict(r) for r in cur.fetchall()]
    finally:
        # Also runs on the 404 path, which would otherwise leak the connection.
        conn.close()
    return {"thread": dict(thread), "messages": messages}

# ============================================================
# FILE MANAGEMENT
# ============================================================
def _embed_file_chunks(file_id: int) -> None:
    """Background job: embed every stored chunk of one file and save the vectors."""
    try:
        c = get_conn()
        try:
            rows = c.execute("SELECT id, text FROM chunks WHERE file_id=? ORDER BY chunk_index", (file_id,)).fetchall()
            for r in rows:
                vec = embed_text([r["text"]])[0]
                c.execute("UPDATE chunks SET vector=? WHERE id=?", (json.dumps(vec), r["id"]))
            c.commit()
        finally:
            c.close()
    except Exception:
        logger.exception("Background embedding job failed")


@app.post("/api/upload")
async def upload_file(file: UploadFile = File(...), x_api_key: Optional[str] = Header(None)):
    """Store an uploaded file, extract and chunk its text, and start embedding.

    The file is written under UPLOAD_DIR with a timestamp prefix. Only the
    base name of the client-supplied filename is used for the on-disk path,
    so directory components in the name cannot influence where the file is
    written. A missing filename falls back to "upload".

    Returns:
        A dict with the new file id, the filename, whether any text chunks
        were extracted, and ``embedding_job`` ("started" only when a
        background embedding thread was actually launched, else "none").

    Raises:
        HTTPException: 401 when the API key is invalid.
    """
    require_api_key(x_api_key)
    display_name = file.filename or "upload"
    # Normalise Windows separators before taking the base name.
    safe_name = os.path.basename(display_name.replace("\\", "/")) or "upload"
    stored_path = str(UPLOAD_DIR / f"{int(time.time())}_{safe_name}")

    contents = await file.read()
    with open(stored_path, "wb") as fh:
        fh.write(contents)

    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("INSERT INTO files (filename, filepath, created_at) VALUES (?,?,?)", (display_name, stored_path, time.time()))
        file_id = cur.lastrowid
        conn.commit()

        # process text and chunks (skip junk)
        try:
            text = process_document_content(contents, display_name)
            chunks = chunk_text(text)
            for idx, chunk in enumerate(chunks):
                cur.execute(
                    "INSERT INTO chunks (file_id, doc_name, chunk_index, text, vector, created_at) VALUES (?,?,?,?,?,?)",
                    (file_id, display_name, idx, chunk, None, time.time())
                )
            conn.commit()
            inserted_chunks = len(chunks)
        except Exception:
            logger.exception("Failed to extract/chunk uploaded file")
            inserted_chunks = 0
    finally:
        conn.close()

    # spawn background embedding if possible
    job_started = bool(S2T_AVAILABLE and inserted_chunks)
    if job_started:
        threading.Thread(target=_embed_file_chunks, args=(file_id,), daemon=True).start()

    return {
        "id": file_id,
        "filename": display_name,
        "text_extracted": inserted_chunks > 0,
        "embedding_job": "started" if job_started else "none",
    }

@app.get("/api/files")
def list_files(x_api_key: Optional[str] = Header(None)):
    """Return id, filename and creation time of every uploaded file, newest first."""
    require_api_key(x_api_key)
    conn = get_conn()
    listing = [
        dict(record)
        for record in conn.execute("SELECT id, filename, created_at FROM files ORDER BY created_at DESC").fetchall()
    ]
    conn.close()
    return listing

@app.get("/api/files/{file_id}/content")
def get_file_content(file_id: int, x_api_key: Optional[str] = Header(None)):
    """Return the content of an uploaded file.

    Files with a known text extension are returned as a decoded string
    (undecodable bytes skipped); everything else is sent as a file download
    with a guessed media type.

    Raises:
        HTTPException: 401 for a bad API key, 404 when the file record or the
            file on disk is missing.
    """
    require_api_key(x_api_key)
    conn = get_conn()
    record = conn.execute("SELECT filename, filepath FROM files WHERE id=?", (file_id,)).fetchone()
    conn.close()

    if not record:
        raise HTTPException(status_code=404, detail="File not found")

    filename = record["filename"]
    filepath = record["filepath"]
    if not os.path.exists(filepath):
        raise HTTPException(status_code=404, detail="File missing on disk")

    if Path(filepath).suffix.lower() in (".txt", ".md", ".py", ".json", ".csv", ".html", ".xml"):
        with open(filepath, "rb") as fh:
            raw_bytes = fh.read()
        return raw_bytes.decode(errors="ignore")

    guessed_type = mimetypes.guess_type(filepath)[0]
    return FileResponse(filepath, media_type=guessed_type or "application/octet-stream", filename=filename)

@app.post("/api/files/{file_id}/delete")
def delete_file(file_id: int, x_api_key: Optional[str] = Header(None)):
    """Delete an uploaded file from disk together with its chunk and file rows.

    Failure to remove the file from disk is logged and does not stop the
    database cleanup. The requested id is echoed back in the response.
    """
    require_api_key(x_api_key)
    conn = get_conn()
    cur = conn.cursor()
    record = cur.execute("SELECT filepath FROM files WHERE id=?", (file_id,)).fetchone()
    if record and os.path.exists(record["filepath"]):
        try:
            os.remove(record["filepath"])
        except Exception:
            logger.exception("Failed to remove file from disk")
    # Chunks first, then the file record itself.
    for statement in ("DELETE FROM chunks WHERE file_id=?", "DELETE FROM files WHERE id=?"):
        cur.execute(statement, (file_id,))
    conn.commit()
    conn.close()
    return {"deleted_file_id": file_id}

# ============================================================
# RAG & CHAT
# ============================================================
def _attached_file_chunks(attached_files: List[Dict[str, Any]], prompt: str) -> List[Dict[str, Any]]:
    """Decode, chunk and score inline attachments against *prompt*.

    Attachments whose content is not valid base64 are logged and skipped.
    Chunks get a cosine score when embeddings are available, else 0.0.
    """
    try:
        qvec = embed_text([prompt])[0] if S2T_AVAILABLE else None
    except Exception:
        qvec = None

    scored: List[Dict[str, Any]] = []
    for f in attached_files:
        name = f.get("name") or "attached"
        try:
            raw = base64.b64decode(f.get("content", ""))
        except (ValueError, TypeError):
            # binascii.Error is a ValueError subclass.
            logger.warning("Skipping attachment %r: content is not valid base64", name)
            continue
        chs = chunk_text(process_document_content(raw, name))
        if not chs:
            continue
        if S2T_AVAILABLE and qvec is not None:
            scores = [cosine_sim(qvec, vec) for vec in embed_text(chs)]
        else:
            scores = [0.0] * len(chs)
        for idx, (txt, score) in enumerate(zip(chs, scores)):
            scored.append({"id": None, "file_id": None, "doc_name": name,
                           "chunk_index": idx, "text": txt, "score": score})
    return scored


@app.post("/api/chat")
def chat(req: ChatRequest, x_api_key: Optional[str] = Header(None)):
    """Answer a chat message, optionally grounded in retrieved document chunks.

    Steps: store the user message (creating a thread when none is given),
    gather recent conversation history, retrieve the top stored chunks
    unless the request sets ``use_rag`` to false, add chunks from inline
    attachments, de-duplicate and cap the context, generate the reply, and
    store it.

    Returns:
        A dict with thread_id, answer, rag_used, files_processed,
        files_attached and rag_details.

    Raises:
        HTTPException: 401 when the API key is invalid.
    """
    require_api_key(x_api_key)
    conn = get_conn()
    try:
        cur = conn.cursor()

        # Create thread if missing
        if not req.thread_id:
            cur.execute("INSERT INTO threads (title, created_at) VALUES (?,?)", (req.prompt[:80], time.time()))
            req.thread_id = cur.lastrowid

        # Store user message
        cur.execute("INSERT INTO messages (thread_id, role, content, created_at) VALUES (?,?,?,?)",
                    (req.thread_id, "user", req.prompt, time.time()))
        conn.commit()

        conv_ctx = conversation_history_for_thread(cur, req.thread_id, limit=6)

        combined_chunks: List[Dict[str, Any]] = []

        # Honour the client's use_rag flag; an explicit null keeps the default (on).
        rag_enabled = req.use_rag is not False
        if rag_enabled and S2T_AVAILABLE:
            try:
                combined_chunks = get_top_k_chunks_for_query(req.prompt, k=TOP_K, file_ids=req.file_ids)
            except Exception:
                logger.exception("chat retrieval failed")

        # attach files (base64) if provided
        if req.attached_files:
            combined_chunks.extend(_attached_file_chunks(req.attached_files, req.prompt))

        # STRONG DEDUPE after merging
        dedup_map: Dict[tuple, Dict[str, Any]] = {}
        for c in combined_chunks:
            key = (c.get("id"), c.get("doc_name"), c.get("chunk_index"))
            # prefer higher score if duplicates occur
            if key not in dedup_map or (c.get("score", 0) > dedup_map[key].get("score", 0)):
                dedup_map[key] = c
        combined_chunks = sorted(dedup_map.values(), key=lambda x: x.get("score", 0), reverse=True)[:MAX_CONTEXT_CHUNKS]

        context_text = build_context_from_chunks(combined_chunks) if combined_chunks else ""
        full_context_parts = []
        if conv_ctx.strip():
            full_context_parts.append("### Conversation History:\n" + conv_ctx)
        if context_text.strip():
            full_context_parts.append("### Retrieved Documents:\n" + context_text)
        final_context = "\n\n".join(full_context_parts).strip() if full_context_parts else None

        final_prompt = build_prompt(req.prompt, context=final_context)
        reply = model_generate(final_prompt, max_tokens=req.max_tokens or 512, temperature=req.temperature or 0.0)

        # Store assistant reply
        cur.execute("INSERT INTO messages (thread_id, role, content, created_at) VALUES (?,?,?,?)",
                    (req.thread_id, "assistant", reply, time.time()))
        conn.commit()
    finally:
        # Release the connection on every path, including unexpected errors.
        conn.close()

    return {
        "thread_id": req.thread_id,
        "answer": reply,
        "rag_used": bool(combined_chunks),
        "files_processed": len(set(c.get("file_id") for c in combined_chunks if c.get("file_id") is not None)),
        "files_attached": len(req.attached_files) if req.attached_files else 0,
        "rag_details": f"Retrieved {len(combined_chunks)} clean chunks"
    }

@app.post("/api/filechat")
def filechat(req: FileChatRequest, x_api_key: Optional[str] = Header(None)):
    """Answer a chat message using the full text of inline attachments.

    The user message is stored (creating a thread when none is given),
    recent history is gathered, each attachment is base64-decoded and its
    extracted text is placed into the prompt without retrieval or
    embeddings. Attachments with invalid base64 content are logged and
    skipped instead of failing the request.

    NOTE(review): ``req.file_ids`` and ``req.use_rag`` are accepted but not
    read by this handler; confirm whether previously uploaded files are
    meant to contribute context here.

    Returns:
        A dict with thread_id, answer and files_attached.

    Raises:
        HTTPException: 401 when the API key is invalid.
    """
    require_api_key(x_api_key)
    conn = get_conn()
    try:
        cur = conn.cursor()

        # Create thread if missing
        if not req.thread_id:
            cur.execute("INSERT INTO threads (title, created_at) VALUES (?,?)", (req.prompt[:80], time.time()))
            req.thread_id = cur.lastrowid

        # Store user message
        cur.execute("INSERT INTO messages (thread_id, role, content, created_at) VALUES (?,?,?,?)",
                    (req.thread_id, "user", req.prompt, time.time()))
        conn.commit()

        # Conversation history
        conv_ctx = conversation_history_for_thread(cur, req.thread_id, limit=6)

        # Extract text from attached files (no embeddings)
        file_texts = []
        for f in req.attached_files or []:
            name = f.get("name") or "attached"
            try:
                raw = base64.b64decode(f.get("content", ""))
            except (ValueError, TypeError):
                # binascii.Error is a ValueError subclass.
                logger.warning("Skipping attachment %r: content is not valid base64", name)
                continue
            file_texts.append(process_document_content(raw, name))

        combined_file_text = "\n\n".join(file_texts).strip() if file_texts else ""

        # Build final prompt: user prompt + conversation history + extracted file text
        full_context_parts = []
        if conv_ctx.strip():
            full_context_parts.append("### Conversation History:\n" + conv_ctx)
        if combined_file_text:
            full_context_parts.append("### Attached File Content:\n" + combined_file_text)

        final_context = "\n\n".join(full_context_parts).strip() if full_context_parts else None
        final_prompt = build_prompt(req.prompt, context=final_context)

        # Generate model reply
        reply = model_generate(final_prompt, max_tokens=req.max_tokens or 512, temperature=req.temperature or 0.0)

        # Store assistant reply
        cur.execute("INSERT INTO messages (thread_id, role, content, created_at) VALUES (?,?,?,?)",
                    (req.thread_id, "assistant", reply, time.time()))
        conn.commit()
    finally:
        # Release the connection on every path, including unexpected errors.
        conn.close()

    return {
        "thread_id": req.thread_id,
        "answer": reply,
        "files_attached": len(req.attached_files) if req.attached_files else 0
    }

# Startup log
logger.info("LocalForge AI One-file RAG backend ready. DB: %s, Upload dir: %s", DB_PATH, UPLOAD_DIR)
'''

# The backend source contains non-ASCII characters, so the encoding is fixed
# explicitly instead of depending on the platform's default.
with open(f"{BACKEND_DIR}/main.py", "w", encoding="utf-8") as f:
    f.write(backend_code)

print(f"\n Backend code written to {BACKEND_DIR}/main.py")
# Report the encoded size; len() of the string would count characters, not bytes.
print(f" File size: {len(backend_code.encode('utf-8'))} bytes\n")

In [None]:
# ============================================================
# CELL C: Start Server with Tunnel and Capture Output
# ============================================================

import os
import subprocess
import time
import sys
import threading
import requests
from pyngrok import ngrok
import torch

# Move to backend directory
# Run from the backend directory so "main:app" can be imported by uvicorn.
BACKEND_DIR = "/kaggle/working/localforge_backend"
if not os.path.exists(BACKEND_DIR):
    print(f"Warning: Directory {BACKEND_DIR} not found. Running in current dir.")
else:
    os.chdir(BACKEND_DIR)

# Settings come from the environment, with fallbacks when a variable is unset.
PORT = os.environ.get("PORT", "8000")
API_KEY = os.environ.get("BACKEND_API_KEY", "test_key")
HF_MODEL = os.environ.get("HF_MODEL", "default_model")

print("\n" + "=" * 70)
print(" STARTING LocalForge AI SERVER")
print("=" * 70)

# Configure ngrok
NGROK_TOKEN = "PlaceHolder"   # ⚠️ Configuration: Change this token to your own NGROK Token!
ngrok.set_auth_token(NGROK_TOKEN)

# Report the accelerator, if any.
if not torch.cuda.is_available():
    print("  No GPU detected")
else:
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f" GPU: {torch.cuda.get_device_name(0)}")
    print(f" Memory: {gpu_memory:.2f} GB")
    print(" T4 optimizations: ACTIVE")

print(f"\n Model: {HF_MODEL}")
print(f" Port: {PORT}")

# Start FastAPI server
print(f"\n Starting FastAPI server...\n")
# Launch uvicorn with the interpreter running this kernel (sys.executable)
# rather than whatever "python3" resolves to on PATH, so the server sees the
# same installed packages as the notebook. stderr is merged into stdout and
# the pipe is line-buffered text so logs can be streamed line by line.
uvicorn_proc = subprocess.Popen(
    [sys.executable, "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", PORT],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1
)

# Stream output function
def stream_output(proc):
    """Copy the process's stdout to this process's stdout, line by line.

    Stops at end of stream (an empty read). Every line is flushed right away
    so it shows up in the notebook without delay. Any error is reported with
    a single printed message instead of being raised.
    """
    try:
        read_line = proc.stdout.readline
        while True:
            line = read_line()
            if line == "":
                break
            sys.stdout.write(line)
            sys.stdout.flush()  # Force Kaggle to print immediately
    except Exception as e:
        print(f" Output streaming error: {e}")

# Start the logging thread
log_thread = threading.Thread(
    target=stream_output,
    args=(uvicorn_proc,),
    daemon=True,
)
log_thread.start()

# Fixed pause to give the server time to start before the tunnel is opened.
time.sleep(5)

# Open the public tunnel; a failure is reported but does not stop the cell.
try:
    public_url = ngrok.connect(PORT)
    print("\n" + "=" * 70)
    print("  LocalForge AI Server IS READY!")
    print("=" * 70)
    print("\n Copy this URL for your VS Code extension:")
    print(f" Public URL: {public_url}\n API Key: {API_KEY}")
    print("\n Keep this cell running!")
    print("=" * 70)
except Exception as e:
    print(f"Ngrok error: {e}")

# Health check function
def check_server_health(retries=5, delay=3):
    """Poll the local /api/health endpoint until it answers with HTTP 200.

    Each failed attempt is reported (status code or error) instead of being
    silently ignored, and there is no pause after the final attempt.

    Args:
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.
    """
    print("\n Performing Health Check...")
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(
                f"http://localhost:{PORT}/api/health",
                headers={"x-api-key": API_KEY},
                timeout=10
            )
            if response.status_code == 200:
                health = response.json()
                print(f" Health Check Passed!")
                print(f"   Status: {health.get('status')}")
                print(f"   Model Loaded: {health.get('hf_model_loaded')}")
                return
            print(f" Health check failed: {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            # Connection problems, timeouts, or a body that is not valid JSON.
            print(f" Health check attempt {attempt}/{retries} failed: {e}")
        if attempt < retries:
            time.sleep(delay)
    print("  Health check failed after retries (Server might still be loading model)")

check_server_health()

# ============================================================
# BLOCKING LOOP
# ============================================================
print("\nLogs are streaming below. Stop cell to terminate server.\n")

# Keep the cell alive while the server runs. Cleanup happens in `finally`, so
# the tunnel is also closed when the server process exits by itself, not only
# when the cell is interrupted.
try:
    while uvicorn_proc.poll() is None:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping server...")
finally:
    if uvicorn_proc.poll() is None:
        uvicorn_proc.terminate()
    ngrok.kill()
    print("Server stopped.")