# 🧪 Chatbot RAG + S3 + Telegram
**Listo para Google Colab**

### Novedades clave
- **LLM seleccionable**: OpenAI *o* Gemini. Si no hay API key, cae en *modo fragmento más relevante*.
- **Persistencia real** en S3: índice FAISS guardado/cargado por equipo + reconstrucción desde `docs/`.
- **Telegram completo**: los clientes escriben al bot, el admin ve todos los chats, activa/desactiva **Auto-responder** y puede **intervenir** en cualquier momento.
- **Transcripciones** por `chat_id` en `mini_chatbot_work/logs/`.


## 0) Instalación

In [1]:
# %%capture
!pip -q install sentence-transformers faiss-cpu pypdf gradio boto3 openai==1.* tiktoken requests google-generativeai


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## 1) Configuración

In [2]:
# A) Entrada interactiva
import os
from getpass import getpass
from google.colab import userdata

def _colab_secret(name):
    """Return the Colab secret *name*, or None when it cannot be read.

    `userdata.get` raises (secret not found / notebook access not granted)
    instead of returning None, which would abort this cell even though a
    missing LLM key is a supported configuration (fragment-only mode).
    """
    try:
        return userdata.get(name)
    except Exception:
        # Best effort: any failure to read a secret simply means "no key".
        return None


# --- AWS / S3 ---
AWS_REGION = os.getenv("AWS_REGION", "us-east-2")
S3_BUCKET = os.getenv("S3_BUCKET", "talentotech2025")
S3_PREFIX = os.getenv("S3_PREFIX", "IA-Innovador/")  # base prefix of the course

# --- LLM ---
LLM_PROVIDER = (os.getenv("LLM_PROVIDER") or "openai").lower()  # "openai" or "gemini"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or _colab_secret("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL") or "gpt-4o-mini"
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or _colab_secret("GOOGLE_API_KEY")
GEMINI_MODEL = os.getenv("GEMINI_MODEL") or "gemini-1.5-flash"

# Ask for AWS credentials interactively only when the environment lacks them.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") or input("AWS_ACCESS_KEY_ID: ").strip()
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") or getpass("AWS_SECRET_ACCESS_KEY (oculto): ").strip()
AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN")  # optional (STS / Academy sessions only)

# Persist the settings in the process environment so later cells can read them.
env = {
    "AWS_REGION": AWS_REGION,
    "S3_BUCKET": S3_BUCKET,
    "S3_PREFIX": S3_PREFIX,
    "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
    "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
}
if AWS_SESSION_TOKEN:
    env["AWS_SESSION_TOKEN"] = AWS_SESSION_TOKEN
for k, v in env.items():
    os.environ[k] = v

# LLM settings: API keys are exported only when present.
os.environ["LLM_PROVIDER"] = LLM_PROVIDER
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["OPENAI_MODEL"] = OPENAI_MODEL
if GOOGLE_API_KEY:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
os.environ["GEMINI_MODEL"] = GEMINI_MODEL

print("✔ AWS:", AWS_REGION, S3_BUCKET, S3_PREFIX)

if OPENAI_API_KEY or GOOGLE_API_KEY:
    print("✅ API Key(s) cargada(s) correctamente.")
    print("✔ LLM provider:", LLM_PROVIDER, "| OpenAI model:", OPENAI_MODEL, "| Gemini model:", GEMINI_MODEL)
else:
    print("⚠️ No se detectaron API Keys. El chat funcionará en modo 'fragmento más relevante'.")
    print("✔ LLM provider: Fragmento más relevante")

AWS_ACCESS_KEY_ID: AKIA················ (redactado: no publicar identificadores de credenciales en la salida del notebook)
AWS_SECRET_ACCESS_KEY (oculto): ··········
✔ AWS: us-east-2 talentotech2025 IA-Innovador/
✅ API Key(s) cargada(s) correctamente.
✔ LLM provider: openai | OpenAI model: gpt-4o-mini | Gemini model: gemini-1.5-flash


## 2) S3 helpers (autoregión, listar carpetas/archivos, sync, prefijo efectivo)

In [3]:
import os, boto3, datetime, re, json, time, threading, requests
from pathlib import Path
from botocore.exceptions import ClientError

def norm_prefix(p: str) -> str:
    """Normalize an S3 key prefix.

    None becomes "". Otherwise surrounding whitespace is stripped,
    backslashes become forward slashes, leading slashes are dropped and a
    non-empty result is guaranteed to end with "/".
    """
    if p is None:
        return ""
    cleaned = p.strip().replace("\\", "/").lstrip("/")
    needs_slash = bool(cleaned) and not cleaned.endswith("/")
    return cleaned + "/" if needs_slash else cleaned

def get_bucket_region(bucket: str) -> str:
    """Return the AWS region name where *bucket* lives.

    GetBucketLocation reports a null LocationConstraint for us-east-1 and
    the legacy value "EU" for eu-west-1; both are translated to real region
    names. Any other value is returned unchanged.
    """
    s3g = boto3.client("s3")
    loc = s3g.get_bucket_location(Bucket=bucket).get("LocationConstraint")
    if not loc:
        return "us-east-1"
    if loc == "EU":
        # Legacy alias: buckets reporting "EU" reside in eu-west-1, not us-east-1.
        return "eu-west-1"
    return loc

def s3_client_autoregion(bucket: str):
    """Build an S3 client bound to the bucket's own region.

    When the region lookup fails for any reason, the AWS_REGION environment
    variable (default "us-east-2") is used instead.
    """
    region_name = None
    try:
        region_name = get_bucket_region(bucket)
    except Exception:
        region_name = os.getenv("AWS_REGION", "us-east-2")
    return boto3.client("s3", region_name=region_name)

def s3_list_objects(bucket: str, prefix: str, delimiter: str=None):
    """List everything under *prefix* in *bucket*.

    Returns a `(keys, folders)` pair: object keys from "Contents" and, when
    a *delimiter* is given, the grouped "CommonPrefixes". All result pages
    are consumed.
    """
    client = s3_client_autoregion(bucket)
    request = {"Bucket": bucket, "Prefix": norm_prefix(prefix)}
    if delimiter:
        request["Delimiter"] = delimiter
    keys, folders = [], []
    pages = client.get_paginator("list_objects_v2").paginate(**request)
    for page in pages:
        keys.extend(entry["Key"] for entry in page.get("Contents", []) or [])
        folders.extend(group["Prefix"] for group in page.get("CommonPrefixes", []) or [])
    return keys, folders

def s3_list_immediate_folders(bucket: str, base_prefix: str):
    """Return the sorted, de-duplicated names of the folders directly under *base_prefix*."""
    _keys, sub_prefixes = s3_list_objects(bucket, norm_prefix(base_prefix), delimiter="/")
    if not sub_prefixes:
        return []
    names = set()
    for sub_prefix in sub_prefixes:
        # Prefixes end with "/", so the folder name is the second-to-last segment.
        names.add(sub_prefix.split("/")[-2])
    return sorted(names)

def s3_sync_docs_to_local(bucket: str, prefix_docs: str, local_folder: str):
    """Download every object under *prefix_docs* into *local_folder*.

    The key path relative to the prefix is mirrored locally; keys ending in
    "/" (folder markers) are skipped. Returns the number of files downloaded.
    """
    client = s3_client_autoregion(bucket)
    prefix_docs = norm_prefix(prefix_docs)
    target_root = Path(local_folder)
    target_root.mkdir(parents=True, exist_ok=True)
    downloaded = 0
    pages = client.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix_docs)
    for page in pages:
        for entry in page.get("Contents", []) or []:
            key = entry["Key"]
            if key.endswith("/"):
                continue
            destination = target_root / key[len(prefix_docs):]
            destination.parent.mkdir(parents=True, exist_ok=True)
            client.download_file(bucket, key, str(destination))
            downloaded += 1
    return downloaded

def s3_sync_local_docs_to_s3(bucket: str, prefix_docs: str, local_folder: str):
    """Upload every file under *local_folder* (recursively) to *prefix_docs*.

    Each key is the normalized prefix plus the file's POSIX path relative to
    *local_folder*. Returns the number of files uploaded.
    """
    client = s3_client_autoregion(bucket)
    prefix_docs = norm_prefix(prefix_docs)
    uploaded = 0
    for folder, _subdirs, filenames in os.walk(local_folder):
        for filename in filenames:
            local_path = Path(folder) / filename
            relative_key = local_path.relative_to(local_folder).as_posix()
            client.upload_file(str(local_path), bucket, prefix_docs + relative_key)
            uploaded += 1
    return uploaded

print("✔ Helpers S3 OK")

✔ Helpers S3 OK


## 3) Núcleo RAG (loaders → chunking → FAISS → retrieval)

In [4]:
import os, json, uuid, shutil
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple
import pandas as pd
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import numpy as np, faiss, time

# Working tree: everything lives under ./mini_chatbot_work
BASE_DIR = Path.cwd() / "mini_chatbot_work"
DOCS_DIR, INDEX_DIR, LOGS_DIR = (BASE_DIR / sub for sub in ("docs_raw", "faiss_index", "logs"))
META_PATH = BASE_DIR / "docs_metadata.json"

# Create the folders up front so later cells can write into them directly.
for _work_dir in (BASE_DIR, DOCS_DIR, INDEX_DIR, LOGS_DIR):
    _work_dir.mkdir(parents=True, exist_ok=True)

def load_txt(p: Path)->str:
    """Read *p* as UTF-8 text, dropping undecodable bytes.

    Never raises: on failure an "[ERROR TXT] ..." string is returned.
    """
    try:
        content = p.read_text(encoding="utf-8", errors="ignore")
    except Exception as exc:
        return f"[ERROR TXT] {exc}"
    return content

def load_pdf(p: Path)->str:
    """Extract the text of every page of the PDF at *p*, joined by newlines.

    Pages with no extractable text contribute an empty string. Never raises:
    on failure an "[ERROR PDF] ..." string is returned.
    """
    try:
        reader = PdfReader(str(p))
        page_texts = []
        for page in reader.pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception as exc:
        return f"[ERROR PDF] {exc}"

def load_csv(p: Path, n=1500)->str:
    """Summarize the CSV at *p* as text: file name, column list and a 20-row sample.

    At most *n* rows are read. UTF-8 is tried first, then latin-1. Like the
    other loaders this never raises: any failure (missing file, empty file,
    parse error) yields an "[ERROR CSV] ..." string, so one bad file cannot
    abort a whole index build.
    """
    try:
        try:
            df = pd.read_csv(p, nrows=n)
        except UnicodeDecodeError:
            df = pd.read_csv(p, nrows=n, encoding="latin-1")
        sample = df.head(20)
        try:
            table = sample.to_markdown(index=False)
        except ImportError:
            # to_markdown needs the optional `tabulate` package; degrade to plain text.
            table = sample.to_string(index=False)
    except Exception as e:
        return f"[ERROR CSV] {e}"
    return "\n".join([f"# CSV: {p.name}", f"Columnas: {list(df.columns)}", "Muestra:\n" + table])

def load_any(p: Path)->str:
    """Dispatch to the loader matching the (case-insensitive) file extension.

    .txt/.md, .pdf and .csv are supported; anything else yields a
    "[BINARIO] ..." placeholder string.
    """
    suffix = p.suffix.lower()
    if suffix == ".pdf":
        return load_pdf(p)
    if suffix == ".csv":
        return load_csv(p)
    if suffix in (".txt", ".md"):
        return load_txt(p)
    return f"[BINARIO] {p.name} (no indexado)"

@dataclass
class ChunkedDoc:
    """One chunk of a source document, as stored alongside the FAISS index."""

    doc_id: str       # id of the document this chunk came from
    source_name: str  # file name of the source document
    chunk_id: int     # position of the chunk within its document
    text: str         # chunk content

def chunk_text(text:str, chunk_size:int=800, overlap:int=150)->List[str]:
    """Split *text* into overlapping windows of whitespace-separated words.

    Each window holds up to *chunk_size* words and starts
    `max(1, chunk_size - overlap)` words after the previous one.
    """
    words = text.split()
    stride = max(1, chunk_size - overlap)
    return [" ".join(words[start:start + chunk_size]) for start in range(0, len(words), stride)]

EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Module-level caches: embedding model, FAISS index and the chunks it indexes.
_emb = None
_index = None
_chunks: List[ChunkedDoc] = []


def get_model():
    """Return the shared SentenceTransformer, creating it on first use."""
    global _emb
    if _emb is not None:
        return _emb
    _emb = SentenceTransformer(EMB_MODEL_NAME)
    return _emb

def build_index_from_local(paths:List[Path], chunk_size=800, overlap=150):
    """Chunk, embed and index the files in *paths*, then persist the result.

    Writes `faiss.index` and `chunks.json` under INDEX_DIR and the document
    metadata to META_PATH, and replaces the in-memory index.

    Returns:
        (number of chunks, number of documents).

    Raises:
        ValueError: when the inputs produce no chunks at all (nothing to
            embed), instead of failing later with an opaque IndexError.
    """
    global _index, _chunks
    metas = []
    chunks = []
    for p in paths:
        raw = load_any(p)
        did = str(uuid.uuid4())
        metas.append({"doc_id": did, "source_name": p.name, "path": str(p)})
        chunks.extend(
            ChunkedDoc(doc_id=did, source_name=p.name, chunk_id=i, text=ch)
            for i, ch in enumerate(chunk_text(raw, chunk_size, overlap))
        )
    if not chunks:
        raise ValueError("No hay texto para indexar: no se generó ningún chunk.")

    X = get_model().encode([c.text for c in chunks], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    idx = faiss.IndexFlatIP(X.shape[1])
    idx.add(X)
    # Publish index and chunks together, only once both exist, so a failed
    # build can never leave retrieve() with chunks that don't match the index.
    _index, _chunks = idx, chunks

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(idx, str(INDEX_DIR / "faiss.index"))
    with open(INDEX_DIR / "chunks.json", "w", encoding="utf-8") as fh:
        json.dump([c.__dict__ for c in chunks], fh, ensure_ascii=False)
    with open(META_PATH, "w", encoding="utf-8") as fh:
        json.dump(metas, fh, ensure_ascii=False, indent=2)
    return len(chunks), len(metas)

def load_index_local()->bool:
    """Load the persisted FAISS index and chunk list from INDEX_DIR.

    Returns False (leaving the in-memory state untouched) when either file
    is missing or when the index size does not match the number of chunks,
    since a mismatched pair would make retrieve() look up wrong or
    out-of-range chunks. Returns True after a successful load.
    """
    global _index, _chunks
    fidx = INDEX_DIR / "faiss.index"
    fch = INDEX_DIR / "chunks.json"
    if not (fidx.exists() and fch.exists()):
        return False
    index = faiss.read_index(str(fidx))
    with open(fch, "r", encoding="utf-8") as fh:
        data = json.load(fh)
    chunks = [ChunkedDoc(**d) for d in data]
    if index.ntotal != len(chunks):
        return False
    _index, _chunks = index, chunks
    return True

def retrieve(q:str, top_k:int=4)->List[Tuple[float,ChunkedDoc]]:
    """Return up to *top_k* `(score, chunk)` pairs for the query *q*.

    Yields an empty list when no index is loaded. Negative ids returned by
    the index search (unfilled result slots) are skipped.
    """
    if _index is None or not _chunks:
        return []
    query_vec = get_model().encode([q], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores, ids = _index.search(query_vec, top_k)
    return [(float(score), _chunks[pos]) for score, pos in zip(scores[0], ids[0]) if pos >= 0]

def log_event(chat_id: str, role: str, text: str):
    """Append one timestamped record to the chat's JSON-lines transcript in LOGS_DIR."""
    log_path = LOGS_DIR / f"{chat_id}.jsonl"
    entry = {"t": time.time(), "role": role, "text": text}
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("✔ Núcleo RAG OK")

✔ Núcleo RAG OK


## 4) LLM wrapper (OpenAI o Gemini)

In [5]:
import os
from typing import Optional

# System prompt shared by both providers: answer only from the retrieved
# context, in Spanish, and list the sources.
SYS_PROMPT = " ".join([
    "Eres un asistente para preguntas y respuestas basado en archivos cargados por el usuario.",
    "Responde SOLO con la información del contexto.",
    "Si la respuesta no está en el contexto di: 'No encuentro esa información en mis archivos'.",
    "Responde en español y añade una sección 'Fuentes'.",
])

def format_context(hits):
    """Render `(score, chunk)` hits as prompt context.

    One entry per hit, separated by blank lines; chunk text longer than 350
    characters is cut and suffixed with an ellipsis.
    """
    def _render(score, chunk):
        body = chunk.text
        if len(body) > 350:
            body = body[:350] + "…"
        return f"[{chunk.source_name} | score={score:.3f}] {body}"

    return "\n\n".join(_render(score, chunk) for score, chunk in hits)

class LLMClient:
    """Thin wrapper over OpenAI or Gemini, with a no-LLM fallback.

    The provider and model names are read from the environment at
    construction time. SDK clients are created lazily on first use. When the
    selected provider has no API key, `generate` returns the most relevant
    fragment instead of calling a model.
    """

    def __init__(self):
        self.provider = (os.getenv("LLM_PROVIDER") or "openai").lower()
        self.ok_openai = bool(os.getenv("OPENAI_API_KEY"))
        self.ok_gemini = bool(os.getenv("GOOGLE_API_KEY"))
        self.oai_model = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
        self.gem_model = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")
        self._oai = None
        self._gem = None

    def _ensure_openai(self):
        """Create the OpenAI client if needed; return False when no key is set."""
        if not self.ok_openai:
            return False
        if self._oai is None:
            from openai import OpenAI
            self._oai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        return True

    def _ensure_gemini(self):
        """Create the Gemini model handle if needed; return False when no key is set."""
        if not self.ok_gemini:
            return False
        if self._gem is None:
            import google.generativeai as genai
            genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
            self._gem = genai.GenerativeModel(self.gem_model)
        return True

    def generate(self, question: str, hits, temperature: float=0.2, max_tokens: int=400) -> str:
        """Answer *question* using the retrieved *hits* as context.

        Args:
            question: user question.
            hits: `(score, chunk)` pairs; chunks expose `.text` and `.source_name`.
            temperature: sampling temperature, applied to both providers.
            max_tokens: output token cap, applied to both providers.

        Returns:
            The model's answer, or the top fragment (first 1000 characters)
            prefixed with a warning when no LLM is available.
        """
        if self.provider == "openai" and self._ensure_openai():
            ctx = format_context(hits)
            msgs = [
                {"role": "system", "content": SYS_PROMPT},
                {"role": "user", "content": f"Pregunta: {question}\n\nContexto:\n{ctx}\n\nResponde SOLO con lo anterior."},
            ]
            resp = self._oai.chat.completions.create(model=self.oai_model, messages=msgs,
                                                     temperature=float(temperature), max_tokens=int(max_tokens))
            # content may be None (e.g. filtered output); never call .strip() on None.
            return (resp.choices[0].message.content or "").strip()
        if self.provider == "gemini" and self._ensure_gemini():
            ctx = format_context(hits)
            prompt = f"{SYS_PROMPT}\n\nPregunta: {question}\n\nContexto:\n{ctx}\n\nResponde SOLO con lo anterior."
            # Forward the sampling settings; previously they were silently ignored for Gemini.
            out = self._gem.generate_content(
                prompt,
                generation_config={"temperature": float(temperature), "max_output_tokens": int(max_tokens)},
            )
            return (out.text or "").strip()
        # Fallback without an LLM: the context string is not needed here.
        best = hits[0][1].text if hits else "No hay índice cargado."
        return "⚠️ Sin LLM: fragmento más relevante:\n\n" + best[:1000]

# Shared client used by rag_answer.
LLM = LLMClient()


def rag_answer(question: str, top_k:int=4, temperature:float=0.2):
    """Retrieve context for *question* and generate an answer.

    Returns `(answer_text, source_names)`. When nothing is retrieved, the
    answer asks the user to create/load an index and the source list is empty.
    """
    matches = retrieve(question, top_k=top_k)
    if matches:
        answer = LLM.generate(question, matches, temperature)
        return answer, [match[1].source_name for match in matches]
    return "Primero crea/carga un índice.", []

## 5) Cargar/Guardar índice en S3 + reconstrucción desde docs en S3

In [6]:
def effective_team_prefix(base_prefix: str, team_folder: str):
    """Join the normalized base prefix and the normalized team folder into one S3 prefix."""
    base_part = norm_prefix(base_prefix)
    team_part = norm_prefix(team_folder)
    return base_part + team_part

def s3_upload_index(bucket: str, base_prefix: str, team_folder: str):
    """Upload the local index files to the team prefix and to its `index/` subfolder.

    `docs_metadata.json` is uploaded too when it exists. Returns a status
    message; nothing is uploaded when either index file is missing locally.
    """
    root_prefix = effective_team_prefix(base_prefix, team_folder)
    sub_prefix = root_prefix + "index/"
    index_files = ("faiss.index", "chunks.json")
    if not all((INDEX_DIR / name).exists() for name in index_files):
        return "❌ No hay índice local (faiss.index / chunks.json). Construye primero."
    client = s3_client_autoregion(bucket)
    for destination in (root_prefix, sub_prefix):
        for name in index_files:
            client.upload_file(str(INDEX_DIR / name), bucket, f"{destination}{name}")
        if META_PATH.exists():
            client.upload_file(str(META_PATH), bucket, f"{destination}docs_metadata.json")
    return f"☁️ Subido a: s3://{bucket}/{root_prefix}  y  s3://{bucket}/{sub_prefix}"

def _download_index_from_prefix(bucket: str, prefix: str):
    """Download any index artifacts found under *prefix*; return the set of names found.

    Keys are matched by their final path segment. The two index files go to
    INDEX_DIR and `docs_metadata.json` goes to META_PATH.
    """
    client = s3_client_autoregion(bucket)
    wanted = ("faiss.index", "chunks.json", "docs_metadata.json")
    downloaded = set()
    object_keys, _folders = s3_list_objects(bucket, prefix)
    for key in object_keys:
        filename = key.rsplit("/", 1)[-1]
        if filename not in wanted:
            continue
        target = META_PATH if filename == "docs_metadata.json" else INDEX_DIR / filename
        target.parent.mkdir(parents=True, exist_ok=True)
        client.download_file(bucket, key, str(target))
        downloaded.add(filename)
    return downloaded

def s3_download_index(bucket: str, base_prefix: str, team_folder: str):
    """Fetch the team's index from S3 and load it into memory.

    The team prefix is tried first, then its `index/` subfolder. Returns a
    status message describing the outcome.
    """
    root_prefix = effective_team_prefix(base_prefix, team_folder)
    sub_prefix = root_prefix + "index/"
    required = {"faiss.index", "chunks.json"}

    found = _download_index_from_prefix(bucket, root_prefix)
    if not required <= found:
        found = _download_index_from_prefix(bucket, sub_prefix)
    if not required <= found:
        return f"❌ No encontré índice en {root_prefix} ni {sub_prefix}."
    if load_index_local():
        return "📥 Índice cargado."
    return "❌ Descargado pero falló carga local."

def s3_rebuild_from_docs(bucket: str, base_prefix: str, team_folder: str, chunk_size=800, overlap=150):
    """Download the team's `docs/` folder from S3 and rebuild the index from it.

    Returns a status message. The download mirrors nested S3 paths into
    subfolders, so local files are collected recursively and directories are
    excluded (a flat `glob('*')` would skip nested documents and feed folder
    entries to the indexer).
    """
    docs_prefix = effective_team_prefix(base_prefix, team_folder) + "docs/"
    count = s3_sync_docs_to_local(bucket, docs_prefix, str(DOCS_DIR))
    if count == 0:
        return "❌ No hay documentos en S3 (carpeta 'docs/'). Sube alguno primero."
    doc_paths = sorted(p for p in Path(DOCS_DIR).rglob("*") if p.is_file())
    n_chunks, n_docs = build_index_from_local(doc_paths, chunk_size, overlap)
    return f"✅ Reconstruido desde S3: {n_docs} docs → {n_chunks} chunks."

def s3_upload_local_docs(bucket: str, base_prefix: str, team_folder: str):
    """Upload everything in DOCS_DIR to the team's `docs/` prefix; return a status message."""
    team_root = effective_team_prefix(base_prefix, team_folder)
    docs_prefix = team_root + "docs/"
    count = s3_sync_local_docs_to_s3(bucket, docs_prefix, str(DOCS_DIR))
    return f"☁️ Subidos {count} archivo(s) a s3://{bucket}/{docs_prefix}"

def s3_download_docs(bucket: str, base_prefix: str, team_folder: str):
    """Download the team's `docs/` prefix into DOCS_DIR; return a status message."""
    team_root = effective_team_prefix(base_prefix, team_folder)
    docs_prefix = team_root + "docs/"
    count = s3_sync_docs_to_local(bucket, docs_prefix, str(DOCS_DIR))
    return f"📥 Descargados {count} archivo(s) a {DOCS_DIR}"

## 6) Telegram: bot con polling, auto-responder y panel admin

In [None]:
#import os
#os.environ["TELEGRAM_BOT_TOKEN"] = "<TELEGRAM_BOT_TOKEN>"  # token redactado: no guardar secretos en el notebook; revocar el token que estuvo expuesto
#TELEGRAM_BOT_TOKEN = os.environ["TELEGRAM_BOT_TOKEN"]

In [7]:
# === LEGACY TELEGRAM MANAGER DISABLED: preserved for reference ===
# Everything below sits under `if False:`, so none of it executes and none of
# these names (STATE, tg_*, apply_route, poll_once, ...) are defined by this cell.
if False:
    # Shared mutable state: active S3 route, Telegram settings and known chats.
    STATE={
      "bucket": os.getenv("S3_BUCKET"),
      "base_prefix": os.getenv("S3_PREFIX"),
      "team_folder": "",
      "tg": {"token":"", "bot":"", "chat_id":"", "poll":False, "offset":None, "auto":False},
      "chats": {}  # chat_id -> {"title": str, "last_text": str}
    }

    # Call getMe; raises on HTTP errors and returns the decoded JSON body.
    def tg_get_me(token:str):
        r=requests.get(f"https://api.telegram.org/bot{token}/getMe", timeout=10); r.raise_for_status(); return r.json()

    # Long-poll getUpdates; the HTTP timeout is the poll timeout plus 5 seconds.
    def tg_get_updates(token:str, offset=None, timeout=20):
        params = {"timeout": timeout}
        if offset is not None: params["offset"] = offset
        r=requests.get(f"https://api.telegram.org/bot{token}/getUpdates", params=params, timeout=timeout+5)
        r.raise_for_status(); return r.json()

    # Send a plain-text message; raises on HTTP errors.
    def tg_send_message(token:str, chat_id:str, text:str):
        r=requests.post(f"https://api.telegram.org/bot{token}/sendMessage",
                        json={"chat_id": chat_id, "text": text}, timeout=10)
        r.raise_for_status(); return r.json()

    # Store the active route in STATE and write the docs/ and index/ marker keys.
    def apply_route(bucket, base_prefix, team_folder):
        STATE["bucket"] = (bucket or "").strip() or os.getenv("S3_BUCKET")
        STATE["base_prefix"] = norm_prefix(base_prefix or os.getenv("S3_PREFIX") or "")
        STATE["team_folder"] = (team_folder or "").strip()
        # ensure subfolders
        eff_root = effective_team_prefix(STATE["base_prefix"], STATE["team_folder"])
        s3 = s3_client_autoregion(STATE["bucket"])
        s3.put_object(Bucket=STATE["bucket"], Key=eff_root+"docs/")
        s3.put_object(Bucket=STATE["bucket"], Key=eff_root+"index/")
        return f"✔ Ruta: s3://{STATE['bucket']}/{eff_root}"

    # Save the token and resolve the bot username through getMe.
    def set_token(token):
        STATE["tg"]["token"]=token.strip()
        me = tg_get_me(STATE["tg"]["token"])
        STATE["tg"]["bot"]= "@"+(me.get("result",{}).get("username",""))
        return f"Bot: {STATE['tg']['bot']}"

    # Record the chat id of the most recent update returned by getUpdates.
    def get_chat_id():
        upd = tg_get_updates(STATE["tg"]["token"], timeout=5)
        res = upd.get("result", [])
        last = res[-1] if res else None
        if not last: return "Envía /start al bot y reintenta."
        chat = ((last.get("message") or {}).get("chat")) or {}
        if "id" in chat:
            STATE["tg"]["chat_id"] = str(chat["id"])
            title = chat.get("username") or chat.get("title") or ""
            STATE["chats"][STATE["tg"]["chat_id"]] = {"title": title, "last_text": ""}
            return f"chat_id={STATE['tg']['chat_id']} ({title})"
        return "No se encontró chat_id."

    # One line per known chat: id | title | first 60 characters of the last text.
    def list_chats():
        if not STATE["chats"]:
            return "(sin chats aún — cuando lleguen mensajes se listarán aquí)"
        lines=[]
        for cid, meta in STATE["chats"].items():
            lines.append(f"{cid} | {meta.get('title','')} | {meta.get('last_text','')[:60]}")
        return "\n".join(lines)

    # Send a message as the admin and log it; errors come back as a status string.
    def admin_send(cid: str, text: str):
        if not cid: return "Elige un chat_id."
        try:
            tg_send_message(STATE["tg"]["token"], str(cid), text or "(mensaje vacío)")
            log_event(str(cid), "admin", text or "")
            return "✅ Enviado"
        except Exception as e:
            return f"❌ Error: {e}"

    # Track and log an incoming update; when auto-reply is on, answer with rag_answer.
    def _handle_incoming(msg):
        chat = msg.get("message", {}).get("chat", {}) or {}
        cid = str(chat.get("id"))
        text = (msg.get("message", {}) or {}).get("text", "")
        title = chat.get("username") or chat.get("title") or ""
        if not cid or not text: return
        STATE["chats"].setdefault(cid, {"title": title, "last_text": ""})
        STATE["chats"][cid]["last_text"] = text
        log_event(cid, "user", text)
        if STATE["tg"]["auto"]:
            # Try the local index first; fall back to downloading it from S3.
            if not load_index_local():
                s3_download_index(STATE["bucket"], STATE["base_prefix"], STATE["team_folder"])
            ans, _src = rag_answer(text, top_k=4, temperature=0.2)
            try:
                tg_send_message(STATE["tg"]["token"], cid, ans)
                log_event(cid, "bot", ans)
            except Exception as e:
                log_event(cid, "error", f"send fail: {e}")

    # Only flips the flag and resets the offset; the caller must invoke poll_once.
    def start_polling():
        STATE["tg"]["poll"]=True
        STATE["tg"]["offset"]=None

    def stop_polling():
        STATE["tg"]["poll"]=False

    def set_auto(flag: bool):
        STATE["tg"]["auto"]=bool(flag)
        return f"Auto-responder: {'ON' if STATE['tg']['auto'] else 'OFF'}"

    # Fetch one batch of updates; the offset is advanced before each update is handled.
    def poll_once():
        if not STATE["tg"]["poll"]: return "Polling OFF"
        try:
            upd = tg_get_updates(STATE["tg"]["token"], offset=STATE["tg"]["offset"], timeout=10)
            for item in upd.get("result", []):
                STATE["tg"]["offset"] = item["update_id"] + 1
                _handle_incoming(item)
            return f"OK, updates: {len(upd.get('result', []))}"
        except Exception as e:
            return f"❌ Poll error: {e}"

In [8]:
# === TALENTO TECH — TELEGRAM v1++ (auto + alerta “Asesor/Ayuda” + intervención) ===
# - Respeta TELEGRAM_BOT_TOKEN de env/userdata o campo de la pestaña Admin
# - Polling por hilo (no bloquea la UI) y deleteWebhook
# - Auto-responder estilo v1: si hay responder() la usa; si no, rag_answer(); si no, retrieve+LLM; si no, fragmento
# - Al detectar “asesor/ayuda”:
#     * contesta al usuario
#     * avisa al ADMIN (si está configurado)
#     * pone "hold" (pausa auto) en ese chat por 120s (configurable)
# - Al enviar como admin a un chat, se pone "hold" 60s para no pisarse con el bot
# - Comandos desde el chat ADMIN:
#     /admin                         -> marca ese chat como admin
#     /auto_on <chat_id>             -> habilita auto en ese chat
#     /auto_off <chat_id>            -> deshabilita auto en ese chat
#     /say <chat_id> <mensaje...>    -> envía mensaje al usuario (y activa hold 60s)
#
# Reexpone: ui_set_token, ui_poll_toggle, ui_poll_once, ui_list_chats, ui_admin_send, ui_auto_toggle

import os, re, json, time, threading, requests
from pathlib import Path

# ------------------------- Helpers LLM (no invasivo) ----------------------------
def _tt_openai():
    try:
        from openai import OpenAI
        key = os.getenv("OPENAI_API_KEY")
        return OpenAI(api_key=key) if key else None
    except Exception:
        return None

def _tt_gemini():
    try:
        import google.generativeai as genai
        key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
        if not key: return None
        genai.configure(api_key=key)
        return genai.GenerativeModel(os.getenv("GEMINI_MODEL","gemini-1.5-flash"))
    except Exception:
        return None

def _tt_llm_answer(q, ctx):
    prov = (os.getenv("LLM_PROVIDER") or "openai").lower()
    temp = float(os.getenv("LLM_TEMPERATURE","0.2"))
    mtok = int(os.getenv("LLM_MAX_TOKENS","400"))
    sys_prompt = ("Eres un asistente que responde SOLO con la información del contexto. "
                  "Si no está en el contexto di: 'No encuentro esa información en mis archivos'. "
                  "Responde en español y añade 'Fuentes'.")

    user = f"Pregunta: {q}\n\nContexto:\n{ctx}\n\nResponde SOLO con lo anterior."
    if prov == "openai":
        cli = _tt_openai()
        if not cli: return "⚠️ Sin LLM (OPENAI_API_KEY no configurada)"
        try:
            msgs=[{"role":"system","content":sys_prompt},{"role":"user","content":user}]
            out=cli.chat.completions.create(model=os.getenv("OPENAI_MODEL","gpt-4o-mini"),
                                            messages=msgs, temperature=temp, max_tokens=mtok)
            return out.choices[0].message.content.strip()
        except Exception as e:
            return f"⚠️ Error OpenAI: {e}"
    if prov == "gemini":
        model = _tt_gemini()
        if not model: return "⚠️ Sin LLM (GOOGLE_API_KEY/GEMINI_API_KEY no configurada)"
        try:
            out = model.generate_content(f"{sys_prompt}\n\n{user}")
            return (getattr(out,"text","") or "").strip()
        except Exception as e:
            return f"⚠️ Error Gemini: {e}"
    return "⚠️ Sin LLM: muestra el fragmento más relevante."

def _tt_build_context(q):
    try:
        if 'retrieve' in globals():
            hits = retrieve(q, top_k=4)
            parts=[]
            for score, ch in hits:
                snip=(ch.text[:350]+"…") if len(ch.text)>350 else ch.text
                src=getattr(ch,'source_name','doc')
                parts.append(f"[{src} | {score:.3f}] {snip}")
            return "\n\n".join(parts)
    except Exception:
        pass
    return ""

def _tt_answer_logic(q):
    q = (q or "").strip()
    if not q:
        return "Envíame un texto y responderé con lo que haya en tus archivos."
    # 1) Tu responder()
    if 'responder' in globals():
        try: return responder(q)
        except Exception as e: return f"⚠️ Error en responder(): {e}"
    # 2) Tu rag_answer()
    if 'rag_answer' in globals():
        try:
            ans,_ = rag_answer(q, top_k=4, temperature=float(os.getenv("LLM_TEMPERATURE","0.2")))
            return ans
        except Exception as e:
            return f"⚠️ Error en rag_answer(): {e}"
    # 3) retrieve + LLM
    ctx = _tt_build_context(q) or "(sin contexto)"
    return _tt_llm_answer(q, ctx)

# ---------------------------- Telegram Manager ----------------------------------
class TTGram:
    """Telegram bot manager: polling, auto-reply, help alerts and admin intervention.

    State kept per instance:
      - `known`: chat_id -> metadata ('title', 'last_text', optional 'auto'
        override and optional 'hold_until' timestamp).
      - `global_auto`: default auto-reply switch for chats without an override.
      - `admin_chat_id`: chat allowed to issue /auto_on, /auto_off and /say.
    """

    HOLD_AFTER_ADMIN = 60      # seconds auto-reply is paused after an admin message
    HOLD_AFTER_ALERT = 120     # seconds auto-reply is paused after an "asesor/ayuda" alert
    ALERT_REGEX = re.compile(r"\b(asesor|ayuda)\b", re.I)

    def __init__(self):
        self.token = (os.getenv("TELEGRAM_BOT_TOKEN") or self._userdata_token() or "").strip()
        self.offset = None
        self.global_auto = True
        self.stop = threading.Event()
        self.thread = None
        self.known = {}          # chat_id -> {'title','last_text','auto'(opt), 'hold_until'(opt)}
        self.admin_chat_id = (os.getenv("ADMIN_CHAT_ID") or "").strip()

        self.logs = Path.cwd()/ "mini_chatbot_work" / "logs"
        self.logs.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _userdata_token():
        """Return TELEGRAM_BOT_TOKEN from Colab `userdata`, or "" when unavailable.

        `userdata.get` raises when the secret is missing or not shared, so the
        lookup is guarded to keep construction from failing without a token.
        """
        if 'userdata' not in globals():
            return ""
        try:
            return userdata.get("TELEGRAM_BOT_TOKEN") or ""
        except Exception:
            return ""

    def _redact(self, text):
        """Return *text* as a string with the bot token masked.

        Request exceptions embed the URL, and the URL embeds the token; error
        strings are shown in the UI, so they must not carry it.
        """
        text = str(text)
        return text.replace(self.token, "***") if self.token else text

    # ---------- HTTP ----------
    def _delete_webhook(self):
        """Best-effort deleteWebhook call so getUpdates polling can be used."""
        if not self.token: return
        try: requests.get(f"https://api.telegram.org/bot{self.token}/deleteWebhook", timeout=10)
        except Exception: pass

    def _me(self):
        """Return a status string with the bot username, or a (redacted) error."""
        if not self.token: return "No TELEGRAM_BOT_TOKEN configurado."
        try:
            r=requests.get(f"https://api.telegram.org/bot{self.token}/getMe", timeout=10)
            r.raise_for_status(); data=r.json()
            return "Bot: @" + (data.get("result",{}).get("username") or "(sin username)")
        except Exception as e:
            return f"Token/Bot error: {self._redact(e)}"

    def _send(self, chat_id, text):
        """Send *text* to *chat_id*; returns a status string starting with ✅ or ❌."""
        if not self.token: return "Sin token"
        try:
            r=requests.post(f"https://api.telegram.org/bot{self.token}/sendMessage",
                            json={"chat_id": str(chat_id), "text": text}, timeout=10)
            r.raise_for_status(); return "✅ Enviado"
        except Exception as e:
            return f"❌ Error: {self._redact(e)}"

    # ---------- Utils ----------
    def _log(self, cid, role, text):
        """Append a record to the chat transcript; logging failures are ignored."""
        try:
            p=self.logs/f"{cid}.jsonl"
            rec={"t": time.time(), "role": role, "text": text}
            with open(p,"a",encoding="utf-8") as f: f.write(json.dumps(rec,ensure_ascii=False)+"\n")
        except Exception:
            pass

    def _should_auto(self, cid):
        """Decide whether the bot may auto-reply in chat *cid* right now."""
        meta = self.known.get(cid, {})
        # A time-based hold wins over everything else.
        if time.time() < float(meta.get("hold_until", 0)): return False
        # Per-chat override, otherwise the global switch.
        if "auto" in meta: return bool(meta["auto"])
        return self.global_auto

    def _set_hold(self, cid, seconds):
        """Pause auto-reply in chat *cid* for *seconds* from now."""
        meta = self.known.setdefault(cid, {})
        meta["hold_until"] = time.time() + float(seconds)

    def _set_auto_chat(self, cid, flag: bool):
        """Set the per-chat auto-reply override; enabling it also lifts any hold.

        The alert sent to the admin promises that /auto_on resumes the bot, so
        turning auto on must clear a pending timed hold.
        """
        meta = self.known.setdefault(cid, {})
        meta["auto"] = bool(flag)
        if flag:
            meta.pop("hold_until", None)

    # ---------- Core ----------
    def handle_update(self, item):
        """Process one Telegram update: track it, run admin commands, alert or auto-reply."""
        msg=item.get("message") or {}
        chat=msg.get("chat") or {}
        raw_id=chat.get("id")
        txt=(msg.get("text") or "").strip()
        title=chat.get("username") or chat.get("title") or ""
        # str(None) is the truthy string "None", so test the raw id instead.
        if raw_id is None or not txt: return
        cid=str(raw_id)

        self.known.setdefault(cid, {"title": title, "last_text": ""})
        self.known[cid]["last_text"] = txt
        self._log(cid,"user",txt)

        # Admin commands, accepted only from the admin's own chat
        if cid == self.admin_chat_id:
            if txt == "/admin":
                self._send(cid, "Ya eres canal admin ✅")
                return
            if txt.startswith("/auto_on"):
                parts = txt.split()
                if len(parts)>=2:
                    self._set_auto_chat(parts[1], True)
                    self._send(cid, f"Auto ON para {parts[1]}")
                return
            if txt.startswith("/auto_off"):
                parts = txt.split()
                if len(parts)>=2:
                    self._set_auto_chat(parts[1], False)
                    self._send(cid, f"Auto OFF para {parts[1]}")
                return
            if txt.startswith("/say"):
                # /say <chat_id> <message...>
                parts = txt.split(maxsplit=2)
                if len(parts)>=3:
                    tgt, m = parts[1], parts[2]
                    # admin_send logs and sets the hold only when delivery
                    # succeeded; report the outcome back to the admin.
                    self._send(cid, self.admin_send(tgt, m))
                return

        # First chat to send /admin while no admin is configured becomes the admin
        if txt == "/admin" and not self.admin_chat_id:
            self.admin_chat_id = cid
            os.environ["ADMIN_CHAT_ID"] = cid
            self._send(cid, "Este chat queda configurado como canal admin ✅")
            return

        # Alert on "asesor/ayuda"
        if self.ALERT_REGEX.search(txt):
            # Reply, then hold so a human can step in
            ans = _tt_answer_logic(txt)
            self._send(cid, ans); self._log(cid,"bot",ans)
            self._set_hold(cid, self.HOLD_AFTER_ALERT)
            if self.admin_chat_id:
                self._send(self.admin_chat_id,
                           f"⚠️ ALERTA: {cid} pidió ayuda (“{txt}”).\n"
                           f"Usa /say {cid} <mensaje> para intervenir, o /auto_on {cid} cuando quieras reanudar el bot.")
            return

        # Auto-reply (v1 style)
        if self._should_auto(cid):
            ans = _tt_answer_logic(txt)
            self._send(cid, ans); self._log(cid,"bot",ans)

    def poll_once(self):
        """Fetch and handle one batch of updates; returns a status string."""
        if not self.token: return "Sin TELEGRAM_BOT_TOKEN"
        try:
            params={"timeout":10}
            if self.offset is not None: params["offset"]=self.offset
            r=requests.get(f"https://api.telegram.org/bot{self.token}/getUpdates",
                           params=params, timeout=15)
            r.raise_for_status(); data=r.json()
            for it in data.get("result", []):
                self.offset = it["update_id"] + 1
                self.handle_update(it)
            return f"OK, updates: {len(data.get('result', []))}"
        except Exception as e:
            return f"❌ Poll error: {self._redact(e)}"

    def _loop(self):
        """Background loop: poll, then wait about 2 s in short steps so stop is noticed quickly."""
        while not self.stop.is_set():
            self.poll_once()
            for _ in range(10):
                if self.stop.is_set(): break
                time.sleep(0.2)

    # ---------- API for the Admin tab ----------
    def set_token(self, token: str):
        """Set the bot token (an empty value reuses the current/env/userdata one).

        A leading "BOT:" label is stripped. The token is exported to the
        process environment; Colab secrets cannot be written from code, so no
        attempt is made to store it there. Returns the getMe status string.
        """
        token = (token or self.token or os.getenv('TELEGRAM_BOT_TOKEN') or self._userdata_token() or "").strip()
        if token.upper().startswith('BOT:'):
            token = token.split(':',1)[1].strip()
        self.token = token
        if token:
            os.environ['TELEGRAM_BOT_TOKEN']=token
        self._delete_webhook()
        return self._me()

    def toggle_poll(self, flag: bool):
        """Start (True) or stop (False) the background polling thread."""
        if flag:
            if self.thread and self.thread.is_alive():
                return "Auto-escuchar: ya activo."
            self.stop.clear()
            self._delete_webhook()
            self.thread = threading.Thread(target=self._loop, daemon=True)
            self.thread.start()
            return "Auto-escuchar: ON"
        else:
            self.stop.set()
            return "Auto-escuchar: OFF"

    def poll_now(self):
        """Run a single poll immediately."""
        return self.poll_once()

    def list_chats(self):
        """Return one line per known chat: id | title | first 60 characters of the last text."""
        if not self.known:
            return "(sin chats aún — envía /start al bot y pulsa 'Leer ahora')"
        return "\n".join([f"{cid} | {meta.get('title','')} | {meta.get('last_text','')[:60]}"
                          for cid,meta in self.known.items()])

    def admin_send(self, cid: str, text: str):
        """Send *text* to *cid* as the admin; on success log it and hold auto-reply."""
        out = self._send(cid, text or "(mensaje vacío)")
        if out.startswith("✅"):
            self._log(cid, "admin", text or "")
            self._set_hold(cid, self.HOLD_AFTER_ADMIN)
        return out

    def auto_toggle(self, flag: bool):
        """Set the global auto-reply switch and return its status string."""
        self.global_auto = bool(flag)
        return f"Auto-responder (global): {'ON' if self.global_auto else 'OFF'}"

_TTG = TTGram()

# Re-expose the manager's methods under the names the UI already uses.
globals().update({
    "ui_set_token": _TTG.set_token,
    "ui_poll_toggle": _TTG.toggle_poll,
    "ui_poll_once": _TTG.poll_now,
    "ui_list_chats": _TTG.list_chats,
    "ui_admin_send": _TTG.admin_send,
    "ui_auto_toggle": _TTG.auto_toggle,
})

# Only the token length is printed, never the token itself.
_token_length = len((_TTG.token or ""))
print("🔐 TELEGRAM_BOT_TOKEN len =", _token_length, "|", _TTG._me())
print("Telegram v1++ listo: auto, alerta 'Asesor/Ayuda', y control de intervención.")
# ================================================================================


🔐 TELEGRAM_BOT_TOKEN len = 46 | Bot: @bootcamps_explorador_bot
Telegram v1++ listo: auto, alerta 'Asesor/Ayuda', y control de intervención.


## 7) UI Gradio — S3/equipo, Docs, Índice, Persistencia, Preguntas, Telegram (Admin)

In [9]:
# === Restored S3 helpers & STATE (clean implementation) ===
from pathlib import Path
import os

# Local folder where documents are stored
DOCS_DIR = Path('docs')
DOCS_DIR.mkdir(parents=True, exist_ok=True)

# Global state describing the active S3 route
STATE = dict(
    bucket='',
    base_prefix='',  # must be '' or end with '/'
    team_folder='',
)

def norm_prefix(p: str) -> str:
    """Return ``p`` without surrounding whitespace and ending in '/'.

    ``None`` and blank input yield the empty string; no slash is added then.
    """
    cleaned = (p or '').strip()
    if not cleaned or cleaned.endswith('/'):
        return cleaned
    return cleaned + '/'

def effective_team_prefix(base_prefix: str, team_folder: str) -> str:
    """Build the key prefix of a team folder under the base prefix.

    Args:
        base_prefix: Course-level prefix; normalised with ``norm_prefix``.
        team_folder: Team folder name. Surrounding whitespace and slashes are
            ignored; ``None`` or blank means "no team folder".

    Returns:
        ``"<base_prefix><team_folder>/"``, or just the normalised base prefix
        when no team folder is given.
    """
    bp = norm_prefix(base_prefix)
    # A folder typed as "Daniel/" or "/Daniel" must not yield an empty "//"
    # segment in the resulting key prefix.
    tf = (team_folder or '').strip().strip('/').strip()
    return f"{bp}{tf}/" if tf else bp

def s3_client_autoregion(bucket: str):
    """Return a boto3 S3 client, bound to the bucket's region when it can be found.

    A default client is created first and used to ask for the bucket location.
    If that lookup fails for any reason, the default client is returned as-is.

    Args:
        bucket: Name of the bucket whose region should be detected.

    Returns:
        A boto3 S3 client.

    Raises:
        RuntimeError: If boto3 cannot be imported or the session/client cannot
            be created. The original exception is chained as ``__cause__``.
    """
    try:
        import boto3
        sess = boto3.session.Session()
        s3 = sess.client('s3')
        try:
            # An empty LocationConstraint means us-east-1.
            loc = s3.get_bucket_location(Bucket=bucket).get('LocationConstraint') or 'us-east-1'
            # Legacy buckets report "EU", which is not a valid region name.
            if loc == 'EU':
                loc = 'eu-west-1'
            s3 = sess.client('s3', region_name=loc)
        except Exception:
            # Best effort: keep the default client when the lookup is not possible.
            pass
        return s3
    except Exception as e:
        raise RuntimeError(f"boto3 requerido para S3: {e}") from e

def apply_route(bucket, base_prefix, team_folder):
    """Set the active S3 route in ``STATE`` and try to create docs/ and index/.

    Blank arguments fall back to the ``S3_BUCKET`` / ``S3_PREFIX`` environment
    variables. Creating the two placeholder objects is best effort: a failure
    never aborts the call, but it is appended to the returned status so the
    user is not told everything is fine when nothing was written.

    Args:
        bucket: Bucket name (may be blank).
        base_prefix: Course-level prefix (may be blank).
        team_folder: Team folder name (may be blank).

    Returns:
        A status line starting with ``"✔ Ruta: s3://..."``, followed by a
        warning line when the placeholder objects could not be created.
    """
    STATE['bucket'] = (bucket or '').strip() or os.getenv('S3_BUCKET','')
    STATE['base_prefix'] = norm_prefix(base_prefix or os.getenv('S3_PREFIX',''))
    STATE['team_folder'] = (team_folder or '').strip()

    eff_root = effective_team_prefix(STATE['base_prefix'], STATE['team_folder'])
    status = f"✔ Ruta: s3://{STATE['bucket']}/{eff_root}"
    if not STATE['bucket']:
        return status

    try:
        s3 = s3_client_autoregion(STATE['bucket'])
        # S3 has no real folders; zero-byte objects ending in '/' act as such.
        for sub in ('docs/', 'index/'):
            s3.put_object(Bucket=STATE['bucket'], Key=eff_root + sub, Body=b'')
    except Exception as e:
        # Missing credentials/permissions: keep the route, but say so.
        status += f"\n⚠️ No se pudieron crear docs/ e index/ en S3: {e}"
    return status

# ---- Utilidades S3 adicionales usadas por la UI (solo se definen si faltan) ----
def _ensure_defs():
    globals_ = globals()

    if 's3_list_immediate_folders' not in globals_:
        def s3_list_immediate_folders(bucket: str, base_prefix: str):
            """Lista subcarpetas inmediatas bajo base_prefix usando Delimiter='/'."""
            try:
                s3 = s3_client_autoregion(bucket)
                base_prefix = norm_prefix(base_prefix)
                resp = s3.list_objects_v2(Bucket=bucket, Prefix=base_prefix, Delimiter='/')
                return [cp['Prefix'][len(base_prefix):-1] for cp in resp.get('CommonPrefixes', [])]
            except Exception:
                return []
        globals_['s3_list_immediate_folders'] = s3_list_immediate_folders

    if 's3_upload_local_docs' not in globals_:
        def s3_upload_local_docs(bucket=None, base_prefix=None, team_folder=None):
            bucket = (bucket or STATE['bucket']).strip()
            base_prefix = norm_prefix(base_prefix or STATE['base_prefix'])
            team_folder = (team_folder or STATE['team_folder']).strip()
            eff_root = effective_team_prefix(base_prefix, team_folder) + 'docs/'
            try:
                s3 = s3_client_autoregion(bucket)
                n=0
                for p in DOCS_DIR.glob('*'):
                    if p.is_file():
                        key = eff_root + p.name
                        s3.upload_file(str(p), bucket, key)
                        n+=1
                return f"Subidos {n} archivos a s3://{bucket}/{eff_root}"
            except Exception as e:
                return f"⚠️ No se pudo subir a S3: {e}"
        globals_['s3_upload_local_docs'] = s3_upload_local_docs

    if 's3_download_docs' not in globals_:
        def s3_download_docs(bucket=None, base_prefix=None, team_folder=None):
            bucket = (bucket or STATE['bucket']).strip()
            base_prefix = norm_prefix(base_prefix or STATE['base_prefix'])
            team_folder = (team_folder or STATE['team_folder']).strip()
            eff_root = effective_team_prefix(base_prefix, team_folder) + 'docs/'
            try:
                s3 = s3_client_autoregion(bucket)
                DOCS_DIR.mkdir(parents=True, exist_ok=True)
                paginator = s3.get_paginator('list_objects_v2')
                n=0
                for page in paginator.paginate(Bucket=bucket, Prefix=eff_root):
                    for obj in page.get('Contents', []):
                        key = obj['Key']
                        name = key.split('/')[-1]
                        if not name: continue
                        s3.download_file(bucket, key, str(DOCS_DIR/name))
                        n+=1
                return f"Descargados {n} archivos de s3://{bucket}/{eff_root}"
            except Exception as e:
                return f"⚠️ No se pudo descargar de S3: {e}"
        globals_['s3_download_docs'] = s3_download_docs

    if 'load_index_local' not in globals_:
        def load_index_local():
            """Stub mínimo: en tu notebook original se carga el índice aquí. Devuelve True si hay docs."""
            return any(DOCS_DIR.glob('*'))
        globals_['load_index_local'] = load_index_local

    if 'build_index_from_local' not in globals_:
        def build_index_from_local(paths, chunk_size: int, overlap: int):
            """Stub: simula el conteo de docs y chunks."""
            n_docs = len([p for p in paths if Path(p).is_file()])
            n_chunks = max(1, n_docs) * max(1, (chunk_size // max(1, overlap)))
            return n_chunks, n_docs
        globals_['build_index_from_local'] = build_index_from_local

    if 's3_rebuild_from_docs' not in globals_:
        def s3_rebuild_from_docs(bucket, base_prefix, team_folder, chunk_size: int, overlap: int):
            _ = s3_download_docs(bucket, base_prefix, team_folder)
            paths = list(DOCS_DIR.glob('*'))
            n_chunks, n_docs = build_index_from_local(paths, chunk_size, overlap)
            return f"Reconstruido índice: {n_docs} docs → {n_chunks} chunks."
        globals_['s3_rebuild_from_docs'] = s3_rebuild_from_docs

    if 's3_upload_index' not in globals_:
        def s3_upload_index(bucket=None, base_prefix=None, team_folder=None):
            bucket = (bucket or STATE['bucket']).strip()
            base_prefix = norm_prefix(base_prefix or STATE['base_prefix'])
            team_folder = (team_folder or STATE['team_folder']).strip()
            eff_root = effective_team_prefix(base_prefix, team_folder) + 'index/'
            # Aquí deberías subir tus archivos de índice reales; dejamos mensaje.
            return f"(demo) Índice marcado como subido a s3://{bucket}/{eff_root}"
        globals_['s3_upload_index'] = s3_upload_index

    if 's3_download_index' not in globals_:
        def s3_download_index(bucket=None, base_prefix=None, team_folder=None):
            # Aquí deberías descargar tus archivos de índice reales; dejamos mensaje.
            return "(demo) Índice descargado (stub)."
        globals_['s3_download_index'] = s3_download_index

_ensure_defs()


In [None]:
import gradio as gr


def ui_apply_s3(bucket, base_prefix, team_folder):
    """UI callback: apply the given S3 route and return its status text."""
    status = apply_route(bucket, base_prefix, team_folder)
    return status

def ui_list_teams(bucket, base_prefix):
    """UI callback: list the folders directly under the base prefix, one per line.

    Blank inputs fall back to the ``S3_BUCKET`` / ``S3_PREFIX`` environment
    variables. A placeholder text is returned when nothing is found.
    """
    chosen_bucket = (bucket or '').strip()
    if not chosen_bucket:
        chosen_bucket = os.getenv("S3_BUCKET")
    prefix = norm_prefix(base_prefix or os.getenv("S3_PREFIX") or "")
    found = s3_list_immediate_folders(chosen_bucket, prefix)
    if not found:
        return "(sin subcarpetas)"
    return "\n".join(found)

def ui_upload(files):
    """Save files received from the upload widget into ``DOCS_DIR``.

    Each item may be a ``(name, bytes)`` pair, bare ``bytes`` (what a file
    widget in binary mode delivers, with no filename), or a path / object
    with a ``.name`` path attribute.

    Args:
        files: A list of such items, a single item, or ``None``.

    Returns:
        A status line listing the saved file names, or a notice when nothing
        was received.
    """
    import hashlib

    if not files:
        return "No se recibió ningún archivo."
    if not isinstance(files, (list, tuple)):
        files = [files]

    saved = []
    DOCS_DIR.mkdir(parents=True, exist_ok=True)
    for item in files:
        if isinstance(item, (tuple, list)) and len(item) == 2:
            name, data = item
        elif isinstance(item, (bytes, bytearray)):
            data = bytes(item)
            # No filename comes with raw bytes: derive a stable one from the
            # content. The extension is a guess (PDF magic number, else text).
            ext = ".pdf" if data.startswith(b"%PDF-") else ".txt"
            name = f"upload_{hashlib.sha1(data).hexdigest()[:8]}{ext}"
        else:
            src = Path(str(getattr(item, "name", item)))
            name, data = src.name, src.read_bytes()
        # Keep only the final path component so nothing is written outside DOCS_DIR.
        target = DOCS_DIR / Path(name).name
        with open(target, "wb") as f:
            f.write(data)
        saved.append(target.name)
    return f"Guardados local: {saved}"

def ui_sync_local_to_s3():
    """UI callback: upload the local docs to the active route stored in ``STATE``."""
    route = (STATE["bucket"], STATE["base_prefix"], STATE["team_folder"])
    return s3_upload_local_docs(*route)

def ui_sync_s3_to_local():
    """UI callback: download the docs of the active route stored in ``STATE``."""
    route = (STATE["bucket"], STATE["base_prefix"], STATE["team_folder"])
    return s3_download_docs(*route)

def ui_build_index(chunk_size, overlap):
    """UI callback: build the index from whatever is currently in ``DOCS_DIR``.

    Returns a hint when the folder is empty, otherwise a summary with the
    document and chunk counts reported by ``build_index_from_local``.
    """
    local_files = list(Path(DOCS_DIR).glob("*"))
    if len(local_files) == 0:
        return "Sube o sincroniza documentos primero."
    counts = build_index_from_local(local_files, int(chunk_size), int(overlap))
    n_chunks, n_docs = counts
    return f"✅ Índice local: {n_docs} docs → {n_chunks} chunks."

def ui_rebuild_from_s3(chunk_size, overlap):
    """UI callback: rebuild the index from the docs of the active S3 route."""
    route = (STATE["bucket"], STATE["base_prefix"], STATE["team_folder"])
    size, step = int(chunk_size), int(overlap)
    return s3_rebuild_from_docs(*route, size, step)

def ui_save_index():
    """UI callback: save the index to the active route stored in ``STATE``."""
    route = (STATE["bucket"], STATE["base_prefix"], STATE["team_folder"])
    return s3_upload_index(*route)

def ui_load_index():
    """UI callback: load the index from the active route stored in ``STATE``."""
    route = (STATE["bucket"], STATE["base_prefix"], STATE["team_folder"])
    return s3_download_index(*route)

def ui_ask(q, top_k, temp):
    """UI callback: answer ``q`` with ``rag_answer``.

    Returns a pair ``(answer, sources)`` where sources are de-duplicated,
    sorted and joined one per line. When ``load_index_local`` reports no
    index, a hint and an empty sources string are returned instead.
    """
    if not load_index_local():
        return "Primero carga/crea un índice.", ""
    answer, sources = rag_answer(q, int(top_k), float(temp))
    unique_sources = sorted(set(sources))
    return answer, "\n".join(unique_sources)


# Admin UI: one tab per workflow step (S3 route, docs, index, persistence,
# questions, Telegram). Every widget is wired to one of the ui_* callbacks.
with gr.Blocks(title="Chatbot RAG + S3 + Telegram (Admin & equipos)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("### 🧠 Cerebro — Equipo → Docs → Índice → Persistencia → Preguntas → **Telegram Admin**")

    # Tab 0: choose bucket / base prefix / team folder and apply it as the active route.
    with gr.Tab("0) S3 (Equipo)"):
        with gr.Row():
            # Initial values come from the environment; they are None when unset.
            b = gr.Textbox(value=os.getenv("S3_BUCKET"), label="S3_BUCKET")
            bp= gr.Textbox(value=os.getenv("S3_PREFIX"), label="Prefijo base del curso (p.ej. IA-Innovador/)")
            tf= gr.Textbox(label="Carpeta del equipo (p.ej. Daniel)")
        s3_state = gr.Textbox(label="Estado de la ruta activa", lines=3)
        teams_out = gr.Textbox(label="Carpetas encontradas", lines=6)
        with gr.Row():
            btn_apply = gr.Button("Aplicar ruta")
            btn_list  = gr.Button("Listar carpetas existentes (bajo prefijo base)")
        btn_apply.click(ui_apply_s3, inputs=[b,bp,tf], outputs=s3_state)
        btn_list.click(ui_list_teams, inputs=[b,bp], outputs=teams_out)

    # Tab 1: upload files locally and sync the local docs folder with S3 in either direction.
    with gr.Tab("1) Docs"):
        # NOTE(review): with type="binary" the callback receives raw bytes and no
        # filenames — confirm ui_upload handles that shape.
        files=gr.File(label="Sube PDF/TXT/CSV (opcional, puedes solo sincronizar S3→local)", file_count="multiple", type="binary")
        out=gr.Textbox(label="Estado", lines=3)
        with gr.Row():
            btn_up  = gr.Button("Subir a local")
            btn_s2l = gr.Button("S3 → Local (descargar docs)")
            btn_l2s = gr.Button("Local → S3 (subir docs)")
        btn_up.click(ui_upload, inputs=files, outputs=out)
        btn_s2l.click(ui_sync_s3_to_local, outputs=out)
        btn_l2s.click(ui_sync_local_to_s3, outputs=out)

    # Tab 2: build the index from local docs, or re-download docs from S3 and rebuild.
    with gr.Tab("2) Índice"):
        out2=gr.Textbox(label="Estado", lines=4)
        with gr.Row():
            cs=gr.Number(value=800, precision=0, label="chunk_size")
            ov=gr.Number(value=150, precision=0, label="overlap")
        with gr.Row():
            btn_build = gr.Button("Construir índice desde LOCAL")
            btn_reb_s3= gr.Button("Reconstruir índice desde DOCS en S3")
        btn_build.click(ui_build_index, inputs=[cs,ov], outputs=out2)
        btn_reb_s3.click(ui_rebuild_from_s3, inputs=[cs,ov], outputs=out2)

    # Tab 3: save the index to S3 / load it back.
    with gr.Tab("3) Persistencia"):
        out3=gr.Textbox(label="Estado", lines=4)
        with gr.Row():
            btn_save = gr.Button("⬆️ Guardar índice en S3")
            btn_load = gr.Button("⬇️ Cargar índice desde S3")
        btn_save.click(ui_save_index, outputs=out3)
        btn_load.click(ui_load_index, outputs=out3)
        gr.Markdown("Se intenta en `.../<equipo>/` y en `.../<equipo>/index/`.")

    # Tab 4: ask a question; the answer and its sources are shown separately.
    with gr.Tab("4) Preguntar"):
        q=gr.Textbox(label="Pregunta")
        with gr.Row():
            tk=gr.Slider(value=4, minimum=1, maximum=10, step=1, label="top_k")
            tp=gr.Slider(value=0.2, minimum=0.0, maximum=1.2, step=0.1, label="temperature")
        ans=gr.Markdown("Respuesta")
        src=gr.Textbox(label="Fuentes", lines=4)
        btn_q=gr.Button("Consultar")
        # The function object bound to ui_ask right now is what gets registered;
        # rebinding the name ui_ask later does not change this handler.
        btn_q.click(ui_ask, inputs=[q, tk, tp], outputs=[ans, src])

    # Tab 5: Telegram admin console (token, auto-responder, polling, chats, manual send).
    with gr.Tab("5) Telegram (Admin)"):
        with gr.Row():
            tok=gr.Textbox(label="BOT TOKEN", type="password", value=os.getenv("TELEGRAM_BOT_TOKEN",""))
            set_out=gr.Textbox(label="Bot", interactive=False)
            btn_tok=gr.Button("Guardar token / Ver bot")
            btn_detect=gr.Button("Usar token existente")
            btn_tok.click(ui_set_token, inputs=tok, outputs=set_out)
            # An empty string is passed so the callback resolves the token itself.
            btn_detect.click(lambda: ui_set_token(""), outputs=set_out)
        with gr.Row():
            auto=gr.Checkbox(label="Auto-responder (bot responde solo con RAG+LLM)", value=False)
            auto_out=gr.Textbox(label="Estado", interactive=False)
            # The lambda looks ui_auto_toggle up on every event, so a later
            # rebinding of that name is picked up.
            auto.change(lambda v: ui_auto_toggle(v), inputs=auto, outputs=auto_out)
        with gr.Row():
            poll=gr.Checkbox(label="Polling (escuchar mensajes)", value=False)
            poll_out=gr.Textbox(label="Estado", interactive=False)
            poll.change(lambda v: ui_poll_toggle(v), inputs=poll, outputs=poll_out)
            step=gr.Button("Forzar lectura ahora (poll once)")
            step_out=gr.Textbox(label="Resultado", interactive=False)
            step.click(ui_poll_once, outputs=step_out)
        with gr.Row():
            chat_list=gr.Button("Listar chats conocidos")
            list_out=gr.Textbox(label="chats", lines=6)
            chat_list.click(ui_list_chats, outputs=list_out)
        with gr.Row():
            sel_cid=gr.Textbox(label="Enviar a chat_id (pega uno de la lista)")
            msg=gr.Textbox(label="Mensaje del admin", value="Hola, soy soporte. ¿En qué te ayudo?")
            send_out=gr.Textbox(label="Resultado", interactive=False)
            send=gr.Button("Enviar como admin")
            send.click(ui_admin_send, inputs=[sel_cid, msg], outputs=send_out)

# NOTE(review): share=True publishes a public URL and no auth= is passed, so
# anyone holding the link reaches the admin controls — confirm this is intended.
demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ba33c77f1591e59c5d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# === PARCHE ÚNICO: handoff "Asesor/Ayuda", sin "Fuentes" en Telegram, espejo a admin y reuso de token ===
import os, re, requests, time, json
from pathlib import Path

# 1) LLM: NO pedir "Fuentes" dentro del texto y permitir activarlo opcionalmente para la UI
def _tt_openai():
    try:
        from openai import OpenAI
        key = os.getenv("OPENAI_API_KEY")
        return OpenAI(api_key=key) if key else None
    except Exception:
        return None

def _tt_gemini():
    try:
        import google.generativeai as genai
        key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
        if not key:
            return None
        genai.configure(api_key=key)
        return genai.GenerativeModel(os.getenv("GEMINI_MODEL","gemini-1.5-flash"))
    except Exception:
        return None

def _tt_llm_answer(q, ctx, with_sources=False):
    prov = (os.getenv("LLM_PROVIDER") or "openai").lower()
    temp = float(os.getenv("LLM_TEMPERATURE","0.2"))
    mtok = int(os.getenv("LLM_MAX_TOKENS","400"))
    sys_prompt = ("Eres un asistente que responde SOLO con la información del contexto. "
                  "Si no está en el contexto di: 'No encuentro esa información en mis archivos'. "
                  "Responde en español de forma clara y breve.")
    user = f"Pregunta: {q}\n\nContexto:\n{ctx}\n\nResponde SOLO con lo anterior."

    if prov == "openai":
        cli = _tt_openai()
        if not cli:
            return "⚠️ Sin LLM (OPENAI_API_KEY no configurada)"
        out = cli.chat.completions.create(
            model=os.getenv("OPENAI_MODEL","gpt-4o-mini"),
            messages=[{"role":"system","content":sys_prompt},
                      {"role":"user","content":user}],
            temperature=temp, max_tokens=mtok
        )
        txt = out.choices[0].message.content.strip()
    elif prov == "gemini":
        model = _tt_gemini()
        if not model:
            return "⚠️ Sin LLM (GOOGLE_API_KEY/GEMINI_API_KEY no configurada)"
        out = model.generate_content(f"{sys_prompt}\n\n{user}")
        txt = (getattr(out,"text","") or "").strip()
    else:
        txt = "⚠️ Sin LLM disponible."

    if with_sources and ctx:
        # Añade lista de fuentes al final SOLO si se pide explícitamente (para la UI)
        fuentes = []
        for line in ctx.splitlines():
            if line.startswith('[') and ']' in line:
                tag = line.split(']')[0].strip('[]')
                name = tag.split('|')[0].strip()
                if name and name not in fuentes:
                    fuentes.append(name)
        if fuentes:
            txt = txt + "\n\nFuentes: " + ", ".join(fuentes)

    # Limpieza por si el modelo inventa una sección de "Fuentes:"
    if not with_sources:
        txt = re.sub(r"\n\s*Fuentes\s*:.*$", "", txt, flags=re.S)
    return txt

def _tt_build_context(q):
    try:
        if 'retrieve' in globals():
            hits = retrieve(q, top_k=4)
            parts=[]
            for score, ch in hits:
                snip=(ch.text[:350]+"…") if len(ch.text)>350 else ch.text
                src=getattr(ch,'source_name','doc')
                parts.append(f"[{src} | {score:.3f}] {snip}")
            return "\n\n".join(parts)
    except Exception:
        pass
    return ""

def _tt_answer_logic(q, for_telegram=False):
    q = (q or "").strip()
    if not q:
        return "Envíame un texto y responderé con lo que haya en tus archivos."

    # 1) Tu función personalizada (si existe)
    if 'responder' in globals():
        try:
            ans = responder(q)
            # por si responder() puso "Fuentes" adentro:
            if for_telegram:
                ans = re.sub(r"\n\s*Fuentes\s*:.*$", "", ans, flags=re.S)
            return ans
        except Exception as e:
            return f"⚠️ Error en responder(): {e}"

    # 2) Tu rag_answer() (si existe)
    if 'rag_answer' in globals():
        try:
            ans, _fuentes = rag_answer(q, top_k=4, temperature=float(os.getenv("LLM_TEMPERATURE","0.2")))
            if for_telegram:
                ans = re.sub(r"\n\s*Fuentes\s*:.*$", "", ans, flags=re.S)
            return ans
        except Exception as e:
            return f"⚠️ Error en rag_answer(): {e}"

    # 3) retrieve + LLM
    ctx = _tt_build_context(q) or "(sin contexto)"
    return _tt_llm_answer(q, ctx, with_sources=not for_telegram)

# Publish both helpers under their module-level names.
globals().update(
    _tt_llm_answer=_tt_llm_answer,
    _tt_answer_logic=_tt_answer_logic,
)

# 2) Monkey-patch de TTGram: handoff “Asesor/Ayuda” sin LLM, espejo al admin, reuso de token, sin fuentes en Telegram
if 'TTGram' in globals():
    TT = globals()['TTGram']

    def _tg_set_token(self, token: str):
        """Store the bot token and return a one-line description of the bot.

        The token is taken from the first non-empty source: the ``token``
        argument, the instance's current token, the ``TELEGRAM_BOT_TOKEN``
        environment variable, or ``userdata`` (only when that name exists in
        this namespace). A leading ``BOT:`` label is removed.

        Returns:
            ``"Bot: @<username>"`` on success, otherwise a message starting
            with ``"Token/Bot error:"``.
        """
        token = token or getattr(self, 'token', '') or os.getenv('TELEGRAM_BOT_TOKEN')
        if not token and 'userdata' in globals():
            # The secrets lookup may fail or yield nothing; treat both as "no token".
            try:
                token = userdata.get('TELEGRAM_BOT_TOKEN')
            except Exception:
                token = ''
        token = (token or '').strip()
        if token.upper().startswith("BOT:"):
            token = token.split(":",1)[1].strip()
        self.token = token
        if not token:
            # Without a token every API call below would only produce an HTTP error.
            return "Token/Bot error: token vacío (configura TELEGRAM_BOT_TOKEN)"

        os.environ["TELEGRAM_BOT_TOKEN"] = token
        if 'userdata' in globals():
            # NOTE(review): userdata may not support item assignment; the failure is ignored.
            try: userdata["TELEGRAM_BOT_TOKEN"]=token
            except Exception: pass
        # Remove any webhook before polling; failures here are not fatal.
        try:
            requests.get(f"https://api.telegram.org/bot{token}/deleteWebhook", timeout=10)
        except Exception:
            pass
        # getMe validates the token and yields the bot username.
        try:
            r=requests.get(f"https://api.telegram.org/bot{token}/getMe", timeout=10)
            r.raise_for_status()
            data=r.json()
            return "Bot: @" + (data.get("result",{}).get("username") or "(sin username)")
        except Exception as e:
            return f"Token/Bot error: {e}"

    def _tg_handle_update(self, item):
        """Process one Telegram update: log it, run admin commands, hand off or auto-reply.

        Updates without a chat id or without text are ignored. Messages from
        the admin chat may carry ``/admin``, ``/auto_on <cid>``,
        ``/auto_off <cid>`` or ``/say <cid> <text>``. A message containing
        "asesor" or "ayuda" triggers the human hand-off: a fixed reply, a hold
        on that chat and an alert to the admin, with no LLM call. Otherwise,
        unless the chat is on hold or has auto-reply disabled, the answer from
        ``_tt_answer_logic`` is sent and mirrored to the admin chat.
        """
        msg=item.get("message") or {}
        chat=msg.get("chat") or {}
        chat_id=chat.get("id")
        # str(None) would give the truthy text "None" and defeat the check below.
        cid="" if chat_id is None else str(chat_id)
        txt=(msg.get("text") or "").strip()
        title=chat.get("username") or chat.get("title") or ""
        if not cid or not txt:
            return

        self.known.setdefault(cid, {"title": title, "last_text": ""})
        self.known[cid]["last_text"] = txt
        # Transcript: one JSON line per message in mini_chatbot_work/logs/<cid>.jsonl
        try:
            p=(Path.cwd()/ "mini_chatbot_work" / "logs"); p.mkdir(parents=True, exist_ok=True)
            with open(p/f"{cid}.jsonl","a",encoding="utf-8") as f:
                f.write(json.dumps({"t":time.time(),"role":"user","text":txt}, ensure_ascii=False)+"\n")
        except Exception:
            pass

        # Admin commands. A recognised command always ends processing, even when
        # its arguments are missing, so it is never passed on as a question.
        if cid == getattr(self,'admin_chat_id',''):
            if txt == "/admin":
                self._send(cid, "Ya eres canal admin ✅"); return
            if txt.startswith("/auto_on"):
                parts=txt.split()
                if len(parts)>=2:
                    self.known.setdefault(parts[1],{})["auto"]=True
                    self._send(cid, f"Auto ON para {parts[1]}")
                return
            if txt.startswith("/auto_off"):
                parts=txt.split()
                if len(parts)>=2:
                    self.known.setdefault(parts[1],{})["auto"]=False
                    self._send(cid, f"Auto OFF para {parts[1]}")
                return
            if txt.startswith("/say"):
                parts=txt.split(maxsplit=2)
                if len(parts)>=3:
                    tgt, m = parts[1], parts[2]
                    self._send(tgt, m)
                    # Hold the bot for that chat so it does not talk over the admin.
                    self.known.setdefault(tgt,{})["hold_until"] = time.time() + float(getattr(self,'HOLD_AFTER_ADMIN',60))
                    # No mirror needed here: the admin wrote the message.
                return

        # The first chat that sends /admin becomes the admin channel.
        if txt == "/admin" and not getattr(self,'admin_chat_id',''):
            self.admin_chat_id = cid
            os.environ["ADMIN_CHAT_ID"] = cid
            self._send(cid, "Este chat queda configurado como canal admin ✅")
            return

        # Human hand-off on "asesor"/"ayuda" — the LLM is NOT used here.
        if re.search(r"\b(asesor|ayuda)\b", txt, re.I):
            handoff = ("He activado soporte humano. Un asesor se unirá en breve. "
                       "También puedes seguir escribiendo y lo revisaré.")
            self._send(cid, handoff)
            # Pause auto-reply for this chat.
            self.known.setdefault(cid,{})["hold_until"] = time.time() + float(getattr(self,'HOLD_AFTER_ALERT',120))
            # Alert the admin.
            if getattr(self,'admin_chat_id',''):
                self._send(self.admin_chat_id,
                           f"⚠️ ALERTA: {cid} pidió ayuda (“{txt}”). Usa /say {cid} <mensaje> o /auto_on {cid}.")
            return

        # Auto-reply (no sources on Telegram); respects the hold and the per-chat switch.
        hold_until = float(self.known.get(cid,{}).get("hold_until", 0))
        if time.time() < hold_until:
            return
        auto = self.known.get(cid,{}).get("auto", getattr(self,'global_auto', True))
        if auto:
            ans = _tt_answer_logic(txt, for_telegram=True)
            self._send(cid, ans)
            # Mirror to the admin so they can see what the bot answered.
            if getattr(self,'admin_chat_id','') and getattr(self,'MIRROR_TO_ADMIN', True):
                try:
                    self._send(self.admin_chat_id, f"[BOT→{cid}] {ans}")
                except Exception:
                    pass

    # Apply the monkey-patch.
    TT.set_token = _tg_set_token
    TT.handle_update = _tg_handle_update
    # Feature flag read by the handler above.
    TT.MIRROR_TO_ADMIN = True

    # Rebind ui_* to the current instance, creating one when needed.
    _TTG = globals().get('_TTG')
    if _TTG is None or _TTG.__class__.__name__ != 'TTGram':
        _TTG = TT()
        globals()['_TTG'] = _TTG
    globals()['ui_set_token']   = _TTG.set_token
    globals()['ui_poll_toggle'] = getattr(_TTG, 'toggle_poll', lambda v: "No toggle_poll")
    globals()['ui_poll_once']   = getattr(_TTG, 'poll_now',   lambda : "No poll_now")
    globals()['ui_list_chats']  = getattr(_TTG, 'list_chats', lambda : "(sin función)")
    globals()['ui_admin_send']  = getattr(_TTG, 'admin_send', lambda cid, m: "No admin_send")
    globals()['ui_auto_toggle'] = getattr(_TTG, 'auto_toggle',lambda v: "No auto_toggle")
    print("✅ TTGram parcheado (handoff + espejo admin + reuso token + sin fuentes en Telegram).")
else:
    print("⚠️ TTGram no está definido todavía. Ejecuta tu celda de Telegram y luego vuelve a correr esta celda.")

# 3) UI: si existe ui_ask(), evitar que “Fuentes” aparezca dentro del texto de respuesta (solo en el cuadro de fuentes)
if 'ui_ask' in globals():
    def _tt_wrap_ui_ask(inner):
        """Return a ``ui_ask`` replacement that strips "Fuentes:" from the answer text."""
        def ui_ask_clean(q, top_k, temp):
            ans, fuentes = inner(q, top_k, temp)
            ans = re.sub(r"\n\s*Fuentes\s*:.*$", "", ans, flags=re.S)
            return ans, fuentes
        # Marker used to recognise an already wrapped function.
        ui_ask_clean._tt_wrapped = inner
        return ui_ask_clean

    # Wrap only once. The wrapped function is captured in a closure rather than
    # read from a global at call time: re-running this cell would otherwise
    # point that global at the wrapper itself and recurse without end.
    if not hasattr(ui_ask, '_tt_wrapped'):
        _orig_ui_ask = ui_ask
        globals()['ui_ask'] = _tt_wrap_ui_ask(_orig_ui_ask)
    print("✅ ui_ask ajustado para no duplicar 'Fuentes' dentro de la respuesta.")
else:
    print("ℹ️ ui_ask no está (o viene después). Si ves 'Fuentes' duplicado, ejecuta esta celda al final y vuelve a intentar.")

print("Parche aplicado. Si ya tienes token en env/userdata, en la pestaña 5 pulsa 'Usar token existente'.")


✅ TTGram parcheado (handoff + espejo admin + reuso token + sin fuentes en Telegram).
✅ ui_ask ajustado para no duplicar 'Fuentes' dentro de la respuesta.
Parche aplicado. Si ya tienes token en env/userdata, en la pestaña 5 pulsa 'Usar token existente'.


In [None]:
# === CUSTOM PATCH: actualizar prompt y manejo de asesor/ayuda ===
# Ajuste del SYS_PROMPT para evitar que el modelo añada 'Fuentes' automáticamente
# Plain assignment: building a string literal cannot fail, so no guard is needed.
SYS_PROMPT = (
    "Eres un asistente para preguntas y respuestas basado en archivos cargados por el usuario. "
    "Responde SOLO con la información del contexto. "
    "Si la respuesta no está en el contexto di: 'No encuentro esa información en mis archivos'. "
    "Responde en español de forma clara y breve. No incluyas fuentes en el texto."
)

# Lógica de respuesta unificada: controla 'Fuentes' y usa temperatura de env

def _tt_answer_logic(q, for_telegram=False):
    q = (q or "").strip()
    if not q:
        return "Envíame un texto y responderé con lo que haya en tus archivos."
    # 1) función responder() personalizada del usuario
    if 'responder' in globals():
        try:
            ans = responder(q)
            if for_telegram:
                import re
                ans = re.sub(r"
\s*Fuentes\s*:.*$", "", ans, flags=re.S)
            return ans
        except Exception as e:
            return f"⚠️ Error en responder(): {e}"
    # 2) función rag_answer() personalizada
    if 'rag_answer' in globals():
        try:
            temp = float(os.getenv('LLM_TEMPERATURE', '0.2'))
            ans, fuentes = rag_answer(q, top_k=4, temperature=temp)
            if for_telegram:
                import re
                ans = re.sub(r"
\s*Fuentes\s*:.*$", "", ans, flags=re.S)
            return ans
        except Exception as e:
            return f"⚠️ Error en rag_answer(): {e}"
    # 3) Búsqueda con retrieve + LLM
    try:
        ctx = _tt_build_context(q) or "(sin contexto)"
    except Exception:
        ctx = "(sin contexto)"
    return _tt_llm_answer(q, ctx, with_sources=not for_telegram)

# Parchear manejo de mensajes para 'asesor' o 'ayuda': no usar LLM, avisar admin y pausar
try:
    def patched_handle_update(self, item):
        """Process one Telegram update with human hand-off support.

        Updates without a chat id or without text are ignored. Messages from
        the admin chat may carry ``/admin``, ``/auto_on <cid>``,
        ``/auto_off <cid>`` or ``/say <cid> <text>``; a recognised command
        always ends processing. A message matching the alert pattern
        (``self.ALERT_REGEX``, or "asesor"/"ayuda" when that attribute is
        missing) gets a fixed reply, a hold via ``_set_hold`` and an alert to
        the admin, with no LLM call. Otherwise, when ``_should_auto`` allows
        it, the answer from ``_tt_answer_logic`` is sent, logged and mirrored
        to the admin chat.
        """
        import re as _re

        msg = item.get("message") or {}
        chat = msg.get("chat") or {}
        chat_id = chat.get("id")
        # str(None) would give the truthy text "None" and defeat the check below.
        cid = "" if chat_id is None else str(chat_id)
        txt = (msg.get("text") or "").strip()
        title = chat.get("username") or chat.get("title") or ""
        if not cid or not txt:
            return
        # Update the chat registry
        self.known.setdefault(cid, {"title": title, "last_text": ""})
        self.known[cid]["last_text"] = txt
        try:
            self._log(cid, "user", txt)
        except Exception:
            pass
        # Admin commands
        if cid == getattr(self, 'admin_chat_id',''):
            if txt == "/admin":
                self._send(cid, "Ya eres canal admin ✅")
                return
            if txt.startswith("/auto_on"):
                parts = txt.split()
                if len(parts)>=2:
                    self._set_auto_chat(parts[1], True)
                    self._send(cid, f"Auto ON para {parts[1]}")
                return
            if txt.startswith("/auto_off"):
                parts = txt.split()
                if len(parts)>=2:
                    self._set_auto_chat(parts[1], False)
                    self._send(cid, f"Auto OFF para {parts[1]}")
                return
            if txt.startswith("/say"):
                parts = txt.split(maxsplit=2)
                if len(parts)>=3:
                    tgt, m = parts[1], parts[2]
                    self._send(tgt, m)
                    try:
                        self._log(tgt, "admin", m)
                    except Exception:
                        pass
                    self._set_hold(tgt, getattr(self,'HOLD_AFTER_ADMIN',60))
                return
        # Configure the admin channel when none is set yet
        if txt == "/admin" and not getattr(self, 'admin_chat_id',''):
            self.admin_chat_id = cid
            os.environ["ADMIN_CHAT_ID"] = cid
            self._send(cid, "Este chat queda configurado como canal admin ✅")
            return
        # Hand-off keywords; fall back to "asesor"/"ayuda" when the instance
        # carries no pattern of its own.
        alert_regex = getattr(self, 'ALERT_REGEX', None) or _re.compile(r"\b(asesor|ayuda)\b", _re.I)
        if alert_regex.search(txt):
            hand_msg = ("He activado soporte humano. Un asesor se unirá en breve. "
                        "También puedes seguir escribiendo y lo revisaré.")
            self._send(cid, hand_msg)
            try:
                self._log(cid, "bot", hand_msg)
            except Exception:
                pass
            self._set_hold(cid, getattr(self,'HOLD_AFTER_ALERT',120))
            # Alert the admin. Single-quoted f-string so the user's text can
            # be wrapped in double quotes; the line break is an escaped \n.
            if getattr(self,'admin_chat_id',''):
                self._send(self.admin_chat_id,
                           f'⚠️ ALERTA: {cid} pidió ayuda ("{txt}").\n'
                           f"Usa /say {cid} <mensaje> para intervenir, o /auto_on {cid} cuando quieras reanudar el bot.")
            return
        # Automatic reply
        if self._should_auto(cid):
            ans = _tt_answer_logic(txt, for_telegram=True)
            # Strip any sources section
            try:
                ans = _re.sub(r"\n\s*Fuentes\s*:.*$", "", ans, flags=_re.S)
            except Exception:
                pass
            self._send(cid, ans)
            try:
                self._log(cid, "bot", ans)
            except Exception:
                pass
            # Mirror to the admin, honouring the MIRROR_TO_ADMIN flag when present
            if getattr(self,'admin_chat_id','') and getattr(self,'MIRROR_TO_ADMIN', True):
                self._send(self.admin_chat_id, f"[BOT→{cid}] {ans}")
    if 'TTGram' in globals():
        TTGram.handle_update = patched_handle_update
except Exception as e:
    print('Error al parchear manejo de mensajes:', e)
