In [None]:
pip install langchain langchain-community langchain-google-genai langchain-huggingface langchain-chroma sentence-transformers beautifulsoup4 rank_bm25 python-dotenv

In [None]:
import os
import getpass
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage, HumanMessage
from langchain.agents import create_agent
from langchain_google_genai import ChatGoogleGenerativeAI

In [6]:
# Ask for the Gemini API key only when the environment does not already provide one.
# The check is on the VALUE, not on key membership: a variable that is exported but
# empty (e.g. `GOOGLE_API_KEY=` in a .env file) is treated as missing instead of
# silently skipping the prompt.
if not os.environ.get("GOOGLE_API_KEY"):
    # getpass keeps the key out of the notebook's visible input/output.
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Ingresa la API Key de Gemini: ")

Ingresa la API Key de Gemini: ··········


In [22]:
# =========================================================
# 1. HYBRID RETRIEVER CON RRF (Reciprocal Rank Fusion)
# =========================================================
class HybridRetriever:
    """Combine a vector retriever and a BM25 retriever with Reciprocal Rank Fusion.

    For every ranking, each distinct document receives ``1 / (rrf_k + rank)``
    where ``rank`` is its 1-based position in that ranking. The contributions
    from both rankings are summed per document and the ``k`` best documents are
    returned. Documents are identified by their ``page_content`` text.
    """

    def __init__(self, vector_retriever, bm25_retriever, k=3, rrf_k=60):
        """Store the two retrievers and the fusion parameters.

        Args:
            vector_retriever: object whose ``invoke(query)`` returns a ranked
                sequence of documents exposing ``page_content``.
            bm25_retriever: second retriever with the same ``invoke`` contract.
            k: number of fused documents returned by ``invoke``.
            rrf_k: smoothing constant added to the rank in the RRF formula.
        """
        self.vector = vector_retriever
        self.bm25 = bm25_retriever
        self.k = k
        self.rrf_k = rrf_k

    @staticmethod
    def _dedupe(docs):
        """Return ``docs`` without repeated ``page_content``, keeping first occurrences in order."""
        first_seen = {}
        for doc in docs:
            first_seen.setdefault(doc.page_content, doc)
        return list(first_seen.values())

    def invoke(self, query: str):
        """Run both retrievers on ``query`` and return the top ``k`` fused documents.

        Args:
            query: search text forwarded unchanged to both retrievers.

        Returns:
            A list of at most ``k`` documents ordered by descending RRF score.
            Ties keep the order in which documents were first encountered
            (vector ranking first, then BM25). When the same text is returned
            by both retrievers, the vector retriever's document object is used.
        """
        # Each ranking is de-duplicated first: a text repeated inside one ranking
        # must contribute a single vote, otherwise it would be scored once per
        # occurrence and could outrank documents that both retrievers agree on.
        rankings = (
            self._dedupe(self.vector.invoke(query)),
            self._dedupe(self.bm25.invoke(query)),
        )

        scores = {}          # page_content -> accumulated RRF score
        doc_by_content = {}  # page_content -> first document object seen with that text

        for ranking in rankings:
            for rank, doc in enumerate(ranking, start=1):
                key = doc.page_content
                scores[key] = scores.get(key, 0.0) + 1.0 / (self.rrf_k + rank)
                doc_by_content.setdefault(key, doc)

        # sorted() is stable, so equal scores preserve first-seen order.
        best_first = sorted(scores, key=scores.get, reverse=True)
        return [doc_by_content[key] for key in best_first[:self.k]]

In [23]:
# =========================================================
# 2. MOTOR DE CONOCIMIENTO (Con Filtro de Wikipedia)
# =========================================================
class KnowledgeEngine:
    """Retrieval stack built from a single web page.

    Constructing an instance creates the embedding model and the Gemini chat
    model, then downloads ``url`` and indexes it via ``load_and_process``.

    Attributes:
        embeddings: ``HuggingFaceEmbeddings`` used to embed chunks for Chroma.
        llm: ``ChatGoogleGenerativeAI`` chat model (temperature 0).
        verify_ssl: whether TLS certificates are verified when fetching pages.
        splits: chunked documents produced by the text splitter.
        vector_retriever: Chroma retriever returning 10 candidates.
        bm25_retriever: BM25 retriever returning 10 candidates.
        hybrid: ``HybridRetriever`` fusing both and returning 4 documents.
    """

    def __init__(self, url: str, verify_ssl: bool = True):
        """Create the models and index the page at ``url``.

        Args:
            url: address of the page to download and index.
            verify_ssl: verify the server's TLS certificate when downloading.
                Pass ``False`` only in environments where verification cannot
                work (e.g. an intercepting proxy); it exposes the download to
                tampering.
        """
        self.verify_ssl = verify_ssl

        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        self.llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            temperature=0
        )

        self.load_and_process(url)

    def load_and_process(self, url: str):
        """Download ``url``, split it into chunks and (re)build the retrievers.

        Sets ``self.splits``, ``self.vector_retriever``, ``self.bm25_retriever``
        and ``self.hybrid`` on the instance.

        Args:
            url: address of the page to fetch with ``WebBaseLoader``.
        """
        # No ``parse_only`` strainer is configured, so the loader parses the
        # whole page rather than only the article body.
        loader = WebBaseLoader(web_path=url)
        # Forwarded to the HTTP request; controls TLS certificate verification.
        loader.requests_kwargs = {"verify": self.verify_ssl}
        docs = loader.load()

        # Keep only the text before the first "Véase también" ("See also").
        # NOTE(review): the guard looks for the literal CSS class name inside the
        # extracted page text; if the loader strips markup/styles it may never be
        # true and the trimming would be skipped — confirm against a real page.
        for doc in docs:
            if "mw-parser-output" in doc.page_content:
                doc.page_content = doc.page_content.split("Véase también")[0]

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

        self.splits = splitter.split_documents(docs)

        vectorstore = Chroma.from_documents(
            documents=self.splits,
            embedding=self.embeddings
        )

        # Both base retrievers fetch 10 candidates so the rank fusion has more
        # documents to choose from than the 4 it finally returns.
        self.vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

        self.bm25_retriever = BM25Retriever.from_documents(self.splits)
        self.bm25_retriever.k = 10

        self.hybrid = HybridRetriever(
            self.vector_retriever,
            self.bm25_retriever,
            k=4
        )

In [None]:
# =========================================================
# 3. HERRAMIENTA Y AGENTE
# =========================================================
# Instantiating the engine immediately downloads the page and builds the retrievers.
engine = KnowledgeEngine(url="https://es.wikipedia.org/wiki/Toxina")

# NOTE: LangChain's @tool uses the function docstring as the tool description
# given to the model, so the docstring is runtime text and is kept verbatim.
@tool
def research_tool(query: str):
    """Busca información precisa en la base de conocimiento."""
    # Run the hybrid search and return the chunk texts separated by blank lines.
    hits = engine.hybrid.invoke(query)
    passages = (hit.page_content for hit in hits)
    return "\n\n".join(passages)

# Build the agent around the engine's chat model with the single research tool.
# The system prompt is passed as a plain string: `create_agent` documents a string
# prompt (converted to a system message internally), whereas passing a
# `SystemMessage` instance is, to my knowledge, only accepted by newer releases.
# The prompt text itself is unchanged.
agent = create_agent(
    model=engine.llm,
    tools=[research_tool],
    system_prompt="Eres un Analista de Datos Senior. Usa la herramienta para obtener hechos. ",
)

In [None]:
# =========================================================
# 4. EJECUCIÓN
# =========================================================
# Send one user question to the agent; the conversation is supplied under "messages".
result = agent.invoke(
    {"messages": [HumanMessage(content="Por quién fue introducido el término toxina?")]}
)

# Print every message of the returned conversation, in order.
for message in result["messages"]:
    message.pretty_print()