<a href="https://colab.research.google.com/github/durfred/my-first-binder/blob/main/Agentic_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain sentence-transformers pypdf

In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=off" pip install llama-cpp-python==0.2.77 --force-reinstall --no-cache-dir --no-deps --quiet

# Download the quantized Gemma 2B instruction-tuned model in GGUF format (Q4_K_M, CPU-friendly)
!wget https://huggingface.co/codegood/gemma-2b-it-Q4_K_M-GGUF/resolve/main/gemma-2b-it.Q4_K_M.gguf


In [None]:
!pip install faiss-cpu



In [None]:
# Agentic RAG with Local GGUF LLM (Gemma 2B, CPU-only)
# ✅ Compatible with Google Colab (CPU runtime)

import os
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool, AgentType
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp

# --- Load local Gemma 2B model ---
# Wraps the quantized GGUF file fetched by the wget cell above in a LangChain
# LLM. The path is relative, so the notebook must run from the directory the
# model was downloaded into.
llm = LlamaCpp(
    model_path="gemma-2b-it.Q4_K_M.gguf",
    n_gpu_layers=0,  # keep every layer on the CPU (CPU-only runtime)
    n_ctx=2048,  # context window, in tokens
    n_threads=4,  # NOTE(review): hard-coded; confirm it suits the runtime's core count
    max_tokens=256,  # upper bound on tokens generated per call
    temperature=0.3,
    f16_kv=True,
    verbose=False
)

# --- Sample article text for vector store ---
# Small in-memory corpus: this is the only knowledge source that gets chunked,
# embedded and indexed further down in this script.
article_text = """
Agentic RAG enhances standard retrieval-augmented generation (RAG) by integrating autonomous agents.
These agents can perform subtasks such as query reformulation, source evaluation, and evidence-based planning.
Instead of a single monolithic response, agents cooperate through structured interaction to produce better answers.
For example, one agent may focus on generating follow-up questions, while another ranks retrieved documents.
This modularity improves interpretability and allows for fine-grained evaluation and debugging of each agent's contribution.
"""


# --- Text splitting and embeddings ---
# Split the article into overlapping chunks (sizes are in characters) and wrap
# each chunk in a Document.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.create_documents([article_text])

# Embed the chunks with a sentence-transformers model and build a FAISS index,
# then persist it to the local "faiss_index_gemma" directory.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("faiss_index_gemma")

# --- Load vectorstore ---
# Reload the index that was just written, replacing the in-memory instance.
# SECURITY NOTE(review): allow_dangerous_deserialization=True permits pickle
# loading of the saved index. That is acceptable here only because the index
# was produced by this same script a few lines earlier; never point this call
# at an index obtained from an untrusted source.
vectorstore = FAISS.load_local("faiss_index_gemma", embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever()

# --- RetrievalQA chain ---
# Answers a question with the local LLM over the chunks returned by
# `retriever`. return_source_documents=True is what makes the chain's output
# carry a "source_documents" entry, which get_evaluated_response reads.
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# --- Query reformulation tool ---
def query_reformulation(query: str) -> str:
    """Ask the local LLM for a clearer, more specific version of ``query``.

    Args:
        query: The user's original question.

    Returns:
        The text the model produced after the final "Rewritten:" marker
        (or its whole output when the marker is absent), stripped of
        surrounding whitespace. If anything goes wrong, a string starting
        with "Query reformulation failed:" is returned instead of raising.
    """
    prompt = f"Rewrite the following query to be more specific and clear:\n{query}\nRewritten:"
    try:
        raw_output = llm.invoke(prompt)
        # Keep only what follows the last marker so any echoed prompt or
        # preamble is dropped from the result.
        _, _, rewritten = raw_output.strip().rpartition("Rewritten:")
        return rewritten.strip()
    except Exception as e:
        return f"Query reformulation failed: {e}"

# --- Self-evaluation tool ---
def _strip_label(text: str, label: str) -> str:
    """Drop a single leading ``label`` from ``text`` and trim whitespace."""
    text = text.lstrip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def self_evaluate(input_text: str) -> str:
    """Ask the local LLM to grade a response against its query and sources.

    Args:
        input_text: A single string of the form
            ``"QUERY: <q>|||RESPONSE: <r>|||SOURCES: <s>"``. The SOURCES
            part is optional.

    Returns:
        The model's evaluation text (whatever follows the last "Result:"
        marker in its output). On any failure, including malformed input,
        a string starting with "Evaluation failed:" is returned instead of
        raising, so the function is safe to call from a pipeline.
    """
    try:
        # maxsplit=2 keeps everything after the second delimiter inside the
        # sources field, so retrieved text that happens to contain "|||" is
        # not silently truncated.
        parts = input_text.split("|||", 2)
        if len(parts) < 2:
            raise ValueError(
                "expected input of the form "
                "'QUERY: ...|||RESPONSE: ...|||SOURCES: ...'"
            )

        # Only the leading label is removed; the same words appearing later
        # inside the field are part of the content and must be preserved.
        query = _strip_label(parts[0], "QUERY:")
        response = _strip_label(parts[1], "RESPONSE:")
        sources = _strip_label(parts[2], "SOURCES:") if len(parts) > 2 else ""

        prompt = f"""
Evaluate the following response:

QUERY: {query}
RESPONSE: {response}
SOURCES: {sources}

Rate the response on factual accuracy, completeness, relevance, and hallucination.
Provide a confidence score (0-10) with explanation.

Result:
"""
        result = llm.invoke(prompt)
        return result.strip().split("Result:")[-1].strip()
    except Exception as e:
        # Deliberate best-effort boundary: evaluation problems are reported
        # as text rather than aborting the surrounding pipeline.
        return f"Evaluation failed: {e}"

# --- Tools list ---
def _article_retrieval(tool_input) -> str:
    """Answer a question from the local article via the RetrievalQA chain.

    Agents may hand a tool either a plain string or a dict of arguments, so
    both shapes are accepted: a dict's "query" entry is used when present,
    anything else is converted to a string.

    Returns:
        The chain's "result" text.
    """
    if isinstance(tool_input, dict) and "query" in tool_input:
        question = tool_input["query"]
    else:
        question = str(tool_input)
    # Use .invoke for consistency with the other chain/LLM calls in this file.
    return retrieval_qa_chain.invoke({"query": question})["result"]


tools = [
    Tool(
        name="Article Retrieval",
        func=_article_retrieval,
        description="Retrieve knowledge from local article.",
    ),
]

# --- Memory ---
# Conversation buffer passed to both initialize_agent and AgentExecutor below.
# History is exposed under the "chat_history" key as message objects.
# output_key="output" names the single result field to store; presumably
# required because the executor also returns intermediate steps (confirm).
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="output"
)

# --- Agent initialization ---
# The format instructions below describe the plain-text ReAct protocol
# ("Action:" / "Action Input:" lines). That protocol is parsed by the
# ZERO_SHOT_REACT_DESCRIPTION agent. The structured-chat agent type expects a
# fenced JSON blob instead, so with these instructions it would treat every
# model reply as a final answer and never call a tool.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True,
    agent_kwargs={
        'format_instructions': """
Use the following format EXACTLY:

Question: the input question you must answer
Thought: think about what to do
Action: the action to take, must be one of [{tool_names}]
Action Input: the input to the action  <-- THIS LINE IS REQUIRED
Observation: result of the action
... (repeat Thought/Action/Action Input/Observation as needed)
Thought: I now know the final answer
Final Answer: the final answer to the input question <-- REQUIRED
"""
    }
)

from langchain.agents import AgentExecutor

# Rebuild the executor around the same agent so that intermediate steps are
# returned alongside the final output. max_iterations bounds the
# Thought/Action loop: each step is a full generation on the CPU, and a small
# model that keeps producing unparseable output would otherwise retry up to
# the library default.
agent_executor = AgentExecutor.from_agent_and_tools(
    agent=agent.agent,
    tools=tools,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True,
    return_intermediate_steps=True,
    max_iterations=5,
)


# --- Evaluate and get response ---
def get_evaluated_response(query: str) -> dict:
    """Answer ``query`` and attach a self-evaluation of that answer.

    The agent is run first (it prints its verbose trace and updates the
    conversation memory). The reported answer and its sources both come from
    a single RetrievalQA call, so they always belong to the same run. The
    answer is then graded by ``self_evaluate``.

    Args:
        query: The user's question.

    Returns:
        A dict with the keys:
            "query": the original question,
            "response": the RetrievalQA answer,
            "evaluation": the text returned by ``self_evaluate``,
            "sources": retrieved chunks joined by newlines, or
                "No sources available" when nothing was retrieved,
            "agent_output": the agent's own final output (previously
                computed and discarded).
    """
    agent_result = agent_executor.invoke({"input": query})
    if isinstance(agent_result, dict):
        agent_output = agent_result.get("output", "")
    else:
        agent_output = str(agent_result)

    # One QA call supplies both the answer and the source documents. Calling
    # the chain a second time would cost another full generation and could
    # pair the answer with sources from a different run.
    qa_result = retrieval_qa_chain.invoke({"query": query})
    response = qa_result["result"]

    # Source documents come from the QA chain result.
    source_docs = qa_result.get("source_documents") or []
    sources_text = "\n".join(doc.page_content for doc in source_docs)
    if not sources_text:
        sources_text = "No sources available"

    evaluation_input = f"QUERY: {query}|||RESPONSE: {response}|||SOURCES: {sources_text}"
    evaluation = self_evaluate(evaluation_input)

    return {
        "query": query,
        "response": response,
        "evaluation": evaluation,
        "sources": sources_text,
        "agent_output": agent_output,
    }

# --- Transparent output ---
def transparent_response(query: str):
    """Print the query, then its answer, evaluation and supporting sources.

    Args:
        query: The question to run through ``get_evaluated_response``.
    """
    print("\n\nQuery:", query)
    outcome = get_evaluated_response(query)

    print("\n===== Query Result =====")
    # (label, key) pairs, printed in this fixed order.
    report_fields = (
        ("Response:", "response"),
        ("Confidence:", "evaluation"),
        ("Sources:\n", "sources"),
    )
    for label, key in report_fields:
        print(label, outcome[key])

# --- Example usage ---
# Runs one demo question end to end when executed as a script or notebook cell.
if __name__ == "__main__":
    demo_query = "Explain Agentic RAG approach."
    transparent_response(demo_query)
