In [None]:
%pip install --upgrade llama-index        # core
%pip install --upgrade llama-index-llms-google-genai  # Google / Gemini LLM integration
%pip install --upgrade llama-index-embeddings-google-genai  # embeddings via Google GenAI
%pip install --upgrade google-generativeai  # underlying Google 

In [None]:
!pip install langchain==0.0.304
!pip install langchain-google-genai==0.0.5
!pip install google-generativeai==0.3.2

In [None]:
# ========================== IMPORTS (all in one place)==========================
import os, json, textwrap
from typing import Tuple, Dict
# ---- LlamaIndex (for RAG over Pinecone) ----
from pinecone import Pinecone
from llama_index.core import Settings, VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding

In [None]:
from langchain.tools import Tool
from langchain.agents import create_react_agent, AgentExecutor
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
from langchain_google_genai import ChatGoogleGenerativeAI

# Visible confirmation that every import above resolved.
print("🎉 ALL IMPORTS WORKING")

In [None]:
#================================================================================
# 1) EMBEDDING / LLM LAYER
# (match Pinecone: you embedded with text-embedding-004 → 768-d)
#================================================================================
# The Google key is read from the GOOGLE_API_KEY environment variable instead of
# being pasted into the notebook. When the variable is unset the value stays the
# empty string, exactly as before.
_google_api_key = os.environ.get("GOOGLE_API_KEY", "")

# Embedding model used by LlamaIndex for query embedding.
Settings.embed_model = GoogleGenAIEmbedding(
    model_name="models/text-embedding-004",
    api_key=_google_api_key,
)

# LlamaIndex LLM (used only for grounded synthesis in the tool)
Settings.llm = GoogleGenAI(
    model="gemini-2.5-flash",
    api_key=_google_api_key,
    temperature=0.2,
    max_tokens=2048,
)

In [None]:
# LangChain chat LLM for the AGENT (same model for consistency).
# The key is taken from the GOOGLE_API_KEY environment variable rather than a
# literal in the notebook; when unset it stays the empty string, as before.
agent_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=os.environ.get("GOOGLE_API_KEY", ""),
    temperature=0.2,
)

In [None]:
#================================================================================
# 2) DATA / INDEX LAYER (Pinecone ONLY; no local PDFs)
#================================================================================
INDEX_NAME = "coffeeindex"
TEXT_KEY = "text"  # change if your field is e.g. "page_content"

# The Pinecone key is read from the PINECONE_API_KEY environment variable instead
# of being written into the notebook; when unset it stays the empty string.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
pc_index = pc.Index(INDEX_NAME)

# Wrap the existing Pinecone index as a LlamaIndex vector store, then build an
# index object on top of it.
vstore = PineconeVectorStore(pinecone_index=pc_index, text_key=TEXT_KEY)
li_index = VectorStoreIndex.from_vector_store(vstore)  # uses Settings.embed_model

In [None]:
#================================================================================
# 3) QUERY LAYER -- LlamaIndex → wrapped as a LangChain Tool("ask_docs")
# - Retrieval first (no LLM)
# - Single-shot, grounded prompt: "Use ONLY the CONTEXT"
# - Inline citations like [S1]; we also return a machine-readable sources map
# - Guardrail: if unsupported → return the fallback string (agent can failover)
#================================================================================

In [None]:
# Exact string the grounded prompt tells the model to reply with when the
# retrieved context does not support an answer.
FALLBACK_STR = "No support in retrieved context."

def _li_grounded_answer(question: str, k: int = 5, per_source_chars: int = 900) -> Tuple[str, Dict]:
    """Run LlamaIndex RAG over Pinecone and produce (answer_text, sources_map).

    Args:
        question: The user's question; used as the retrieval query and placed in
            the prompt verbatim.
        k: Passed to the retriever as ``similarity_top_k``.
        per_source_chars: Maximum number of characters of each hit's content that
            is copied into the prompt.

    Returns:
        A tuple ``(answer, sources_map)``. ``answer`` is the stripped completion
        text, or ``FALLBACK_STR`` when the question is blank or retrieval returns
        nothing. ``sources_map`` maps each tag (``"S1"``, ``"S2"``, ...) to
        ``{"heading": ..., "source": ...}`` and is empty when there were no hits.
    """
    # A blank question has nothing to retrieve for: skip both the embedding
    # call and the generation call.
    if not question or not question.strip():
        return FALLBACK_STR, {}

    # 1) Retrieve (cost: ONE embedding call with 004)
    hits = li_index.as_retriever(similarity_top_k=k).retrieve(question)

    # With no retrieved context the prompt's own rules can only yield the
    # fallback, so return it directly instead of paying for a generation.
    if not hits:
        return FALLBACK_STR, {}

    # 2) Build CONTEXT with numbered tags + store sources mapping for UI
    parts = []
    sources_map = {}

    for i, h in enumerate(hits, start=1):
        tag = f"S{i}"
        md = h.node.metadata or {}

        # First non-empty metadata field wins; fall back to a placeholder.
        heading = md.get("heading") or md.get("title") or "Untitled"
        src = (
            md.get("url")
            or md.get("source")
            or md.get("file_path")
            or md.get("doc_id")
            or "N/A"
        )

        # Trim text to avoid huge prompt
        snippet = h.node.get_content()[:per_source_chars]

        parts.append(f"[{tag}] {heading}\n{snippet}")
        sources_map[tag] = {"heading": heading, "source": src}

    # Join all parts into a single context string
    context = "\n\n---\n\n".join(parts)

    # 3) Strict, short, procedural prompt (single LLM call)
    prompt = f"""
You are a concise specialty-coffee expert.

RULES:
- Use ONLY the CONTEXT below.
- If the answer is not fully supported by the CONTEXT, reply exactly:
  "{FALLBACK_STR}"
- 4–6 sentences max.
- After each factual claim, add an inline citation like [S1], [S2].
- Do NOT use outside knowledge.

QUESTION:
{question}

CONTEXT (numbered sources):
{context}

ANSWER (with inline [S#] citations):
""".strip()

    # Final LLM call (ONE Gemini generation)
    answer = Settings.llm.complete(prompt).text.strip()

    return answer, sources_map

In [None]:
#================================================================================
# 4) LANGCHAIN AGENT with MEMORY (Memory RAG)
# - The agent decides when the 'ask_docs' tool should be called.
# - When the tool replies with the fallback string, the agent is told to ask a
#   short clarifying question (guardrail).
#
# NOTE(review): the imports cell pulls in `create_react_agent`, but this template
# has no `tools` / `tool_names` variables and uses a message placeholder for the
# scratchpad -- confirm it matches whichever agent constructor consumes it.
#================================================================================
prompt = ChatPromptTemplate.from_messages([
    # System instructions; the fallback string is interpolated when this cell runs.
    (
        "system",
        "You are a helpful assistant. "
        "Use the 'ask_docs' tool for any question that might be answered "
        "from the user's coffee knowledge base. "
        f"If the tool returns '{FALLBACK_STR}', "
        "ask a brief follow-up question to clarify.",
    ),
    # Conversation history slot
    MessagesPlaceholder(variable_name="chat_history"),
    # The user's current message
    ("human", "{input}"),
    # Slot for the agent's intermediate tool calls / reasoning
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])


In [None]:
# =============================== EXAMPLES ==================================
# NOTE(review): `executor` is not defined in any cell visible in this notebook;
# the cell that builds the agent executor must run before this one.
example_queries = [
    # 1) A doc-grounded question → agent calls ask_docs, returns grounded answer + citations
    "What is turmeric coffee? Keep it short.",
    # 2) A likely unsupported question → tool returns fallback → agent asks follow-up
    "Tell me about Ethiopian tea ceremonies from 1800s.",
    # 3) Another doc question leveraging conversation memory (Memory RAG flavor)
    "And ideal brewing temperature guidance?",
]

# Run the queries in order so each call sees whatever conversation state the
# executor kept from the previous one, printing every answer wrapped at 100 columns.
responses = []
for idx, query in enumerate(example_queries, start=1):
    resp = executor.invoke({"input": query})
    responses.append(resp)
    print(f"\n=== AGENT #{idx} ===\n", textwrap.fill(resp["output"], 100))

# Keep the per-example result names available for any later cell.
resp1, resp2, resp3 = responses
