In [None]:
# Install/upgrade LlamaIndex and the Google GenAI integrations into the kernel's environment.
# Versions are not pinned, so each run may resolve to different releases.
%pip install --upgrade llama-index        # core
%pip install --upgrade llama-index-llms-google-genai  # Google / Gemini LLM integration
%pip install --upgrade llama-index-embeddings-google-genai  # embeddings via Google GenAI
%pip install --upgrade google-generativeai  # Google SDK -- NOTE(review): presumably the legacy SDK; confirm the google-genai integrations above still need it

In [None]:
%pip install pinecone 

In [None]:
import os
from pinecone import Pinecone
from llama_index.core import (
 Settings,
 SimpleDirectoryReader,
 VectorStoreIndex,
 get_response_synthesizer,
)

In [None]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.core import KeywordTableIndex, SimpleKeywordTableIndex
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [None]:
# Optional instrumentation: attach a token-counting callback handler if one can be
# created. On any failure the notebook keeps running with `counter` left as None.
from llama_index.core.callbacks import CallbackManager

counter = None
try:
    from llama_index.core.callbacks import TokenCountingHandler

    counter = TokenCountingHandler()
    Settings.callback_manager = CallbackManager([counter])
except Exception:
    # Best effort only: drop the handler rather than abort the notebook.
    counter = None

In [None]:
# ============================ 1) TURN OFF LLM WHILE LOADING OFFLINE DOCS ============================
# Clear the global LLM before the local documents are loaded and the keyword index is
# built, so those steps cannot reach a real model. (The files loaded below are HTML
# pages, not PDFs.)
# NOTE(review): presumably LlamaIndex substitutes a mock LLM when this is None -- confirm.
Settings.llm = None

In [None]:
# Load the saved HTML pages from local disk (no LLM calls happen here).
HTML_DIR = "./GenAI/GenAI-NoteBooks/coffee_pages"

try:
    reader = SimpleDirectoryReader(HTML_DIR, required_exts=[".html"])
except TypeError:
    # Compatibility fallback kept from the original cell, for a reader whose
    # constructor does not accept `required_exts`.
    reader = SimpleDirectoryReader(HTML_DIR, file_exts=[".html"])

# load_data() runs outside the try block so that a TypeError raised while parsing a
# file surfaces as-is instead of silently triggering the fallback constructor.
docs = reader.load_data()

In [None]:
# Build a pure keyword lookup index from the loaded documents (NO LLM CALLS).
# KeywordTableIndex extracts keywords by prompting an LLM, which defeats the purpose
# of this offline step (and yields junk keywords while the LLM is disabled).
# SimpleKeywordTableIndex extracts keywords with a regex-based extractor instead.
kw_idx = SimpleKeywordTableIndex.from_documents(docs, transformations=[])

In [None]:
# ============================ 2) ENABLE EMBEDDING + LLM FOR QUERIES ============================
# The query-time embedding model must match the one used to populate the Pinecone
# index (text-embedding-004, 768-d); otherwise similarity scores are meaningless.
# The key is read from the environment only -- do not paste it into the notebook.
Settings.embed_model = GoogleGenAIEmbedding(
    model_name="models/text-embedding-004",
    api_key=os.environ["GEMINI_API_KEY"],  # raises KeyError if the variable is unset
)

In [None]:
# Query-time LLM: low temperature, short answers, tool use switched off.
# The key is read from the environment only -- do not paste it into the notebook.
# NOTE(review): `tools` and `tool_config` are carried over unchanged. The original
# comment claims unsupported kwargs are ignored by the constructor -- confirm this
# against the installed llama-index-llms-google-genai version.
Settings.llm = GoogleGenAI(
    model="gemini-2.5-flash",
    api_key=os.environ["GEMINI_API_KEY"],  # raises KeyError if the variable is unset
    temperature=0.2,
    max_tokens=512,
    tools=[],
    tool_config={"function_calling_config": "NONE", "max_remote_calls": 0},
)

In [None]:
# ============================ 3) DATA / INDEX (Pinecone semantic side) ============================
# SECURITY: the Pinecone API key is read from the PINECONE_API_KEY environment variable.
# Never paste a key into the notebook; any key that has been committed to source
# control must be treated as leaked and rotated in the Pinecone console.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # raises KeyError if the variable is unset
pc_index = pc.Index("coffeeindex")

# `text_key` names the metadata field holding the chunk text; change it if the index
# was populated under a different key such as "page_content".
vstore = PineconeVectorStore(pinecone_index=pc_index, text_key="text")

# Wrap the existing Pinecone index; nothing is re-embedded or uploaded here.
sem_idx = VectorStoreIndex.from_vector_store(vstore)

In [None]:
# ================= Retrievers =================
# One retriever per index, each requested with similarity_top_k=5. These are meant to
# make no LLM calls; the semantic side is expected to embed the query once per search.
kw_ret = kw_idx.as_retriever(similarity_top_k=5)
sem_ret = sem_idx.as_retriever(similarity_top_k=5)


# ================= Hybrid retrieval with NO query expansion =================
# Merge the semantic and keyword results with reciprocal-rank reranking.
# num_queries=1 -> use only the original query; do NOT generate variations
# (this is what prevents extra LLM work).
hybrid = QueryFusionRetriever(
    retrievers=[sem_ret, kw_ret],
    mode="reciprocal_rerank",
    num_queries=1,
    similarity_top_k=5,
    use_async=False,
)


# ================= Response synthesizer =================
# "compact" mode is chosen with the intent of keeping each query to a single LLM
# generation over the retrieved chunks.
synth = get_response_synthesizer(response_mode="compact", llm=Settings.llm)

# ================= Build Query Engine =================
# Combine the hybrid retriever and the response synthesizer into one query engine.
qe = RetrieverQueryEngine.from_args(response_synthesizer=synth, retriever=hybrid)

# Patch asyncio so event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs an event
# loop -- confirm it is still required given use_async=False on the fusion retriever.
import nest_asyncio
nest_asyncio.apply()

# ================= RUN FUNCTION =================
def run_one(q: str):
    """Run one query through the query engine and print the answer.

    When the optional token-counting handler is installed, its counters are reset
    before the query and the number of LLM calls recorded for this query is
    printed afterwards.

    Args:
        q: Natural-language question to send to the query engine.
    """
    if counter is not None:
        counter.reset_counts()

    ans = qe.query(q)
    print(ans)

    if counter is not None:
        # TokenCountingHandler has no `total_llm_calls` attribute; it records one
        # event per LLM call in `llm_token_counts`, so the call count is that
        # list's length. Fall back to "N/A" if the attribute is missing.
        llm_events = getattr(counter, "llm_token_counts", None)
        llm_calls = len(llm_events) if llm_events is not None else "N/A"
        print("LLM calls for this query:", llm_calls)  # normally 1


# ================= Run queries =================
# Each question goes through run_one in order; each is expected to cost one LLM call.
for question in (
    "What is turmeric coffee?",
    "Find entries mentioning ashwagandha in titles or headings.",
    "Explain brewing temperature in 2 bullets.",
):
    run_one(question)