In [20]:
%pip install --upgrade llama-index        # core
%pip install --upgrade llama-index-llms-google-genai  # Google / Gemini LLM integration
%pip install --upgrade llama-index-embeddings-google-genai  # embeddings via Google GenAI
%pip install --upgrade google-generativeai  # underlying Google SDK

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [21]:
import os
from difflib import SequenceMatcher
from functools import lru_cache

from pinecone import Pinecone
from llama_index.core import (
 Settings,
 VectorStoreIndex,
 SimpleDirectoryReader,
 KeywordTableIndex,)
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.query_engine import RouterQueryEngine,SubQuestionQueryEngine
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.query_engine import SubQuestionQueryEngine


In [24]:
# ============================ 3) Query layer (Rerank + Dedup + Compress + Budget) ============================
_RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"  # fast + accurate
_CONTEXT_SEPARATOR = "\n\n---\n\n"
_NO_RELEVANT_TEXT = "NO_RELEVANT_TEXT"  # sentinel the LLM returns for an irrelevant passage


@lru_cache(maxsize=8)
def _get_reranker(top_n: int):
    """Build the cross-encoder reranker once per ``top_n`` instead of once per query."""
    return SentenceTransformerRerank(model=_RERANK_MODEL, top_n=top_n)


def _is_near_duplicate(seen_tokens, candidate_tokens, cutoff: float) -> bool:
    """Return True when two token lists have a SequenceMatcher ratio >= ``cutoff``."""
    matcher = SequenceMatcher(None, seen_tokens, candidate_tokens, autojunk=False)
    # The two "quick" ratios are cheap upper bounds on ratio(); checking them
    # first avoids the expensive alignment for clearly different texts.
    return (
        matcher.real_quick_ratio() >= cutoff
        and matcher.quick_ratio() >= cutoff
        and matcher.ratio() >= cutoff
    )


def _drop_near_duplicates(nodes, cutoff: float):
    """Keep the first node of every group of near-identical texts, preserving order."""
    kept_nodes = []
    kept_tokens = []
    for scored in nodes:
        # Compare whitespace-split words so formatting differences do not matter.
        tokens = scored.node.get_content().split()
        if any(_is_near_duplicate(seen, tokens, cutoff) for seen in kept_tokens):
            continue
        kept_nodes.append(scored)
        kept_tokens.append(tokens)
    return kept_nodes


def _compress_for_question(text: str, question: str) -> str:
    """Ask the LLM for the query-relevant parts of ``text``; return "" if there are none."""
    if not text.strip():
        return ""
    prompt = (
        "Copy, word for word, only the parts of the PASSAGE that help answer the QUESTION.\n"
        f"If nothing in the PASSAGE is relevant, reply with exactly {_NO_RELEVANT_TEXT}.\n\n"
        f"PASSAGE:\n{text}\n\n"
        f"QUESTION:\n{question}\n"
    )
    extracted = (Settings.llm.complete(prompt).text or "").strip()
    return "" if _NO_RELEVANT_TEXT in extracted else extracted


def _pack_context(snippets, char_budget: int) -> str:
    """Join snippets with the separator so the result never exceeds ``char_budget`` characters.

    ``snippets`` may be a lazy iterable; it is not advanced once the budget is
    exhausted, so no further snippets are produced than can be used.
    """
    remaining = max(0, char_budget)
    if remaining == 0:
        return ""

    parts = []
    for text in snippets:
        if not text:
            continue
        # Every part after the first also pays for the separator in front of it.
        room = remaining - (len(_CONTEXT_SEPARATOR) if parts else 0)
        piece = text[:room]  # trim the snippet that crosses the budget
        parts.append(piece)
        remaining = room - len(piece)
        # Another part needs the separator plus at least one character.
        if remaining <= len(_CONTEXT_SEPARATOR):
            break
    return _CONTEXT_SEPARATOR.join(parts)


def answer_with_controls(
    question: str,
    base_k: int = 20,
    rerank_top_n: int = 6,
    dedup_cutoff: float = 0.90,
    char_budget: int = 3500
) -> str:
    """
    Answer ``question`` from the vector index with a controlled context window.

    Pipeline:
      1. Retrieval: fetch ``base_k`` candidates from the notebook-level ``index``.
      2. Reranking: a cross-encoder reorders them and keeps ``rerank_top_n``.
      3. Deduplication: drops a node when its text is at least ``dedup_cutoff``
         similar (word-level SequenceMatcher ratio, 0..1) to a node already kept.
      4. Compression: ``Settings.llm`` extracts the query-focused parts of each
         remaining node (one LLM call per node, made only while budget remains).
      5. Context budgeting: packs <= ``char_budget`` characters, separators
         included, into the prompt.
      6. Synthesis: one final ``Settings.llm.complete`` call.

    Args:
        question: The user question.
        base_k: Number of candidates requested from the retriever.
        rerank_top_n: Number of candidates kept after reranking.
        dedup_cutoff: Similarity at or above which a node counts as a duplicate;
            a value above 1.0 disables deduplication.
        char_budget: Maximum length of the CONTEXT section in characters.

    Returns:
        The text of the LLM completion.

    Relies on the notebook globals ``index`` and ``Settings`` being configured
    by earlier cells.
    """

    # --- 3.1 RETRIEVE (semantic) ---
    retriever = index.as_retriever(similarity_top_k=base_k)
    nodes = retriever.retrieve(question)  # 1 embedding call for Pinecone

    # --- 3.2 RERANK (cross-encoder) ---
    # Skipped when nothing was retrieved so the model is not loaded needlessly.
    if nodes:
        nodes = _get_reranker(rerank_top_n).postprocess_nodes(nodes, query_str=question)

    # --- 3.3 DEDUPLICATION (drop near-duplicates) ---
    # A score-threshold filter cannot do this: it never compares nodes to each
    # other, so texts are compared directly here.
    nodes = _drop_near_duplicates(nodes, dedup_cutoff)

    # --- 3.4 COMPRESSION (LLM extracts key parts) ---
    # Generator on purpose: packing stops pulling snippets once the budget is full.
    snippets = (_compress_for_question(n.node.get_content(), question) for n in nodes)

    # --- 3.5 CONTEXT BUDGETING (limit prompt size) ---
    context = _pack_context(snippets, char_budget)

    # --- 3.6 FINAL SYNTHESIS (1 LLM call) ---
    prompt = f"""
Answer the QUESTION using only the CONTEXT. 
Be concise and cite sections briefly if helpful.

CONTEXT:
{context}

QUESTION:
{question}
"""

    return Settings.llm.complete(prompt).text