# RAG da Coletânea

Elaboração de um sistema de LLM+RAG (Retrieval-Augmented Generation) para o database da Coletânea.

## Exemplo mínimo de RAG com LangChain + Ollama

Fluxo: instalar depend√™ncias, localizar `database.db`, carregar hinos, gerar embedding/chroma simples e testar uma consulta.

> Certifique-se de que o servidor do Ollama está rodando e que os modelos de embedding e de LLM usados neste notebook (`nomic-embed-text` e `gemma3:4b`) estão baixados.

## Checklist rápido antes de rodar
- Suba o servidor do Ollama e garanta que os modelos `nomic-embed-text` e `gemma3:4b` estão baixados.
- Confirme o caminho do `database.db` (as rotas já tentam achar na raiz do repo).
- Instale as dependências na célula seguinte (comente/descomente o `%pip`).
- Rode as células na ordem; a primeira execução cria os caches/chroma em `shared/rag/` e reaproveita nas próximas.
- Ajuste `max_rows` ou parâmetros de chunking/indexação se quiser limitar dados para testes rápidos.

In [None]:
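# Instalando libs necessárias
# (difflib faz parte da biblioteca padrão do Python e não precisa ser instalado)
# %pip install -q langchain langchain-community langchain-ollama langchain-chroma langchain-text-splitters chromadb sqlite-utils pydantic rank-bm25 thefuzz python-Levenshtein tqdm requests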

In [None]:
import sqlite3
from contextlib import closing
from pathlib import Path

# Locate database.db by probing the current working directory and its two
# nearest ancestors for a "database/database.db" file.
candidates = [
    Path.cwd() / "database" / "database.db",
    Path.cwd().parent / "database" / "database.db",
    Path.cwd().parent.parent / "database" / "database.db",
]
db_path = next((p for p in candidates if p.exists()), None)
if db_path is None:
    raise FileNotFoundError("database.db n√£o encontrado; ajuste o caminho na lista de candidates.")

print(f"Usando DB: {db_path}")

# A sqlite3 connection used directly as a context manager only commits or
# rolls back the transaction; it does not close the connection. closing()
# makes sure the file handle is released when the block ends.
with closing(sqlite3.connect(db_path)) as conn:
    cur = conn.cursor()
    # Sanity check: list the tables and count the hymns.
    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = [r[0] for r in cur.fetchall()]
    cur.execute("SELECT count(*) FROM hino")
    total_hinos = cur.fetchone()[0]

print(f"Tabelas: {tables}")
print(f"Total de hinos: {total_hinos}")

## Indexação: chunks e vectorstore
Esta etapa cria (ou reaproveita) os chunks dos hinos e o vectorstore Chroma.
- Usa cache em `shared/rag/chunks_cache.pkl` para evitar re-splitting.
- Persiste o Chroma em `shared/rag/vectorstore/`.
- Altere `max_rows`, `chunk_size` ou `chunk_overlap` no código seguinte se precisar de execuções mais leves.

In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from tqdm import tqdm
import pickle

# Embedding configuration
OLLAMA_EMBED_MODEL = "nomic-embed-text"
embeddings = OllamaEmbeddings(model=OLLAMA_EMBED_MODEL)

# Cache locations, resolved against the parent of the working directory
vector_dir = Path.cwd().parent / "shared" / "rag" / "vectorstore"
chunks_cache = Path.cwd().parent / "shared" / "rag" / "chunks_cache.pkl"

# ===== LOAD CHUNKS FROM THE CACHE FIRST =====
if chunks_cache.exists():
    print(f"‚úì Carregando chunks do cache: {chunks_cache}")
    # SECURITY NOTE: pickle.load executes whatever the file tells it to.
    # Only load a cache that this notebook wrote; delete it if in doubt.
    with open(chunks_cache, "rb") as f:
        chunks = pickle.load(f)
    print(f"‚úì {len(chunks)} chunks carregados do cache")
else:
    # No cache yet: read the hymns from the DB and split them into chunks.
    print("‚ö† Cache de chunks n√£o encontrado. Criando do zero...")

    # Load the hymns; lower max_rows to index only a subset.
    max_rows = total_hinos
    # closing() releases the connection; sqlite3's own context manager
    # would only end the transaction.
    with closing(sqlite3.connect(db_path)) as conn:
        cur = conn.cursor()
        cur.execute(
            """
            SELECT id, nome, numero, texto_limpo AS texto,
                   categoria_id, coletanea_id
            FROM hino
            WHERE texto_limpo IS NOT NULL
            ORDER BY id
            LIMIT ?
            """,
            (max_rows,),
        )
        rows = cur.fetchall()

    print(f"‚úì {len(rows)} hinos carregados do DB")

    # Build one Document per hymn: "title (number)" header followed by the text.
    docs = []
    for hid, nome, numero, texto, categoria_id, coletanea_id in tqdm(rows, desc="Criando documentos"):
        if not texto:
            # Empty strings pass the IS NOT NULL filter; skip them here.
            continue
        content = f"{nome or ''} ({numero or ''})\n\n{texto.strip()}"
        docs.append(
            Document(
                page_content=content,
                metadata={
                    "hino_id": hid,
                    "nome": nome,
                    "numero": numero,
                    "categoria_id": categoria_id,
                    "coletanea_id": coletanea_id,
                },
            )
        )

    # Split one document at a time so tqdm can report progress per hymn.
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
    chunks = []
    for doc in tqdm(docs, desc="Criando chunks"):
        chunks.extend(splitter.split_documents([doc]))
    print(f"‚úì {len(chunks)} chunks criados de {len(docs)} hinos")

    # Persist the chunks. The cache directory may not exist on a first run,
    # so create it before opening the file for writing.
    print(f"üíæ Salvando chunks em: {chunks_cache}")
    chunks_cache.parent.mkdir(parents=True, exist_ok=True)
    with open(chunks_cache, "wb") as f:
        pickle.dump(chunks, f)
    print("‚úì Chunks salvos com sucesso!")

# ===== LOAD / BUILD THE VECTORSTORE =====
# Record whether a persisted store was already on disk before opening it.
store_file_exists = (vector_dir / "chroma.sqlite3").exists()
vectorstore = Chroma(embedding_function=embeddings, persist_directory=str(vector_dir))

# The store file can exist while holding no documents (for example when a
# previous run failed before the first batch was embedded). Treat an empty
# store as "not indexed" so it is rebuilt instead of silently returning no
# results for every query.
has_index = store_file_exists and bool(vectorstore.get(limit=1)["ids"])

if has_index:
    print(f"‚úì Carregando vectorstore de: {vector_dir}")
    print("‚úì Vectorstore carregado com sucesso!")
else:
    print("‚ö† Vectorstore n√£o encontrado. Criando do zero...")
    print(f"üî® Criando vectorstore em: {vector_dir}")
    # Embed in batches so progress is visible and each request stays small.
    batch_size = 64
    for start in tqdm(range(0, len(chunks), batch_size), desc="Indexando"):
        vectorstore.add_documents(chunks[start : start + batch_size])
    print("‚úì Vectorstore criado e salvo!")

print(f"\n‚úÖ Setup completo! Vectorstore ({len(chunks)} chunks) pronto para uso.")

## Configuração de Filtros e Metadados

Carrega categorias e coletâneas disponíveis no banco para uso em filtros dinâmicos.

In [None]:
# Load the categories and collections available for filtering.
# Both dicts map the lower-cased name to the row id.
with closing(sqlite3.connect(db_path)) as conn:  # closing(): actually release the connection
    cur = conn.cursor()
    cur.execute("SELECT id, descricao FROM categoria")
    # Rows with a NULL/empty name cannot be matched against a query (and
    # would crash on .lower()), so they are left out.
    categorias = {descricao.lower(): cid for cid, descricao in cur.fetchall() if descricao}

    cur.execute("SELECT id, nome FROM coletanea")
    coletaneas = {nome.lower(): cid for cid, nome in cur.fetchall() if nome}

print(f"‚úì Categorias: {list(categorias.keys())}")
print(f"‚úì Colet√¢neas: {list(coletaneas.keys())}")

## Configuração de Retrievers e LLM

Configura o LLM, retrievers (vetorial MMR + BM25 híbrido) e prompts base.

In [None]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.retrievers import BM25Retriever
import re

# Main LLM
llm = OllamaLLM(model="gemma3:4b", temperature=0.3)

# Custom stopwords: one entry per line, optionally wrapped in double quotes;
# blank lines and lines starting with "#" are ignored.
stopwords_path = Path.cwd().parent / "etl-similarity" / "assets" / "stopwords-br.txt"
STOPWORDS = set()
with open(stopwords_path, encoding="utf-8") as f:
    for line in f:
        entry = line.strip()
        # Test the stripped line so an indented "#" comment is skipped too.
        if not entry or entry.startswith("#"):
            continue
        # The BM25 tokenizer lower-cases its tokens, so stopwords must be
        # lower-case as well or they would never match.
        word = entry.strip('"').lower()
        if word:
            STOPWORDS.add(word)

# Tokenizer used by the BM25 retriever
WORD_RE = re.compile(r"\w+")


def bm25_tokenizer(text: str):
    """Lower-case *text*, split it into word-character runs and drop stopwords."""
    return [tok for tok in WORD_RE.findall(text.lower()) if tok not in STOPWORDS]

# Vector retriever: MMR search over the Chroma store (k=8, fetch_k=20).
vector_retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=8, fetch_k=20),
    search_type="mmr",
)

# Keyword retriever: BM25 over the chunk list, only when chunks are available.
if not chunks:
    bm25_retriever = None
    print("‚ö† BM25 n√£o dispon√≠vel (chunks n√£o carregados)")
else:
    bm25_retriever = BM25Retriever.from_documents(chunks, preprocess_func=bm25_tokenizer)
    bm25_retriever.k = 8
    print("‚úì BM25 retriever configurado")

# Base prompts
# System instructions for the query-rewriting step. Written line by line so
# the trailing spaces and newlines of the prompt are explicit.
rewrite_system_str = (
    "\n"
    "Reescreva a consulta do usu√°rio para busca em hinos, expandindo com sin√¥nimos e termos relacionados, \n"
    "mantendo inten√ß√£o e concis√£o. Mantenha as palavras que estiverem entre aspas. \n"
    "N√£o adicione explica√ß√µes sobre as altera√ß√µes na consulta.\n"
)
rewrite_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", rewrite_system_str),
        ("user", "Consulta: {question}"),
    ]
)
# prompt -> LLM -> plain string
rewrite_chain = rewrite_prompt | llm | StrOutputParser()

def format_docs(docs):
    """Render documents as "[number] title" headers followed by their text.

    The number falls back to "N/A" when missing or falsy; entries are
    separated by a blank line.
    """
    return "\n\n".join(
        f"[{doc.metadata.get('numero') or 'N/A'}] {doc.metadata.get('nome')}\n{doc.page_content}"
        for doc in docs
    )

# System instructions for the answering step. Written line by line so the
# trailing spaces and newlines of the prompt are explicit.
answer_system_str = (
    "\n"
    "Voc√™ √© um assistente que responde apenas com base na colet√¢nea de hinos. \n"
    "√â prefer√≠vel retornar mais de uma op√ß√£o, pelo menos tr√™s, quando dispon√≠vel.\n"
    "Explique os motivos de selecionar tais hinos. \n"
    "Cite n√∫meros (se houver) e t√≠tulos.\n"
    "Se n√£o souber, diga que n√£o est√° na base.\n"
)

# The user turn carries the original question, the rewritten query and the
# retrieved context.
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", answer_system_str),
        (
            "user",
            "Pergunta original: {question}\n\n"
            "Consulta reescrita: {rewritten}\n\n"
            "Contexto:\n{context}\n\n"
            "Resposta:",
        ),
    ]
)

print("‚úì LLM e retrievers configurados!")

## Sistema de Busca com Filtros Dinâmicos

Implementa busca híbrida (vetorial + BM25) com extração automática e determinística (sem LLM) de filtros de categoria/coletânea.

In [None]:
from typing import Optional, List, Dict
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
import unicodedata
import requests
import re

# ==== Utilidades para refer√™ncias b√≠blicas ====
def _normalize_text(text: str) -> str:
    """Return *text* without combining accents, lower-cased and stripped."""
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c)).lower().strip()


# Normalized Portuguese book name (no accents, no spaces, lower case) mapped
# to the English book name used to build the API reference.
BIBLE_BOOK_MAP: Dict[str, str] = {
    "genesis": "Genesis",
    "exodo": "Exodus",
    "levitico": "Leviticus",
    "numeros": "Numbers",
    "deuteronomio": "Deuteronomy",
    "josue": "Joshua",
    "juizes": "Judges",
    "rute": "Ruth",
    "1samuel": "1 Samuel",
    "2samuel": "2 Samuel",
    "1reis": "1 Kings",
    "2reis": "2 Kings",
    "1cronicas": "1 Chronicles",
    "2cronicas": "2 Chronicles",
    "esdras": "Ezra",
    "neemias": "Nehemiah",
    "ester": "Esther",
    "jo": "Job",
    "salmo": "Psalms",
    "salmos": "Psalms",
    "proverbios": "Proverbs",
    "eclesiastes": "Ecclesiastes",
    "cantico": "Song of Solomon",
    "canticos": "Song of Solomon",
    "cantares": "Song of Solomon",
    "canticodoscanticos": "Song of Solomon",
    "canticodecanticos": "Song of Solomon",
    "isaias": "Isaiah",
    "jeremias": "Jeremiah",
    "lamentacoes": "Lamentations",
    "ezequiel": "Ezekiel",
    "daniel": "Daniel",
    "oseias": "Hosea",
    "joel": "Joel",
    "amos": "Amos",
    "obadias": "Obadiah",
    "jonas": "Jonah",
    "miqueias": "Micah",
    "naum": "Nahum",
    "habacuque": "Habakkuk",
    "sofonias": "Zephaniah",
    "ageu": "Haggai",
    "zacarias": "Zechariah",
    "malaquias": "Malachi",
    "mateus": "Matthew",
    "marcos": "Mark",
    "lucas": "Luke",
    "joao": "John",
    "atos": "Acts",
    "romanos": "Romans",
    "1corintios": "1 Corinthians",
    "2corintios": "2 Corinthians",
    "galatas": "Galatians",
    "efesios": "Ephesians",
    "filipenses": "Philippians",
    "colossenses": "Colossians",
    "1tessalonicenses": "1 Thessalonians",
    "2tessalonicenses": "2 Thessalonians",
    "1timoteo": "1 Timothy",
    "2timoteo": "2 Timothy",
    "tito": "Titus",
    "filemom": "Philemon",
    "hebreus": "Hebrews",
    "tiago": "James",
    "1pedro": "1 Peter",
    "2pedro": "2 Peter",
    "1joao": "1 John",
    "2joao": "2 John",
    "3joao": "3 John",
    "judas": "Jude",
    "apocalipse": "Revelation",
}

# Words that stand for a book number when written before the book name
# ("I Corintios", "Segundo Pedro"); keys are already normalized.
_ORDINAL_PREFIXES: Dict[str, str] = {
    "i": "1",
    "primeiro": "1",
    "primeira": "1",
    "ii": "2",
    "segundo": "2",
    "segunda": "2",
    "iii": "3",
    "terceiro": "3",
    "terceira": "3",
}

# "<book> <chapter>[:<verse>[-<verse>]]" ("." is accepted instead of ":").
# Accented letters are written as \u escapes so the pattern does not depend
# on how this file is encoded; \u00C0-\u00FF is the Latin-1 accented range.
# The optional trailing words let a Roman/ordinal prefix or a multi-word name
# be captured as part of the book; they also capture an unrelated preceding
# word ("em joao"), which _resolve_book() strips again.
REF_RE = re.compile(
    r"""
    (                                   # 1: book name as typed
        [1-3]?\s?[A-Za-z\u00C0-\u00FF]+
        (?:
            \s+dos\s+c[a\u00E2]nticos
          | \s+de\s+c[a\u00E2]nticos
          | \s+dos\s+reis
          | \s+reis
          | \s+samuel
          | \s+cr[o\u00F4]nicas
          | \s+cor[i\u00ED]ntios
          | \s+tessalonicenses
          | \s+tim[o\u00F3]teo
          | \s+pedro
          | \s+jo[a\u00E3]o
        )*
    )
    \s+(\d{1,3})                        # 2: chapter
    (?:
        [:.](\d{1,3})                   # 3: first verse
        (?:-(\d{1,3}))?                 # 4: last verse of a range
    )?
    """,
    re.IGNORECASE | re.VERBOSE,
)


def _normalize_book_key(book: str) -> str:
    """Return the BIBLE_BOOK_MAP lookup key for a book name as typed."""
    return _normalize_text(book).replace(" ", "")


def _resolve_book(book_raw: str) -> Optional[tuple]:
    """Resolve the book text captured by REF_RE.

    Returns ``(map_key, display_name)`` or ``None`` when no known book is
    found. The whole capture is tried first; otherwise leading words are
    dropped one at a time, and a dropped word that is an ordinal/Roman
    numeral is turned into the book number when that numbered book exists.
    """
    display = book_raw.strip()
    key = _normalize_book_key(display)
    if key in BIBLE_BOOK_MAP:
        return key, display

    raw_words = display.split()
    norm_words = [_normalize_text(word) for word in raw_words]
    for start in range(1, len(raw_words)):
        tail = "".join(norm_words[start:])
        numbered = _ORDINAL_PREFIXES.get(norm_words[start - 1], "") + tail
        if numbered != tail and numbered in BIBLE_BOOK_MAP:
            return numbered, " ".join(raw_words[start - 1:])
        if tail in BIBLE_BOOK_MAP:
            return tail, " ".join(raw_words[start:])
    return None


def extract_bible_refs(text: str) -> List[dict]:
    """Find bible references in *text*.

    Returns one dict per distinct reference, in order of appearance, with:
    ``label`` (book as typed + chapter/verses), ``api_ref`` (English book
    name + chapter/verses) and ``type`` (``"chapter"`` when no verse was
    given, else ``"verse"``). Unknown books are ignored; ``None`` or empty
    text yields an empty list.
    """
    refs: List[dict] = []
    seen = set()
    for match in REF_RE.finditer(text or ""):
        book_raw, chapter, verse_start, verse_end = match.groups()

        resolved = _resolve_book(book_raw)
        if resolved is None:
            continue
        key, book_label = resolved
        api_book = BIBLE_BOOK_MAP[key]

        if verse_start is None:
            # No verse: the reference is a whole chapter.
            location = chapter
        else:
            # Single verse or a verse range.
            location = f"{chapter}:{verse_start}" + (f"-{verse_end}" if verse_end else "")

        api_ref = f"{api_book} {location}"
        if api_ref in seen:
            continue
        seen.add(api_ref)
        refs.append(
            {
                "label": f"{book_label} {location}",
                "api_ref": api_ref,
                "type": "chapter" if verse_start is None else "verse",
            }
        )
    return refs


def fetch_bible_verses(
    refs: List[dict], translation: str = "almeida", max_chars: int = 1200
) -> str:
    """Download the text of each reference from bible-api.com.

    Each entry of *refs* needs ``api_ref`` and ``label`` (and optionally
    ``type``). A reference that answers with a non-200 status, has no text,
    or raises anything is skipped. Whole chapters are cut to 800 characters
    plus "...". Fetching stops once the accumulated snippet length reaches
    *max_chars*; the snippet that crosses the limit is kept whole, so the
    result can be longer than *max_chars*.

    Returns the snippets ("label - text") joined by newlines, or "".
    """
    collected = []
    used_chars = 0
    for ref in refs:
        # Best effort: any failure for one reference just skips it.
        try:
            response = requests.get(
                f"https://bible-api.com/{requests.utils.quote(ref['api_ref'])}",
                params={"translation": translation},
                timeout=8,
            )
            if response.status_code == 200:
                payload = response.json()
                stripped_texts = (v.get("text", "").strip() for v in payload.get("verses", []))
                body = " ".join(piece for piece in stripped_texts if piece)
                if body:
                    # Whole chapters can be long; keep only the beginning.
                    if ref.get("type") == "chapter" and len(body) > 800:
                        body = body[:800] + "..."

                    entry = f"{ref['label']} ‚Äî {body}"
                    collected.append(entry)
                    used_chars += len(entry)
                    if used_chars >= max_chars:
                        break
        except Exception:
            continue
    return "\n".join(collected)


In [None]:
from thefuzz import fuzz
import unicodedata

# ===== EXTRA√á√ÉO DETERMIN√çSTICA DE FILTROS =====

def _normalize_for_matching(text: str) -> str:
    """Prepare *text* for matching.

    Drops combining accents, lower-cases, and collapses every run of
    whitespace into a single space. Falsy input yields "".
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    words = "".join(base_chars).lower().split()
    return " ".join(words)


def _find_matches_in_text(query: str, candidates: list) -> list:
    """Score which *candidates* are mentioned in *query*.

    Both sides are normalized with ``_normalize_for_matching``. Each
    candidate gets the first score that applies:

    1. the candidate is a substring of the query: 1.0;
    2. at least 70% of the candidate's distinct tokens are query tokens:
       that ratio;
    3. candidates of up to four words only: ``fuzz.partial_ratio`` of at
       least 0.6, scaled by 0.8 so a fuzzy hit ranks below the others.

    Candidates that are empty after normalization are ignored.

    Returns a list of ``(candidate, score)`` sorted by descending score.
    """
    matches = []
    query_norm = _normalize_for_matching(query)
    query_tokens = set(query_norm.split())

    for candidate in candidates:
        candidate_norm = _normalize_for_matching(candidate)

        # An empty string is a substring of every string, so a blank name
        # would otherwise "match" any query with a perfect score.
        if not candidate_norm:
            continue

        # Strategy 1: direct substring match (best case).
        if candidate_norm in query_norm:
            matches.append((candidate, 1.0))
            continue

        # Strategy 2: token overlap (tolerates a different word order).
        candidate_words = candidate_norm.split()
        candidate_tokens = set(candidate_words)
        token_ratio = len(query_tokens & candidate_tokens) / len(candidate_tokens)
        if token_ratio >= 0.7:
            matches.append((candidate, token_ratio))
            continue

        # Strategy 3: fuzzy fallback (typos), short candidates only.
        # partial_ratio looks for the best-matching substring.
        if len(candidate_words) <= 4:
            ratio = fuzz.partial_ratio(candidate_norm, query_norm) / 100.0
            if ratio >= 0.6:
                matches.append((candidate, ratio * 0.8))  # penalize fuzzy hits

    # Highest score first.
    return sorted(matches, key=lambda x: x[1], reverse=True)


def _extract_filters_deterministic(question: str, categorias_dict: dict, coletaneas_dict: dict) -> dict:
    """Extract category/collection filters from *question* without an LLM.

    Category names are matched first; every match scoring at least 0.7 is
    accepted and its (lower-cased) name is replaced by a space in the
    lower-cased question. Collections are then matched against what is left.

    NOTE(review): the removal is a literal ``str.replace``, so a name that
    matched only through accent folding, token overlap or fuzzy matching
    stays in the cleaned query.

    Returns a dict with ``categorias`` and ``coletaneas`` (lists of names or
    ``None``), ``search_query`` (the cleaned, lower-cased question, or the
    original question when nothing is left) and ``matches_info`` (top three
    scored matches of each kind).
    """
    remaining = question.lower()

    def _take_confident(scored, text):
        # Keep names scoring >= 0.7 and blank them out of the running text.
        accepted = []
        for name, score in scored:
            if score < 0.7:
                continue
            accepted.append(name)
            text = text.replace(name.lower(), " ")
        return accepted, text

    # Categories first ...
    cat_matches = _find_matches_in_text(remaining, list(categorias_dict.keys()))
    found_categorias, remaining = _take_confident(cat_matches, remaining)

    # ... then collections, on the text with the categories already removed.
    col_matches = _find_matches_in_text(remaining, list(coletaneas_dict.keys()))
    found_coletaneas, remaining = _take_confident(col_matches, remaining)

    # Collapse the whitespace left behind by the removals.
    cleaned_query = " ".join(remaining.split())

    return {
        "categorias": found_categorias or None,
        "coletaneas": found_coletaneas or None,
        "search_query": cleaned_query or question,
        "matches_info": {
            "categorias_scores": cat_matches[:3],
            "coletaneas_scores": col_matches[:3],
        },
    }


print("‚úÖ Sistema determin√≠stico de extra√ß√£o de filtros carregado!")

In [None]:

def _ids_for_names(names, lookup: dict) -> list:
    """Translate filter names into ids via *lookup*, ignoring unknown names."""
    ids = []
    for name in names or []:
        found = lookup.get(name.lower())
        # Compare against None so a legitimate id of 0 is not discarded.
        if found is not None:
            ids.append(found)
    return ids


def _build_chroma_filter(categoria_ids: list, coletanea_ids: list) -> Optional[dict]:
    """Build the Chroma metadata filter for the given ids (None if no ids).

    Several ids for one field become an ``$in`` condition; conditions on
    both fields are combined under ``$and``, since a Chroma ``where`` dict
    may hold only one top-level condition/operator.
    """
    conditions = []
    for field, ids in (("categoria_id", categoria_ids), ("coletanea_id", coletanea_ids)):
        if len(ids) == 1:
            conditions.append({field: ids[0]})
        elif ids:
            conditions.append({field: {"$in": list(ids)}})
    if not conditions:
        return None
    if len(conditions) == 1:
        return conditions[0]
    return {"$and": conditions}


def _append_unique_hinos(target: list, docs, seen: set, limit: int) -> None:
    """Append docs whose hino_id is not in *seen* until *target* has *limit* items."""
    for doc in docs:
        if len(target) >= limit:
            break
        hid = doc.metadata.get("hino_id")
        if hid in seen:
            continue
        seen.add(hid)
        target.append(doc)


def hybrid_retrieve_filtered_with_filters(search_query: str, filters: dict) -> list:
    """Hybrid (vector + BM25) retrieval restricted by category/collection.

    *filters* may carry ``categorias`` and/or ``coletaneas`` (lists of
    names). Filters are combined with AND: a hymn must belong to one of the
    requested categories and to one of the requested collections. Names
    unknown to the module-level ``categorias``/``coletaneas`` dicts are
    ignored.

    Returns at most 10 documents, one per hymn, vector results first and
    BM25 results filling the remaining slots.
    """
    categoria_ids = _ids_for_names(filters.get("categorias"), categorias)
    coletanea_ids = _ids_for_names(filters.get("coletaneas"), coletaneas)
    has_filters = bool(categoria_ids or coletanea_ids)

    if has_filters:
        print(f"üîç Filtros aplicados (INTERSEC√á√ÉO): categorias={categoria_ids}, coletaneas={coletanea_ids}")

    def matches_filters(doc) -> bool:
        # Manual AND filter for retrievers that cannot filter by metadata.
        if categoria_ids and doc.metadata.get("categoria_id") not in categoria_ids:
            return False
        if coletanea_ids and doc.metadata.get("coletanea_id") not in coletanea_ids:
            return False
        return True

    # --- Vector search ---
    if has_filters:
        # One filtered query keeps the results globally ranked by similarity.
        # The pool grows with the number of id combinations so that several
        # chunks of the same hymn do not exhaust it.
        pool_size = 15 * max(1, len(categoria_ids)) * max(1, len(coletanea_ids))
        try:
            filtered_docs = vectorstore.similarity_search(
                search_query,
                k=pool_size,
                filter=_build_chroma_filter(categoria_ids, coletanea_ids),
            )
        except Exception as exc:
            # Best effort: report the failure instead of hiding it, then use
            # the unfiltered fallback below.
            print(f"Busca vetorial filtrada falhou ({exc}); usando busca sem filtro com filtragem manual")
            filtered_docs = []

        vec_docs = []
        _append_unique_hinos(vec_docs, filtered_docs, set(), limit=10)

        # Nothing found with the metadata filter: search unfiltered and
        # filter the results by hand.
        if not vec_docs:
            vec_docs = [d for d in vector_retriever.invoke(search_query) if matches_filters(d)]
    else:
        vec_docs = vector_retriever.invoke(search_query)

    # --- BM25, filtered by hand ---
    bm25_docs = bm25_retriever.invoke(search_query) if bm25_retriever else []
    if has_filters:
        bm25_docs = [d for d in bm25_docs if matches_filters(d)]

    # --- Merge: vector hits first, BM25 fills what is left, one doc per hymn ---
    combined = []
    seen = set()
    _append_unique_hinos(combined, vec_docs, seen, limit=10)
    _append_unique_hinos(combined, bm25_docs, seen, limit=10)
    return combined


def answer_filtered(question: str, verbose: bool = True):
    """Answer *question* using deterministic filter extraction.

    Steps: detect bible references and fetch their text; extract
    category/collection filters; rewrite the cleaned query with the LLM;
    retrieve hymns (the query is extended with up to 700 characters of bible
    text when available); build the final prompt and call the LLM.

    Returns the LLM output, or a fixed message when no hymn is found. With
    *verbose* set, progress is printed along the way.
    """
    # 1) Bible references mentioned in the question
    bible_refs = extract_bible_refs(question)
    bible_context = fetch_bible_verses(bible_refs) if bible_refs else ""
    if verbose and bible_refs:
        print(f"üìñ Refer√™ncias b√≠blicas detectadas: {bible_refs}")
        if bible_context:
            preview = bible_context[:240]
            if len(bible_context) > 240:
                preview += "..."
            print("üì• Texto b√≠blico recuperado (resumo):")
            print(preview)

    # 2) Deterministic filter extraction
    filters = _extract_filters_deterministic(question, categorias, coletaneas)
    search_query = filters.get("search_query", question)
    has_filters = bool(filters.get("categorias") or filters.get("coletaneas"))

    if verbose:
        if has_filters:
            print(f"üõ† Filtros detectados: categorias={filters.get('categorias')}, coletaneas={filters.get('coletaneas')}")
            print(f"   (scores: {filters['matches_info']})")
        else:
            print("üõ† Sem filtros detectados")
        print(f"üìù Query limpa: {search_query}")

    # 3) Query rewriting
    rewritten = rewrite_chain.invoke({"question": search_query})
    if verbose:
        print(f"üìù Consulta reescrita: {rewritten}")

    # 4) Retrieval; the bible text, when present, enriches the search query
    if bible_context:
        effective_query = rewritten + "\n\n" + '"' + bible_context[:700] + '"'
        if verbose:
            print("üîé Consulta efetiva enriquecida com texto b√≠blico")
    else:
        effective_query = rewritten

    docs = hybrid_retrieve_filtered_with_filters(effective_query, filters)
    if verbose:
        print(f"üìö Hinos encontrados: {len(docs)}")
        print("üîé Hinos selecionados:")
        for hit in docs:
            print(f"- [{hit.metadata.get('numero') or 'N/A'}] {hit.metadata.get('nome')}")

    if not docs:
        return "‚ùå Nenhum hino encontrado com esses crit√©rios."

    # 5) Final prompt
    context = format_docs(docs)
    if bible_context:
        context = context + "\n\nTrechos b√≠blicos fornecidos:\n" + '"' + bible_context + '"'
    filter_info = f"\nFiltros: {filters}" if has_filters else ""

    final_prompt = answer_prompt.format(
        question=question,
        rewritten=rewritten + filter_info,
        context=context,
    )

    if verbose:
        print("\nüí¨ Gerando resposta...")

    return llm.invoke(final_prompt)

print("‚úÖ Sistema de busca com filtros determin√≠sticos pronto!")

## Exemplos de Uso

Teste o sistema com diferentes consultas e filtros.

In [None]:
# Example 1: plain search (the question carries a bible reference)
print("=" * 60, "EXEMPLO 1: Busca por tema", "=" * 60, sep="\n")
resposta = answer_filtered("Hinos que combinam com o texto de Isa√≠as 4:6")
print(f"\n{resposta}\n")

In [None]:
# Example 2: question that names a category
print("=" * 60, "EXEMPLO 2: Filtro de categoria", "=" * 60, sep="\n")
resposta = answer_filtered("Hinos da categoria de invoca√ß√£o e comunh√£o sobre unidade")
print(f"\n{resposta}\n")

In [None]:
# Example 3: question that names a collection and quotes a phrase
print("=" * 60, "EXEMPLO 3: Filtro de colet√¢nea", "=" * 60, sep="\n")
resposta = answer_filtered(
    'Hinos da colet√¢nea de louvores avulsos sobre Deus como uma "torre forte"'
)
print(f"\n{resposta}\n")