# 🏢 RAG: Informações sobre a Processa

Este notebook implementa consulta semântica ao QDrant para obter informações sobre a Processa Sistemas.

## 📋 Funcionalidades

- Conexão com QDrant
- Geração de embeddings para query
- Busca semântica com reranking
- Formatação de resposta estruturada

## 1️⃣ Importações e Configuração

In [None]:
import os
import json
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance, Filter, FieldCondition, MatchValue
from sentence_transformers import SentenceTransformer
import numpy as np
from dataclasses import dataclass, asdict
import torch

In [None]:
# Load environment settings, preferring the development file when it exists
env_file = Path('.env.development')
env_file = env_file if env_file.exists() else Path('.env')

load_dotenv(env_file)

# Qdrant connection settings; the literals are fallbacks for unset variables
QDRANT_URL = os.environ.get('QDRANT_URL', 'https://qdrant.codrstudio.dev/')
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY')
QDRANT_COLLECTION = os.environ.get('QDRANT_COLLECTION', 'nic_dev')

# Embedding model identifier
EMBEDDING_MODEL = 'BAAI/bge-m3'

# Echo the effective configuration (the API key is deliberately not printed)
print(
    f"üì° QDrant URL: {QDRANT_URL}",
    f"üì¶ Collection: {QDRANT_COLLECTION}",
    f"üß† Modelo: {EMBEDDING_MODEL}",
    sep="\n",
)

## 2️⃣ Classe de Busca RAG

In [None]:
@dataclass
class SearchResult:
    """One retrieval hit in a client-independent shape.

    Field order is part of the interface: instances may be built
    positionally as ``SearchResult(text, score, metadata, source)``.
    """
    # Text content of the matched chunk.
    text: str
    # Relevance score of the hit.
    score: float
    # Free-form metadata attached to the chunk.
    metadata: Dict[str, Any]
    # Identifier of the originating document.
    source: str
    
class ProcessaInfoRAG:
    """Retrieval helper that answers questions about Processa from Qdrant.

    The class embeds a query with a SentenceTransformer model, searches the
    configured Qdrant collection, re-scores the hits and condenses the best
    ones into a structured dictionary.
    """

    def __init__(self):
        """Create the Qdrant client, load the embedding model, set defaults."""
        # Qdrant connection
        self.client = QdrantClient(
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
            timeout=30
        )

        # Embedding model, switched to inference mode
        self.model = SentenceTransformer(EMBEDDING_MODEL)
        self.model.eval()

        # Search settings
        self.top_k = 10              # hits requested by get_processa_info
        self.score_threshold = 0.5   # hits scoring below this are dropped

    def generate_embedding(self, text: str) -> np.ndarray:
        """Return the normalised embedding of ``text``.

        Args:
            text: The text to embed.

        Returns:
            The value returned by ``self.model.encode`` with
            ``normalize_embeddings=True``.
        """
        with torch.no_grad():
            embedding = self.model.encode(
                text,
                normalize_embeddings=True,
                show_progress_bar=False
            )
        return embedding

    def search_qdrant(self, query: str, limit: int = 10) -> List[SearchResult]:
        """Run a vector search and keep the hits at or above the threshold.

        Args:
            query: Free-text query to embed and search for.
            limit: Maximum number of hits requested from Qdrant. The returned
                list can be shorter because of the score threshold.

        Returns:
            ``SearchResult`` objects in the order Qdrant returned them.
        """
        query_vector = self.generate_embedding(query).tolist()

        hits = self.client.search(
            collection_name=QDRANT_COLLECTION,
            query_vector=query_vector,
            limit=limit,
            with_payload=True
        )

        search_results = []
        for hit in hits:
            if hit.score >= self.score_threshold:
                # The client types the payload as optional; treat a missing
                # payload as empty instead of failing on ``None.get``.
                payload = hit.payload or {}
                search_results.append(SearchResult(
                    text=payload.get('text', ''),
                    score=hit.score,
                    metadata=payload.get('metadata', {}),
                    source=payload.get('source', 'unknown')
                ))

        return search_results

    def rerank_results(self, query: str, results: List[SearchResult]) -> List[SearchResult]:
        """Blend each hit's score with a freshly computed similarity and re-sort.

        NOTE: this is not a cross-encoder. The query and the result texts are
        re-embedded with the same model used for retrieval, and the dot
        product of the normalised vectors is used as the similarity.

        The operation is in place: every ``result.score`` is overwritten with
        ``0.7 * original + 0.3 * similarity`` and ``results`` itself is
        sorted, so calling this twice on the same list compounds the blend.

        Args:
            query: The query the results were retrieved for.
            results: Hits to re-score; mutated and sorted in place.

        Returns:
            The same list object, ordered by descending blended score.
        """
        if not results:
            return results

        texts = [r.text for r in results]

        with torch.no_grad():
            query_embedding = self.model.encode(
                query,
                normalize_embeddings=True,
                show_progress_bar=False
            )
            text_embeddings = self.model.encode(
                texts,
                normalize_embeddings=True,
                show_progress_bar=False
            )

            # Vectors are normalised, so the dot product is the cosine similarity
            similarities = np.dot(text_embeddings, query_embedding)

        # float() turns the numpy scalar into a plain Python float so the
        # score stays JSON-serialisable
        for result, similarity in zip(results, similarities):
            result.score = float((result.score * 0.7) + (similarity * 0.3))

        results.sort(key=lambda x: x.score, reverse=True)

        return results

    def get_processa_info(self, query: str = "Quem √© a Processa Sistemas") -> Dict[str, Any]:
        """Search, re-rank and summarise information about Processa.

        Args:
            query: Question to search for.

        Returns:
            A dictionary whose ``"status"`` is ``"success"``, ``"no_results"``
            or ``"error"``. Every variant carries ``"query"`` and
            ``"timestamp"``; the success variant adds ``"company_info"`` and
            ``"search_metadata"``, the other two add ``"message"``.
        """
        try:
            results = self.search_qdrant(query, limit=self.top_k)

            if not results:
                return {
                    "status": "no_results",
                    "message": "Nenhuma informa√ß√£o encontrada sobre a Processa",
                    "query": query,
                    "timestamp": datetime.now().isoformat()
                }

            results = self.rerank_results(query, results)

            # Only the five best hits feed the summary
            top_results = results[:5]

            return {
                "status": "success",
                "query": query,
                "company_info": {
                    "name": "Processa Sistemas",
                    "description": self._extract_description(top_results),
                    "key_points": self._extract_key_points(top_results),
                    "sources": self._extract_sources(top_results)
                },
                "search_metadata": {
                    "total_results": len(results),
                    # results is non-empty here, see the early return above
                    "top_score": float(results[0].score),
                    "collection": QDRANT_COLLECTION,
                    "model": EMBEDDING_MODEL
                },
                "timestamp": datetime.now().isoformat()
            }

        except Exception as e:
            # Deliberate catch-all: failures are reported as an error payload
            # instead of propagating to the caller.
            return {
                "status": "error",
                "message": str(e),
                "query": query,
                "timestamp": datetime.now().isoformat()
            }

    def _extract_description(self, results: List[SearchResult]) -> str:
        """Return the text of the first result, capped at 500 characters.

        Longer texts are cut to 497 characters plus ``"..."``. An empty
        ``results`` list yields a fixed placeholder string.
        """
        if not results:
            return "Informa√ß√£o n√£o dispon√≠vel"

        main_text = results[0].text

        if len(main_text) > 500:
            main_text = main_text[:497] + "..."

        return main_text

    def _extract_key_points(self, results: List[SearchResult]) -> List[str]:
        """Collect up to five distinct leading sentences from the results.

        For each result the text is split on ``'.'`` and only the first two
        pieces are considered; a piece is kept when, after stripping, it is
        longer than 20 characters and has not been seen before.
        """
        key_points = []
        seen = set()

        for result in results:
            for sentence in result.text.split('.')[:2]:
                sentence = sentence.strip()
                if sentence and len(sentence) > 20 and sentence not in seen:
                    key_points.append(sentence)
                    seen.add(sentence)
                    if len(key_points) >= 5:
                        return key_points

        return key_points

    def _extract_sources(self, results: List[SearchResult]) -> List[Dict[str, Any]]:
        """List the distinct source documents among the first five results.

        Each entry holds the document name, the score of the first result
        seen for that document, and that result's metadata.
        """
        sources = []
        seen_sources = set()

        for result in results[:5]:
            source_name = result.source
            if source_name not in seen_sources:
                sources.append({
                    "document": source_name,
                    "relevance_score": float(result.score),
                    "metadata": result.metadata
                })
                seen_sources.add(source_name)

        return sources

## 3️⃣ Funções para Integração com API

In [None]:
# Module-level cache so the client and model are only built once
_rag_instance = None

def get_rag_instance() -> ProcessaInfoRAG:
    """Return the shared ProcessaInfoRAG, creating it on the first call."""
    global _rag_instance
    if _rag_instance is not None:
        return _rag_instance
    _rag_instance = ProcessaInfoRAG()
    return _rag_instance

def get_processa_info_api(custom_query: Optional[str] = None) -> Dict[str, Any]:
    """API entry point: look up information about Processa.

    Args:
        custom_query: Question to ask. Any falsy value (``None`` or an empty
            string) selects the built-in default question.

    Returns:
        The dictionary produced by ``ProcessaInfoRAG.get_processa_info``.
    """
    effective_query = custom_query if custom_query else "Quem √© a Processa Sistemas"
    return get_rag_instance().get_processa_info(effective_query)

def search_processa_documents(query: str, limit: int = 5) -> Dict[str, Any]:
    """Search the collection and return the best-ranked snippets.

    Twice ``limit`` hits are requested so re-ranking has spare candidates;
    only the first ``limit`` after re-ranking are returned.

    Args:
        query: Free-text query.
        limit: Maximum number of results in the response.

    Returns:
        On success, a dictionary with ``"status": "success"``, the query, a
        ``"results"`` list (text cut to 300 characters plus ``"..."`` when
        longer), ``"total_found"`` and a timestamp. If searching or
        re-ranking raises, a dictionary with ``"status": "error"`` and the
        exception message.
    """
    rag = get_rag_instance()

    try:
        candidates = rag.search_qdrant(query, limit=limit * 2)
        ranked = rag.rerank_results(query, candidates)

        formatted = []
        for item in ranked[:limit]:
            snippet = item.text
            if len(snippet) > 300:
                snippet = snippet[:300] + "..."
            formatted.append({
                "text": snippet,
                "score": float(item.score),
                "source": item.source,
                "metadata": item.metadata
            })

        return {
            "status": "success",
            "query": query,
            "results": formatted,
            "total_found": len(ranked),
            "timestamp": datetime.now().isoformat()
        }

    except Exception as exc:
        return {
            "status": "error",
            "message": str(exc),
            "query": query,
            "timestamp": datetime.now().isoformat()
        }

## 4️⃣ Teste do Sistema

In [None]:
def test_processa_rag():
    """Smoke-test both API helpers and print a readable report."""
    divider = "=" * 60

    print("üß™ Testando RAG Processa Info\n")
    print(divider)

    # Check 1: default query through get_processa_info_api
    print("\nüìç Teste 1: Query padr√£o")
    result = get_processa_info_api()

    if result['status'] != 'success':
        print(f"‚ùå Erro: {result.get('message', 'Desconhecido')}")
    else:
        info = result['company_info']
        print("‚úÖ Busca realizada com sucesso!")
        print("\nüìù Descri√ß√£o encontrada:")
        print(info['description'][:200] + "...")
        print(f"\nüéØ Pontos principais: {len(info['key_points'])} encontrados")
        print(f"üìö Fontes: {len(info['sources'])} documentos")

    # Check 2: custom query through search_processa_documents
    print("\n" + divider)
    print("\nüìç Teste 2: Query customizada")
    custom_result = search_processa_documents("servi√ßos da Processa", limit=3)

    if custom_result['status'] != 'success':
        print(f"‚ùå Erro: {custom_result.get('message', 'Desconhecido')}")
    else:
        hits = custom_result['results']
        print("‚úÖ Busca customizada realizada!")
        print(f"\nüìä Resultados encontrados: {custom_result['total_found']}")
        print(f"üîù Top {len(hits)} resultados retornados")

        if hits:
            best = hits[0]
            print(f"\nü•á Melhor resultado (score: {best['score']:.3f}):")
            print(best['text'][:150] + "...")

    print("\n" + divider)
    print("\n‚ú® Testes conclu√≠dos!")

# Run the smoke test when this file is executed as the main module
if __name__ == "__main__":
    test_processa_rag()

## 5️⃣ Exportação de Funções para API

As funções abaixo são exportadas para uso no notebook `rest-api.ipynb`:

In [None]:
# Public names intended for the API layer
__all__ = ['get_processa_info_api', 'search_processa_documents', 'ProcessaInfoRAG']

# Load banner listing the available entry points
print(
    "‚úÖ Notebook RAG Processa Info carregado com sucesso!",
    "\nüìå Fun√ß√µes dispon√≠veis:",
    "  - get_processa_info_api(): Obt√©m informa√ß√µes sobre a Processa",
    "  - search_processa_documents(query, limit): Busca gen√©rica em documentos",
    "\nüîó Para integrar com API, importe este notebook no rest-api.ipynb",
    sep="\n",
)