# 💾 ARMAZENAMENTO QDRANT

Insere embeddings no banco vetorial Qdrant.

In [None]:
import os
import json
from pathlib import Path
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid

# Configuration: connection settings are read from the environment.
# The server URL and the collection name fall back to defaults; the API
# key has no default.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant.codrstudio.dev:6333")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
COLLECTION_NAME = os.environ.get("QDRANT_COLLECTION", "nic")

# Stop right away when the key is missing or empty.
if not QDRANT_API_KEY:
    raise ValueError("QDRANT_API_KEY é obrigatório")

# Directory that holds the embeddings produced for this pipeline
embeddings_dir = Path("pipeline_data") / "embeddings"

# Echo the effective settings. Only the last four characters of the key
# are shown, and nothing of it when the key has four characters or fewer.
print(f"Qdrant URL: {QDRANT_URL}")
print(f"Collection: {COLLECTION_NAME}")
print(f"API Key: ***{QDRANT_API_KEY[-4:] if len(QDRANT_API_KEY) > 4 else '***'}")

In [None]:
# Open a client against the configured Qdrant server
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Connection check: list the collections. The success message below is
# only reached when this request does not raise.
collections = client.get_collections()
print(f"✅ Conectado ao Qdrant")
print(f"Collections existentes: {len(collections.collections)}")

# One line per collection known to the server
for col in collections.collections:
    print(f"  - {col.name}")

In [None]:
# Load embeddings from the JSONL file (one JSON object per line)
embeddings_file = embeddings_dir / "embeddings.jsonl"

if not embeddings_file.exists():
    raise FileNotFoundError(f"Arquivo de embeddings não encontrado: {embeddings_file}")

embeddings_data = []
with open(embeddings_file, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # not JSON documents; skip them instead of failing in json.loads.
        if not line.strip():
            continue
        try:
            embeddings_data.append(json.loads(line))
        except json.JSONDecodeError as err:
            # Report which line is malformed so the file can be repaired.
            # JSONDecodeError is itself a ValueError, so the exception
            # family seen by callers is unchanged.
            raise ValueError(
                f"JSON inválido em {embeddings_file}, linha {line_number}: {err}"
            ) from err

print(f"Embeddings carregados: {len(embeddings_data)}")

# Check dimensions
if not embeddings_data:
    raise ValueError("Nenhum embedding encontrado")

# The first record defines the expected vector length.
embedding_dim = len(embeddings_data[0]["embedding"])

# Every vector must have that same length; fail here, with the position
# of the offending record, rather than later during insertion.
for position, record in enumerate(embeddings_data):
    if len(record["embedding"]) != embedding_dim:
        raise ValueError(
            f"Embedding {position} tem dimensão {len(record['embedding'])}, "
            f"esperado {embedding_dim}"
        )

print(f"Dimensões do embedding: {embedding_dim}")

In [None]:
# Create or verify the collection.
#
# Existence is decided from the server's collection listing instead of
# catching every exception raised by get_collection: that way a failed
# request (connection, authentication, ...) propagates as an error rather
# than being mistaken for "collection does not exist".
existing_names = {col.name for col in client.get_collections().collections}
collection_exists = COLLECTION_NAME in existing_names

if collection_exists:
    collection_info = client.get_collection(COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' já existe")
    print(f"  Pontos: {collection_info.points_count}")
    # getattr: tolerate client versions whose collection info object does
    # not carry a vectors_count field (prints None in that case).
    print(f"  Vetores: {getattr(collection_info, 'vectors_count', None)}")

    # An existing collection must accept vectors of the loaded dimension.
    # Only the single unnamed-vector layout is checked; a collection with
    # named vectors exposes a dict here and is left to the server to validate.
    configured_vectors = collection_info.config.params.vectors
    if (
        isinstance(configured_vectors, VectorParams)
        and configured_vectors.size != embedding_dim
    ):
        raise ValueError(
            f"Collection '{COLLECTION_NAME}' usa vetores de dimensão "
            f"{configured_vectors.size}, mas os embeddings têm dimensão {embedding_dim}"
        )

    print(f"✅ Collection '{COLLECTION_NAME}' verificada")
else:
    print(f"Collection '{COLLECTION_NAME}' não existe, criando...")
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=embedding_dim,
            distance=Distance.COSINE
        )
    )
    print(f"✅ Collection '{COLLECTION_NAME}' criada")

In [None]:
# Prepare points for insertion.
#
# Point ids are derived deterministically (uuid5) from the chunk identity
# instead of being random (uuid4): the same chunk always maps to the same
# id, so re-running the notebook makes the upsert overwrite the existing
# point rather than add a duplicate.
# NOTE(review): assumes (source_document, chunk_index, chunk_id) uniquely
# identifies a chunk — confirm against the stage that writes embeddings.jsonl.
points = []

for record in embeddings_data:
    identity = (
        f"{record['source_document']}:{record['chunk_index']}:{record['chunk_id']}"
    )
    points.append(
        PointStruct(
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, identity)),
            vector=record["embedding"],
            payload={
                "chunk_id": record["chunk_id"],
                "source_document": record["source_document"],
                "chunk_index": record["chunk_index"],
                "text": record["text"],
                "char_count": record["char_count"],
                "embedding_model": record["embedding_model"],
            },
        )
    )

# Two records with the same identity would share an id and overwrite each
# other on insertion; make that visible instead of losing data silently.
unique_ids = {point.id for point in points}
if len(unique_ids) != len(points):
    print(f"⚠️ {len(points) - len(unique_ids)} pontos com ID duplicado serão sobrescritos")

print(f"Pontos preparados: {len(points)}")

In [None]:
# Insert points in batches
BATCH_SIZE = 100
total_inserted = 0
failed_batches = []  # 1-based numbers of the batches whose upsert raised
total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

print(f"Inserindo {len(points)} pontos em {total_batches} lotes...")

for batch_num, start in enumerate(range(0, len(points), BATCH_SIZE), start=1):
    batch = points[start:start + BATCH_SIZE]

    try:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=batch
        )
    except Exception as e:
        # Best-effort: a failing batch does not stop the remaining ones,
        # but it is recorded so the final summary does not claim success.
        failed_batches.append(batch_num)
        print(f"  ❌ Erro no lote {batch_num}: {str(e)}")
    else:
        total_inserted += len(batch)
        print(f"  Lote {batch_num}/{total_batches}: {len(batch)} pontos inseridos")

# Report the real outcome: the success line is printed only when every
# batch went through.
if failed_batches:
    not_inserted = len(points) - total_inserted
    print(
        f"\n⚠️ Inserção concluída com falhas: {total_inserted} pontos inseridos, "
        f"{not_inserted} não inseridos (lotes com erro: {failed_batches})"
    )
else:
    print(f"\n✅ Inserção concluída: {total_inserted} pontos")

In [None]:
# Check the final state of the collection
final_info = client.get_collection(COLLECTION_NAME)

print(f"\n📊 Estado final da collection:")
print(f"  Nome: {COLLECTION_NAME}")
print(f"  Pontos: {final_info.points_count}")
# getattr: tolerate client versions whose collection info object does not
# carry a vectors_count field (prints None in that case).
print(f"  Vetores: {getattr(final_info, 'vectors_count', None)}")
print(f"  Status: {final_info.status}")

# Simple search test.
# Truthiness covers both an empty collection (0) and a missing count
# (None), which a `> 0` comparison would reject with a TypeError.
if final_info.points_count:
    print(f"\n🔍 Testando busca...")

    # Use the first loaded embedding as the test query
    test_vector = embeddings_data[0]["embedding"]

    search_result = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=test_vector,
        limit=3
    )

    print(f"  Resultados encontrados: {len(search_result)}")
    for i, result in enumerate(search_result):
        score = result.score
        # A hit may come back without a payload; treat that as empty.
        payload = result.payload or {}
        doc = payload.get("source_document", "unknown")
        text_preview = payload.get("text", "")[:50] + "..."
        print(f"    {i+1}. Score: {score:.3f}, Doc: {doc}")
        print(f"       Text: {text_preview}")

    # Only claim the search works when it actually returned something.
    if search_result:
        print(f"✅ Busca funcionando corretamente")
    else:
        print(f"⚠️ Busca não retornou resultados")
else:
    print(f"⚠️ Collection vazia - nenhum ponto inserido")