# 🧠 GERAÇÃO DE EMBEDDINGS

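Converte os chunks de texto em vetores (embeddings) usando o modelo BAAI/bge-m3 (configurável pela variável de ambiente `EMBEDDING_MODEL`). Lê `pipeline_data/chunks/chunks.jsonl` e grava o resultado em `pipeline_data/embeddings/embeddings.jsonl`.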

In [1]:
import os
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
import numpy as np

# Configuration: both values can be overridden through environment variables;
# the second argument of os.getenv is the fallback used when the variable is unset.
MODEL_NAME = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "32"))

# Pipeline directories (relative to the current working directory).
chunks_dir = Path("pipeline_data/chunks")
embeddings_dir = Path("pipeline_data/embeddings")
embeddings_dir.mkdir(parents=True, exist_ok=True)

# Start from a clean output directory: delete every regular file located
# directly inside it. Subdirectories (and anything they contain) are kept.
stale_files = [entry for entry in embeddings_dir.glob("*") if entry.is_file()]
for stale_file in stale_files:
    stale_file.unlink()

print(f"Modelo: {MODEL_NAME}")
print(f"Batch size: {BATCH_SIZE}")

Modelo: BAAI/bge-m3
Batch size: 32


In [2]:
# Load the embedding model identified by MODEL_NAME.
print("Carregando modelo de embeddings...")
model = SentenceTransformer(MODEL_NAME)

# Vector size reported by the model; reused later in the statistics summary.
embedding_dim = model.get_sentence_embedding_dimension()

print(
    f"✅ Modelo carregado: {MODEL_NAME}",
    f"Dimensões: {embedding_dim}",
    sep="\n",
)

Carregando modelo de embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

✅ Modelo carregado: BAAI/bge-m3
Dimensões: 1024


In [4]:
# Load chunks from the JSONL file (one JSON object per line).
chunks_file = chunks_dir / "chunks.jsonl"

if not chunks_file.exists():
    raise FileNotFoundError(f"Arquivo de chunks não encontrado: {chunks_file}")

chunks = []
with open(chunks_file, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        # Skip blank lines (e.g. an empty line at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        try:
            chunks.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Re-raise the same exception type, adding the file and line number
            # so a malformed record can be located quickly.
            raise json.JSONDecodeError(
                f"{exc.msg} ({chunks_file}, linha {line_number})", exc.doc, exc.pos
            ) from exc

print(f"Chunks carregados: {len(chunks)}")

# Show a few examples
for i, chunk in enumerate(chunks[:3]):
    # Collapse newlines and repeated whitespace so each preview stays on a
    # single line of the numbered list.
    flat_text = " ".join(chunk["text"].split())
    preview = (flat_text[:50] + "...") if len(flat_text) > 50 else flat_text
    print(f"  {i+1}. {preview}")

Chunks carregados: 322
  1. # Visão Geral do Self Checkout

## Introdução

Est...
  2. O objetivo desta documentação é descrever o fluxo ...
  3. - Processo de identificação do cliente via CPF ou ...


In [5]:
# Generate embeddings
print("Gerando embeddings...")

# Pull the raw text out of every chunk, keeping the original chunk order.
texts = [entry["text"] for entry in chunks]

# Encode all texts in batches of BATCH_SIZE, showing a progress bar and asking
# the model for normalized vectors.
encode_options = {
    "batch_size": BATCH_SIZE,
    "show_progress_bar": True,
    "normalize_embeddings": True,
}
embeddings = model.encode(texts, **encode_options)

print(f"✅ Embeddings gerados: {len(embeddings)}")
print(f"Formato: {embeddings.shape}")

Gerando embeddings...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

✅ Embeddings gerados: 322
Formato: (322, 1024)


In [6]:
# Combine each chunk's metadata with its embedding vector.
#
# zip() stops silently at the shorter input, so if `chunks` and `embeddings`
# come from different runs (e.g. cells re-executed out of order) records would
# be dropped without any error. Fail loudly instead.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"Número de chunks ({len(chunks)}) difere do número de embeddings ({len(embeddings)})"
    )

chunks_with_embeddings = []

for chunk, embedding in zip(chunks, embeddings):
    chunk_with_embedding = {
        "chunk_id": chunk["chunk_id"],
        "source_document": chunk["source_document"],
        "chunk_index": chunk["chunk_index"],
        "text": chunk["text"],
        "char_count": chunk["char_count"],
        # .tolist() turns the vector into a plain Python list so it can be
        # serialized with json.dumps later.
        "embedding": embedding.tolist(),
        "embedding_model": MODEL_NAME,
        "embedding_dimensions": len(embedding)
    }
    chunks_with_embeddings.append(chunk_with_embedding)

print(f"Chunks com embeddings: {len(chunks_with_embeddings)}")

Chunks com embeddings: 322


In [7]:
# Save embeddings as JSONL: one JSON object per line, non-ASCII characters kept as-is.
embeddings_file = embeddings_dir / "embeddings.jsonl"

with open(embeddings_file, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + "\n"
        for record in chunks_with_embeddings
    )

print(f"✅ Embeddings salvos: {embeddings_file}")

# Statistics
n_embeddings = len(chunks_with_embeddings)

# Estimated size of the raw vectors if stored as float32 (4 bytes per value).
# This is computed from the counts only; it is not the size of the JSONL file.
bytes_per_value = 4
total_size_mb = (n_embeddings * embedding_dim * bytes_per_value) / (1024 * 1024)

# Mean Euclidean norm across all stored vectors.
norms = [np.linalg.norm(record["embedding"]) for record in chunks_with_embeddings]
avg_magnitude = np.mean(norms)

print(f"\n📊 Estatísticas:")
print(f"  Total embeddings: {n_embeddings}")
print(f"  Dimensões: {embedding_dim}")
print(f"  Tamanho total: {total_size_mb:.1f} MB")
print(f"  Magnitude média: {avg_magnitude:.3f}")
print(f"  Modelo: {MODEL_NAME}")

✅ Embeddings salvos: pipeline_data/embeddings/embeddings.jsonl

📊 Estatísticas:
  Total embeddings: 322
  Dimensões: 1024
  Tamanho total: 1.3 MB
  Magnitude média: 1.000
  Modelo: BAAI/bge-m3
