# 🔪 SEGMENTAÇÃO EM CHUNKS

Divide o texto em segmentos para processamento de embeddings.

In [None]:
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

# Configuration
# Both values can be overridden through environment variables. They are
# character counts: the splitter configured in this notebook measures length
# with len(), not with a tokenizer.
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "500"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "100"))

# Fail fast on nonsensical values, before the output directory is wiped below.
if CHUNK_SIZE <= 0 or CHUNK_OVERLAP < 0 or CHUNK_OVERLAP > CHUNK_SIZE:
    raise ValueError(
        f"Configuração inválida: CHUNK_SIZE={CHUNK_SIZE}, "
        f"CHUNK_OVERLAP={CHUNK_OVERLAP}"
    )

# Directories: input texts are read from processed_dir, output goes to chunks_dir.
processed_dir = Path("pipeline_data/processed")
chunks_dir = Path("pipeline_data/chunks")
chunks_dir.mkdir(parents=True, exist_ok=True)

# Clear the chunks directory so output from a previous run does not linger.
# Only regular files directly inside the directory are removed.
for f in chunks_dir.glob("*"):
    if f.is_file():
        f.unlink()

# Report the unit as characters, matching how the sizes are actually measured.
print(f"Chunk size: {CHUNK_SIZE} caracteres")
print(f"Overlap: {CHUNK_OVERLAP} caracteres")

In [None]:
# Configure the splitter.
# Sizes are measured with len(), i.e. in characters. The separators are listed
# from coarsest (blank line) to finest (empty string = any position).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# Find the text files to split.
# glob() yields entries in an unspecified order; sorting keeps the processing
# order (and therefore the chunk ids assigned later) reproducible across runs.
text_files = sorted(processed_dir.glob("*_text.txt"))
print(f"Arquivos de texto encontrados: {len(text_files)}")

for text_file in text_files:
    print(f"  {text_file.name}")

In [None]:
# Process each file.
# chunk_id is a running identifier across all files; chunk_index restarts at 0
# for every source document.
all_chunks = []
chunk_id = 0
text_suffix = "_text"

for text_file in text_files:
    try:
        print(f"Processando: {text_file.name}")

        # Read the text
        with open(text_file, "r", encoding="utf-8") as f:
            text_content = f.read()

        if not text_content.strip():
            print(f"  ⚠️ Arquivo vazio: {text_file.name}")
            continue

        # Derive the document name by dropping only the trailing "_text"
        # marker; a plain str.replace would also remove "_text" occurring
        # elsewhere in the name.
        stem = text_file.stem
        if stem.endswith(text_suffix):
            source_document = stem[: -len(text_suffix)]
        else:
            source_document = stem

        # Split into chunks
        chunks = text_splitter.split_text(text_content)

        # Build one record per chunk
        for i, chunk_text in enumerate(chunks):
            stripped_text = chunk_text.strip()
            chunk_data = {
                "chunk_id": chunk_id,
                "source_document": source_document,
                "chunk_index": i,
                "text": stripped_text,
                # Count the stored text so char_count always matches "text".
                "char_count": len(stripped_text),
            }

            all_chunks.append(chunk_data)
            chunk_id += 1

        print(f"  ✅ {len(chunks)} chunks criados")

    except Exception as e:
        # Deliberately best-effort: a failure in one file is reported and the
        # remaining files are still processed.
        print(f"  ❌ Erro: {str(e)}")

print(f"\n📊 Total de chunks: {len(all_chunks)}")

In [None]:
# Save the chunks as JSON Lines: one JSON object per line, non-ASCII kept as-is.
chunks_file = chunks_dir / "chunks.jsonl"

with open(chunks_file, "w", encoding="utf-8") as out:
    out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in all_chunks
    )

print(f"✅ Chunks salvos: {chunks_file}")

# Statistics (skipped entirely when no chunk was produced)
if all_chunks:
    sizes = [record["char_count"] for record in all_chunks]
    avg_chars = sum(sizes) / len(sizes)
    max_chars = max(sizes)
    min_chars = min(sizes)

    print("\n📊 Estatísticas:")
    print(f"  Total chunks: {len(all_chunks)}")
    print(f"  Tamanho médio: {avg_chars:.0f} caracteres")
    print(f"  Tamanho mínimo: {min_chars} caracteres")
    print(f"  Tamanho máximo: {max_chars} caracteres")

    # Show the first three chunks, truncating long texts to a 100-char preview.
    print("\n📝 Exemplos de chunks:")
    for position, record in enumerate(all_chunks[:3], start=1):
        body = record["text"]
        if len(body) > 100:
            preview = body[:100] + "..."
        else:
            preview = body
        print(f"  {position}. {preview} ({record['char_count']} chars)")