# ⚙️ PROCESSAMENTO DOCLING

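Extrai o texto dos documentos em `pipeline_data/documents` usando Docling e salva o resultado em `pipeline_data/processed`. Atenção: os arquivos já existentes em `pipeline_data/processed` são apagados antes de cada execução.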

In [None]:
from pathlib import Path
from docling.document_converter import DocumentConverter

# Input and output directories, relative to the current working directory.
docs_dir = Path("pipeline_data/documents")
processed_dir = Path("pipeline_data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Start from a clean output directory: remove files left by a previous run.
# Subdirectories are left untouched. The listing is materialised first so
# entries are not deleted while the directory is still being scanned.
for stale_file in list(processed_dir.glob("*")):
    if stale_file.is_file():
        stale_file.unlink()

# Collect the regular files to process. glob() yields entries in arbitrary
# filesystem order, so sort them to make the listing and the processing
# order reproducible between runs.
documents = sorted(path for path in docs_dir.glob("*") if path.is_file())

print(f"Documentos encontrados: {len(documents)}")
for doc in documents:
    print(f"  {doc.name}")

Documentos encontrados: 0


In [None]:
# Convert every collected document to plain text with Docling and write one
# text file per input into processed_dir.
converter = DocumentConverter()
processed_count = 0
errors = []
# Output file names already written in this run. Used so that two inputs
# sharing a stem (e.g. report.pdf and report.docx) do not silently overwrite
# each other's extracted text.
used_output_names = set()

for doc_path in documents:
    try:
        print(f"Processando: {doc_path.name}")

        # Convert the document and export its content as plain text.
        result = converter.convert(doc_path)
        text_content = result.document.export_to_text()

        # Default name is "<stem>_text.txt". Only when that name was already
        # produced by another input is the original extension (and, if still
        # needed, a counter) folded in, so non-clashing inputs keep the
        # same file names as before.
        output_stem = doc_path.stem
        output_name = f"{output_stem}_text.txt"
        if output_name in used_output_names:
            extension = doc_path.suffix.lstrip(".")
            if extension:
                output_stem = f"{output_stem}_{extension}"
            output_name = f"{output_stem}_text.txt"
            attempt = 2
            while output_name in used_output_names:
                output_name = f"{output_stem}_{attempt}_text.txt"
                attempt += 1

        # Save the extracted text; reserve the name only after a successful
        # write so a failed document does not block the name for others.
        text_file = processed_dir / output_name
        text_file.write_text(text_content, encoding="utf-8")
        used_output_names.add(output_name)

        processed_count += 1
        print(f"  ✅ Texto extraído: {len(text_content)} caracteres")

    except Exception as e:
        # Deliberate best-effort boundary: one failing document must not
        # abort the batch. The failure is recorded and reported below.
        error_msg = f"{doc_path.name}: {e}"
        errors.append(error_msg)
        print(f"  ❌ Erro: {e}")

# Final summary of the run.
print("\n📊 Resumo:")
print(f"  Processados: {processed_count}")
print(f"  Erros: {len(errors)}")

if errors:
    print("\n❌ Erros:")
    for error in errors:
        print(f"  {error}")