In [143]:
import re
import os
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from pprint import pprint

In [149]:
DATA_PATH = "/home/enzo/Desktop/riza/data/raw"
MANUAL_COMPLETO = os.path.join(DATA_PATH, "galaxy_z_flip_7.pdf")
MANUAL_TESTE = os.path.join(DATA_PATH, "galaxy_z_flip_7_teste.pdf")

In [156]:
pipeline_options = PdfPipelineOptions()


converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})

result = converter.convert(MANUAL_TESTE)



2025-09-29 12:36:04,689 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-29 12:36:04,697 - INFO - Going to convert document batch...
2025-09-29 12:36:04,698 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-29 12:36:04,699 - INFO - Accelerator device: 'cpu'
2025-09-29 12:36:06,420 - INFO - Accelerator device: 'cpu'
2025-09-29 12:36:07,667 - INFO - Accelerator device: 'cpu'
2025-09-29 12:36:07,990 - INFO - Processing document galaxy_z_flip_7_teste.pdf
2025-09-29 12:37:12,695 - INFO - Finished converting document galaxy_z_flip_7_teste.pdf in 68.01 sec.


In [163]:
result_md = result.document.export_to_markdown()

In [186]:
pprint(result_md[:200])

('## USER GUIDE\n'
 '\n'
 'SM-F766B SM-F761B\n'
 '\n'
 '## Table of Contents\n'
 '\n'
 '| Getting started   | Getting '
 'started                                          | 87    | Multiwindow(Using '
 'multiple apps at once)   |\n'
 '|---------')


In [196]:
headers_to_split_on = [("##", "Header 2"), ("###", "Header 3")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

md_header_splits = markdown_splitter.split_text(result_md)

In [197]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)

intermediate_chunks = []
for split in md_header_splits:
    h2 = split.metadata.get('Header 2', '')
    h3 = split.metadata.get('Header 3', '')
   
    prefix = ""
    if h2:
        if h2.lower() not in split.page_content[:100].lower():
            prefix = f"[{h2}]"
            if h3:
                prefix += f" [{h3}]"
            prefix += "\n\n"
    

    content_with_context = prefix + split.page_content

    if len(split.page_content) > 600:
        split.page_content = content_with_context
        sub_chunks = text_splitter.split_documents([split])
        intermediate_chunks.extend(sub_chunks)
    else:
        split.page_content = content_with_context
        intermediate_chunks.append(split)

In [202]:
for chunk in intermediate_chunks:
    content = chunk.page_content
    
    # Remove comentários HTML
    content = re.sub(r'<!--.*?-->', '', content)
    
    # Remove /C seguido de dígitos
    content = re.sub(r'/C\d+', '', content)
    
    # Remove parênteses VAZIOS (mas mantém os com conteúdo)
    content = re.sub(r'\(\s*\)', '', content)
    
    # Normaliza espaços múltiplos
    content = re.sub(r'\s+', ' ', content)
    
    # Remove linhas vazias extras
    content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
    
    chunk.page_content = content.strip()

In [203]:
intermediate_chunks = [c for c in intermediate_chunks 
                      if len(c.page_content.strip()) > 50]

In [204]:
intermediate_chunks

[Document(metadata={'Header 2': 'Table of Contents'}, page_content='| Getting started | Getting started | 87 | Multiwindow(Using multiple apps at once) | |-------------------|----------------------------------------------------------|-------|--------------------------------------------| | 5 | Device layout and functions | 88 | Samsung Internet | | 12 | Charging the battery | 89 | SamsungWallet |'),
 Document(metadata={'Header 2': 'Table of Contents'}, page_content='| 18 | Nano-SIM card and eSIM | 92 | Samsung Health | | 21 | Turning the device on and off | 92 | Samsung Notes | | 22 | Initial setup | 93 | Samsung Members | | 22 | Using networks | 93 | Samsung Kids |'),
 Document(metadata={'Header 2': 'Table of Contents'}, page_content='| 22 | Samsung account | 94 | Samsung Global Goals | | 23 | Transferring data fromyourprevious device (Smart Switch) | 94 | Samsung TVPlus | | 24 | Understanding the screen | 94 | Samsung Find | | 34 | Notification panel | 94 | Samsung Shop |'),
 Document