In [10]:
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from dotenv import load_dotenv
import os

In [2]:
from docling_core.types.doc import ImageRefMode

In [None]:
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from langchain_openai import OpenAIEmbeddings
from langchain_milvus import Milvus
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
load_dotenv()

True

In [5]:
_log = logging.getLogger(__name__)

In [6]:
USE_V2 = True
USE_LEGACY = False

In [7]:
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            if USE_V2:
                conv_res.document.save_as_markdown(
                    output_dir / f"{doc_filename}.md",
                    image_mode=ImageRefMode.PLACEHOLDER,
                )

                # Export Docling document format to markdown:
                with (output_dir / f"{doc_filename}.md").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown())
                    
                _log.info(f"Saved: {doc_filename}.md")

            if USE_LEGACY:
                
                # Export Markdown format:
                with (output_dir / f"{doc_filename}.legacy.md").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_markdown())

                _log.info(f"Saved: {doc_filename}.md")
                
        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count

In [8]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./manuals/Volkswagen_Polo_2025.pdf"),
        Path("./manuals/Fiat_Argo_2023.pdf")
    ]

    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = False

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options, backend=DoclingParseV4DocumentBackend
            )
        }
    )

    start_time = time.time()

    conv_results = doc_converter.convert_all(
        input_doc_paths,
        raises_on_error=False,  # to let conversion run through all and examine results at the end
    )
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("markdown_manuals")
    )

    end_time = time.time() - start_time

    _log.info(f"Document conversion complete in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )

In [9]:
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.pipeline.base_pipeline:Processing document Volkswagen_Polo_2025.pdf
INFO:docling.document_converter:Finished converting document Volkswagen_Polo_2025.pdf in 312.05 sec.
INFO:__main__:Saved: Volkswagen_Polo_2025.md
INFO:docling.pipeline.base_pipeline:Proces

In [11]:
markdown_paths = [
    "./markdown_manuals/Volkswagen_Polo_2025.md",
    "./markdown_manuals/Fiat_Argo_2023.md",
]

In [12]:
all_documents = []

for markdown_path in markdown_paths:
    _log.info(f"--- Processando arquivo: {markdown_path} ---")

    if not os.path.exists(markdown_path):
        _log.warning(f"Arquivo não encontrado, pulando: {markdown_path}")
        continue

    filename = Path(markdown_path).stem
    try:
        brand, model, year = filename.split('_')
        _log.info(f"Metadados extraídos: Marca={brand}, Modelo={model}, Ano={year}")
    except ValueError:
        _log.error(f"O nome do arquivo '{filename}.md' não segue o padrão 'marca_modelo_ano'. Pulando.")
        continue

    loader = UnstructuredMarkdownLoader(markdown_path, mode="single")
    docs   = loader.load()

    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "H1"), ("##", "H2"), ("###", "H3")],
        strip_headers=False
    )
    header_chunks = header_splitter.split_text(docs[0].page_content)

    char_splitter = RecursiveCharacterTextSplitter(
        chunk_size    = 800,
        chunk_overlap = 100
    )

    final_chunks_for_file = []
    for chunk in header_chunks:
        chunk.metadata['brand'] = brand
        chunk.metadata['model'] = model
        chunk.metadata['year'] = year
        chunk.metadata['source'] = filename

        if len(chunk.page_content) > 1000:
            final_chunks_for_file.extend(char_splitter.split_documents([chunk]))
        else:
            final_chunks_for_file.append(chunk)
    
    all_documents.extend(final_chunks_for_file)
    _log.info(f"Arquivo processado. {len(final_chunks_for_file)} chunks foram criados e adicionados.")

if not all_documents:
    _log.warning("Nenhum documento para ingerir. Finalizando o script.")
else:
    _log.info(f"Total de {len(all_documents)} chunks de todos os arquivos prontos para ingestão.")
    
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    MILVUS_HOST = "localhost"
    MILVUS_PORT = "19530"
    MILVUS_COLLECTION_NAME = "manuals"

    milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"
    connection_args = {"uri": milvus_uri}

    vectorstore = Milvus.from_documents(
        documents=all_documents,
        collection_name=MILVUS_COLLECTION_NAME,
        embedding=embeddings,
        connection_args=connection_args,
        auto_id=True,
        consistency_level="Strong",
        search_params={"metric_type": "L2", "params": {"nprobe": 10}}
    )

    _log.info("Todos os documentos foram ingeridos no Milvus com sucesso!")

INFO:__main__:--- Processando arquivo: ./markdown_manuals/Volkswagen_Polo_2025.md ---
INFO:__main__:Metadados extraídos: Marca=Volkswagen, Modelo=Polo, Ano=2025
INFO:__main__:Arquivo processado. 1301 chunks foram criados e adicionados.
INFO:__main__:--- Processando arquivo: ./markdown_manuals/Fiat_Argo_2023.md ---
INFO:__main__:Metadados extraídos: Marca=Fiat, Modelo=Argo, Ano=2023
INFO:__main__:Arquivo processado. 644 chunks foram criados e adicionados.
INFO:__main__:Total de 1945 chunks de todos os arquivos prontos para ingestão.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Todos os documentos foram ingeridos no Milvus com sucesso!


In [13]:
vectorstore = Milvus(
    collection_name="manuals",
    embedding_function=embeddings,
    connection_args={"host": "localhost", "port": "19530"},
    auto_id=True,
    consistency_level="Strong",
    search_params={"metric_type": "L2", "params": {"nprobe": 10}}
)