In [1]:
from pathlib import Path

In [2]:
# Updated imports to support OCR configuration
from docling.datamodel.document import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.pipeline_options import PdfPipelineOptions

In [3]:
# Anchor every path in this notebook on the kernel's working directory: the cwd
# is treated as the notebooks folder and its parent as the project root.
# NOTE(review): this only holds when the kernel is started from the notebooks
# directory — confirm before running from elsewhere.
notebook_dir: Path = Path.cwd()
PROJECT_ROOT: Path = notebook_dir.parent

In [4]:
# Echo the resolved locations so a wrong working directory is noticed before
# any long-running step starts.
print(
    f"Notebook is in: {notebook_dir}",
    f"Project root is: {PROJECT_ROOT}",
    sep="\n",
)

Notebook is in: /home/ccrs70/projects/rag1-mini/notebooks
Project root is: /home/ccrs70/projects/rag1-mini


In [5]:
# Input PDF, located under data/raw/neuroscience relative to the project root.
source_file = (
    PROJECT_ROOT / "data" / "raw" / "neuroscience"
    / "PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.pdf"
)

# Markdown target under data/processed/neuroscience. The file name is derived
# from the source (same stem, ".md" extension) instead of repeating the long
# literal, so the two names cannot drift apart.
destination_file = (
    PROJECT_ROOT / "data" / "processed" / "neuroscience"
    / source_file.with_suffix(".md").name
)

In [6]:
# 1. Define pipeline options
# Both optional stages are switched OFF for this run:
#   - do_ocr=False             -> skip OCR (fast & clean for digital docs)
#   - do_table_structure=False -> table-structure recognition is disabled; the
#                                 outputs of this run are saved with a
#                                 "_notable" suffix.
# Set do_table_structure=True if table recognition is wanted.
pipeline_options = PdfPipelineOptions(
    do_ocr=False,
    do_table_structure=False,
)

In [7]:
# 2. Build the converter, applying the pipeline options above to PDF inputs only
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [8]:
# Run the conversion. The captured log for this cell shows roughly 22 minutes
# (1327.89 s) on CPU for this PDF.
result = converter.convert(source_file)

# Surface anything other than a clean SUCCESS right here, so a partially
# converted document is not exported unnoticed by the cells below.
# NOTE(review): convert() is expected to raise on outright failures by default
# (docling's raises_on_error) — confirm for the installed version.
if result.status != ConversionStatus.SUCCESS:
    print(f"WARNING: conversion finished with status {result.status}; errors: {result.errors}")

2025-12-01 12:25:47,291 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-01 12:25:47,844 - INFO - Going to convert document batch...
2025-12-01 12:25:47,845 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 1216607fb7e04989285a12764e030fc9
2025-12-01 12:25:47,913 - INFO - Loading plugin 'docling_defaults'
2025-12-01 12:25:47,918 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-01 12:25:47,933 - INFO - Loading plugin 'docling_defaults'
2025-12-01 12:25:47,947 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-01 12:25:47,971 - INFO - Accelerator device: 'cpu'
2025-12-01 12:25:48,956 - INFO - Processing document PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.pdf
2025-12-01 12:47:55,178 - INFO - Finished converting document PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.pdf in 1327.89

In [9]:
# Markdown output path for this run (written by the next cell).
destination_file_notable = PROJECT_ROOT.joinpath(
    "data",
    "processed",
    "neuroscience",
    "PRECLEAN_Brain_and_behavior_notable.md",
)

In [10]:
# Create the output folder first: write_text() does not create missing parent
# directories, and failing here would come only after the long conversion.
destination_file_notable.parent.mkdir(parents=True, exist_ok=True)

# Export the converted document as Markdown. write_text() stays the last
# expression so the cell still displays the number of characters written.
destination_file_notable.write_text(result.document.export_to_markdown(), encoding="utf-8")

2449925

In [11]:
# 3. Grab the converted document object from the conversion result
doc = result.document

# Option A: export the document to a plain Python dict for interactive
# inspection. Nothing is printed or displayed by this cell.
# NOTE(review): presumably includes layout metadata such as bbox and page_no —
# confirm by inspecting `data`.
data = doc.export_to_dict()

In [12]:
# 4. Option B: save to a JSON file for easier inspection in a text editor.
# The path is derived from the Markdown output path (same folder and stem,
# ".json" extension) so it is anchored on PROJECT_ROOT like every other path
# here, instead of a string relative to the kernel's current directory.
doc.save_as_json(destination_file_notable.with_suffix(".json"))