In [1]:
from pathlib import Path

In [2]:
# Updated imports to support OCR configuration
from docling.datamodel.document import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.pipeline_options import PdfPipelineOptions

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Assumes the kernel's working directory is the notebook's own folder
# (<project root>/notebooks), so the project root is one level up.
notebook_dir = Path.cwd()
PROJECT_ROOT = notebook_dir.parent

In [4]:
# Sanity check: show the resolved locations before any data paths are built from them.
print(f"Notebook is in: {notebook_dir}")
print(f"Project root is: {PROJECT_ROOT}")

Notebook is in: /home/fliperbaker/projects/rag1/rag1-mini/notebooks
Project root is: /home/fliperbaker/projects/rag1/rag1-mini


In [8]:
# Input PDF (raw data) and the Markdown file the first, uncleaned export is written to.
source_file = PROJECT_ROOT.joinpath(
    "data",
    "raw",
    "neuroscience",
    "PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.pdf",
)
destination_file = PROJECT_ROOT.joinpath(
    "data",
    "processed",
    "neuroscience",
    "PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.md",
)

In [9]:
# 1. Define pipeline options for the PDF conversion.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False  # OCR off (fast & clean for digital docs)
pipeline_options.do_table_structure = False  # Table-structure recognition off (TABLE items are deleted later in this notebook)

In [10]:
# 2. Build the converter; the pipeline options are registered for the PDF input format.
_pdf_format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
}
converter = DocumentConverter(format_options=_pdf_format_options)

In [11]:
# Run the conversion. This is the slow step: the logged run below took
# about 650 s for this book with the 'cpu' accelerator device.
result = converter.convert(source_file)

2025-12-02 04:17:29,292 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-02 04:17:29,580 - INFO - Going to convert document batch...
2025-12-02 04:17:29,581 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 1216607fb7e04989285a12764e030fc9
2025-12-02 04:17:29,588 - INFO - Loading plugin 'docling_defaults'
2025-12-02 04:17:29,592 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-02 04:17:29,597 - INFO - Loading plugin 'docling_defaults'
2025-12-02 04:17:29,603 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-02 04:17:29,613 - INFO - Accelerator device: 'cpu'
2025-12-02 04:17:30,229 - INFO - Processing document PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.pdf
2025-12-02 04:28:33,683 - INFO - Finished converting document PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar.pdf in 650.93 

In [9]:
#destination_file_notable = PROJECT_ROOT / "data" / "processed" / "neuroscience" / "PRECLEAN_Brain_and_behavior_notable.md"

In [12]:
# Export the converted document to Markdown and write it to disk.
# The output directory is created first so that a fresh checkout, where
# data/processed/neuroscience may not exist yet, does not fail in write_text().
destination_file.parent.mkdir(parents=True, exist_ok=True)
destination_file.write_text(result.document.export_to_markdown(), encoding="utf-8")

2449930

In [15]:
# 2. Keep a handle on the converted document object; the cleaning cells
#    further down delete items from this same object in place.
doc = result.document

# 3. Option A: export the whole document structure as a dict for interactive
#    inspection (nothing is printed here; look at `data` in a separate cell).
data = doc.export_to_dict()

In [13]:
# JSON dump of the freshly converted (uncleaned) document.
destination_json = PROJECT_ROOT.joinpath(
    "data", "processed", "neuroscience", "PRECLEAN_Brain_and_behavior.json"
)

In [16]:
# 4. Option B: save the document as a JSON file, which is easier to inspect
#    in a text editor than the in-memory dict.
doc.save_as_json(destination_json)

## Possible labels of Docling items
Source: <https://github.com/docling-project/docling-core/blob/main/docling_core/types/doc/labels.py>
```
class DocItemLabel(str, Enum):
    """DocItemLabel."""

    CAPTION = "caption"
    CHART = "chart"
    FOOTNOTE = "footnote"
    FORMULA = "formula"
    LIST_ITEM = "list_item"
    PAGE_FOOTER = "page_footer"
    PAGE_HEADER = "page_header"
    PICTURE = "picture"
    SECTION_HEADER = "section_header"
    TABLE = "table"
    TEXT = "text"
    TITLE = "title"
    DOCUMENT_INDEX = "document_index"
    CODE = "code"
    CHECKBOX_SELECTED = "checkbox_selected"
    CHECKBOX_UNSELECTED = "checkbox_unselected"
    FORM = "form"
    KEY_VALUE_REGION = "key_value_region"
    GRADING_SCALE = "grading_scale"  # for elements in forms, questionaires representing a grading scale
    # e.g. [strongly disagree | ... | ... | strongly agree]
    # e.g. ★★☆☆☆
    HANDWRITTEN_TEXT = "handwritten_text"
    EMPTY_VALUE = "empty_value"  # used for empty value fields in fillable forms

    # Additional labels for markup-based formats (e.g. HTML, Word)
    PARAGRAPH = "paragraph"
    REFERENCE = "reference"
```

## Delete unwanted items

In [17]:
from docling.document_converter import DocumentConverter
from docling.datamodel.document import DocItemLabel

In [19]:
# 1. `doc` is the document produced by the conversion cell earlier in this notebook.

# 2. Labels whose items should be dropped from the document.
labels_to_remove = {
    DocItemLabel.CAPTION,
    DocItemLabel.FOOTNOTE,
    DocItemLabel.PAGE_FOOTER,
    DocItemLabel.PAGE_HEADER,
    DocItemLabel.TABLE,
}

# Collect the matching items up front so the tree is not modified while it is
# being iterated. Items without a `label` attribute never match.
# NOTE(review): iterate_items() is called with its default arguments; if those
# defaults restrict traversal (e.g. to body content only), page headers/footers
# may never be visited here — confirm against the docling-core documentation.
items_to_remove = [
    item
    for item, _level in doc.iterate_items()
    if getattr(item, "label", None) in labels_to_remove
]

# 3. Delete the collected items; this updates the document tree in place.
doc.delete_items(node_items=items_to_remove)

In [18]:
# Markdown export of the document after the labelled items were deleted.
destination_file_removeditems = PROJECT_ROOT.joinpath(
    "data", "processed", "neuroscience", "PRECLEAN_Brain_and_behavior_removeditems.md"
)

In [20]:
# 4. Write the cleaned document (labelled items deleted) as Markdown.
#    From here on `doc` can be passed to the RAG pipeline (e.g., chunking or export).
destination_file_removeditems.write_text(doc.export_to_markdown(), encoding="utf-8")

2312931

In [21]:
# JSON dump of the document after the labelled items were deleted.
destination_file_json_removeditems = PROJECT_ROOT.joinpath(
    "data", "processed", "neuroscience", "PRECLEAN_Brain_and_behavior_removeditems.json"
)

In [22]:
# Save the cleaned document (labelled items deleted) as JSON as well.
doc.save_as_json(destination_file_json_removeditems)

In [25]:
# 1. Helper: gather everything nested below an item.
def get_all_descendants(item):
    """Return every descendant of ``item`` as a flat list.

    The list is in depth-first order: each child is followed directly by its
    own descendants. An object that has no ``children`` attribute, or whose
    ``children`` is empty/falsy, contributes an empty list.
    """
    kids = getattr(item, "children", None)
    if not kids:
        return []

    found = []
    for kid in kids:
        # The child itself first, then its whole subtree.
        found += [kid, *get_all_descendants(kid)]
    return found

## Delete pictures and their children
Removes picture items together with any text nested inside them. No OCR is performed, but small pieces of digital text inside a figure, such as "(a)" or "(b)", make the converter create (otherwise empty) picture items that contain these text fragments. This step removes that noise.

In [27]:
# 3. Gather every picture plus everything nested below it.
items_to_remove = []   # ordered list of items to delete
seen_ids = set()       # id() of items already queued, to avoid duplicates

for item, _level in doc.iterate_items():
    # Only pictures are of interest; skip everything else.
    if getattr(item, "label", None) != DocItemLabel.PICTURE:
        continue

    # Queue the picture itself first, then its descendants (captions, texts inside).
    for candidate in (item, *get_all_descendants(item)):
        if id(candidate) in seen_ids:
            continue
        seen_ids.add(id(candidate))
        items_to_remove.append(candidate)

# 4. Delete the queued items (nothing to do when no picture was found).
if items_to_remove:
    print(f"Removing {len(items_to_remove)} items...")
    doc.delete_items(node_items=items_to_remove)

Removing 7404 items...


In [28]:
# JSON dump of the document after pictures and their nested items were deleted.
destination_file_json_removedpictures = PROJECT_ROOT.joinpath(
    "data", "processed", "neuroscience", "PRECLEAN_Brain_and_behavior_removedpictures.json"
)

In [29]:
# Save the document as JSON after the picture items were deleted.
doc.save_as_json(destination_file_json_removedpictures)

In [30]:
# Markdown export of the document after pictures and their nested items were deleted.
destination_file_removedpictures = PROJECT_ROOT.joinpath(
    "data", "processed", "neuroscience", "PRECLEAN_Brain_and_behavior_removedpictures.md"
)

In [31]:
# Write the Markdown export of the document after the picture items were deleted.
destination_file_removedpictures.write_text(doc.export_to_markdown(), encoding="utf-8")

2302755