docling-project · dolfim-ibm · Oct 21, 2025 · Jul 26, 2025 · Jul 27, 2025 · Jul 28, 2025
diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Set, Union
+from typing import TYPE_CHECKING, Union
 
 from docling_core.types.doc import DoclingDocument
 
+from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
+
 if TYPE_CHECKING:
     from docling.datamodel.base_models import InputFormat
     from docling.datamodel.document import InputDocument
@@ -35,7 +37,7 @@ def unload(self):
 
     @classmethod
     @abstractmethod
-    def supported_formats(cls) -> Set["InputFormat"]:
+    def supported_formats(cls) -> set["InputFormat"]:
         pass
 
 
@@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
     straight without a recognition pipeline.
     """
 
+    @abstractmethod
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, Path],
+        options: Union[BackendOptions, None] = None,
+    ) -> None:
+        """Initialize the backend and keep its options on ``self.options``.
+
+        Args:
+            in_doc: Input document, handed on to the parent initializer.
+            path_or_stream: Path or in-memory stream, handed on to the parent
+                initializer.
+            options: Backend options to store. When omitted (``None``), a new
+                ``DeclarativeBackendOptions`` is created for this instance.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Build the default per call: a default written in the signature is
+        # evaluated once and would be shared by every backend instance.
+        self.options: BackendOptions = (
+            options if options is not None else DeclarativeBackendOptions()
+        )
+
     @abstractmethod
     def convert(self) -> DoclingDocument:
         pass
+
+    @classmethod
+    def get_default_options(cls) -> BackendOptions:
+        """Return a newly constructed ``DeclarativeBackendOptions`` instance."""
+        return DeclarativeBackendOptions()
diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
@@ -2,7 +2,7 @@
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Final, Set, Union
+from typing import Final, Union
 
 from docling_core.types.doc import (
     DocItemLabel,
@@ -27,7 +27,7 @@
 
 
 class AsciiDocBackend(DeclarativeDocumentBackend):
-    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
         self.path_or_stream = path_or_stream
@@ -58,7 +58,7 @@ def unload(self):
         return
 
     @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.ASCIIDOC}
 
     def convert(self) -> DoclingDocument:

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
@@ -1,13 +1,16 @@
+import base64
 import logging
+import os
 import re
-import traceback
+import warnings
 from contextlib import contextmanager
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
+import requests
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from bs4.element import PreformattedString
 from docling_core.types.doc import (
@@ -17,20 +20,26 @@
     DocumentOrigin,
     GroupItem,
     GroupLabel,
+    PictureItem,
     RefItem,
     RichTableCell,
     TableCell,
     TableData,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.document import ContentLayer, Formatting, Script
+from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
+from PIL import Image, UnidentifiedImageError
 from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override
 
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+)
+from docling.datamodel.backend_options import HTMLBackendOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
+from docling.exceptions import OperationNotAllowed
 
 _log = logging.getLogger(__name__)
 
@@ -43,6 +52,7 @@
     "details",
     "figure",
     "footer",
+    "img",
     "h1",
     "h2",
     "h3",
@@ -186,11 +196,12 @@ def __init__(
         self,
         in_doc: InputDocument,
         path_or_stream: Union[BytesIO, Path],
-        original_url: Optional[AnyUrl] = None,
+        options: HTMLBackendOptions = HTMLBackendOptions(),
     ):
-        super().__init__(in_doc, path_or_stream)
+        super().__init__(in_doc, path_or_stream, options)
         self.soup: Optional[Tag] = None
-        self.path_or_stream = path_or_stream
+        self.path_or_stream: Union[BytesIO, Path] = path_or_stream
+        self.base_path: Optional[str] = str(options.source_uri)
 
         # Initialize the parents for the hierarchy
         self.max_levels = 10
@@ -200,7 +211,6 @@ def __init__(
         for i in range(self.max_levels):
             self.parents[i] = None
         self.hyperlink: Union[AnyUrl, Path, None] = None
-        self.original_url = original_url
         self.format_tags: list[str] = []
 
         try:
@@ -236,6 +246,11 @@ def unload(self):
     def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.HTML}
 
+    @classmethod
+    @override
+    def get_default_options(cls) -> HTMLBackendOptions:
+        """Return a newly constructed ``HTMLBackendOptions`` instance."""
+        return HTMLBackendOptions()
+
     @override
     def convert(self) -> DoclingDocument:
         _log.debug("Starting HTML conversion...")
@@ -261,7 +276,7 @@ def convert(self) -> DoclingDocument:
                 content_layer=ContentLayer.FURNITURE,
             )
         # remove script and style tags
-        for tag in self.soup(["script", "style"]):
+        for tag in self.soup(["script", "noscript", "style"]):
             tag.decompose()
         # remove any hidden tag
         for tag in self.soup(hidden=True):
@@ -291,6 +306,28 @@ def convert(self) -> DoclingDocument:
         self._walk(content, doc)
         return doc
 
+    @staticmethod
+    def _is_remote_url(value: str) -> bool:
+        """Tell whether ``value`` uses a URL scheme that denotes a remote resource.
+
+        The schemes counted as remote are ``http``, ``https``, ``ftp``, ``s3``
+        and ``gs``; anything else (including plain paths) yields ``False``.
+        """
+        remote_schemes = ("http", "https", "ftp", "s3", "gs")
+        return urlparse(value).scheme in remote_schemes
+
+    def _resolve_relative_path(self, loc: str) -> str:
+        """Resolve a link or resource location against ``self.base_path``.
+
+        Rules, in order:
+          * without a base path, ``loc`` is returned unchanged;
+          * protocol-relative locations (``//host/...``) get an ``https:``
+            prefix;
+          * locations starting with ``http://``, ``https://``, ``data:`` or
+            ``file://`` are returned unchanged;
+          * with a remote base path, ``loc`` is joined to it with ``urljoin``;
+          * with a local base path, fragment-only locations (``#...``) and
+            locations carrying a URL scheme (``mailto:``, ``tel:``, ...) are
+            returned unchanged; anything else is resolved against the
+            directory that contains the base path.
+
+        Args:
+            loc: Location as written in the HTML (e.g. an ``href`` or ``src``).
+
+        Returns:
+            The resolved location as a string.
+        """
+        abs_loc = loc
+
+        if self.base_path:
+            if loc.startswith("//"):
+                # Protocol-relative URL - default to https
+                abs_loc = "https:" + loc
+            elif not loc.startswith(("http://", "https://", "data:", "file://")):
+                if HTMLDocumentBackend._is_remote_url(self.base_path):  # remote fetch
+                    abs_loc = urljoin(self.base_path, loc)
+                elif not (
+                    loc.startswith("#")
+                    or re.match(r"[A-Za-z][A-Za-z0-9+.-]+:", loc)
+                ):
+                    # Local fetch: resolve relative to the HTML file location.
+                    # Fragment-only links and scheme-qualified locations are
+                    # not file paths, so joining them onto a directory would
+                    # corrupt them. The pattern needs at least two characters
+                    # before the colon so a Windows drive letter ("C:") is
+                    # still handled as a path.
+                    abs_loc = str(Path(self.base_path).parent / loc)
+
+        _log.debug(f"Resolved location {loc} to {abs_loc}")
+        return abs_loc
+
     @staticmethod
     def group_cell_elements(
         group_name: str,
@@ -520,7 +557,8 @@ def flush_buffer():
                 if name == "img":
                     flush_buffer()
                     im_ref3 = self._emit_image(node, doc)
-                    added_refs.append(im_ref3)
+                    if im_ref3:
+                        added_refs.append(im_ref3)
                 elif name in _FORMAT_TAG_MAP:
                     with self._use_format([name]):
                         wk = self._walk(node, doc)
@@ -669,8 +707,7 @@ def _use_hyperlink(self, tag: Tag):
         else:
             if isinstance(this_href, str) and this_href:
                 old_hyperlink = self.hyperlink
-                if self.original_url is not None:
-                    this_href = urljoin(str(self.original_url), str(this_href))
+                this_href = self._resolve_relative_path(this_href)
                 # ugly fix for relative links since pydantic does not support them.
                 try:
                     new_hyperlink = AnyUrl(this_href)
@@ -837,7 +874,8 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
         for img_tag in tag("img"):
             if isinstance(img_tag, Tag):
                 im_ref = self._emit_image(img_tag, doc)
-                added_ref.append(im_ref)
+                if im_ref:
+                    added_ref.append(im_ref)
         return added_ref
 
     def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
@@ -1003,7 +1041,8 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
             img_tag = tag.find("img")
             if isinstance(img_tag, Tag):
                 im_ref = self._emit_image(img_tag, doc)
-                added_refs.append(im_ref)
+                if im_ref is not None:
+                    added_refs.append(im_ref)
 
         elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
             heading_refs = self._handle_heading(tag, doc)
@@ -1061,7 +1100,8 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
                     im_ref2 = self._emit_image(tag, doc)
-                    added_refs.append(im_ref2)
+                    if im_ref2 is not None:
+                        added_refs.append(im_ref2)
 
         elif tag_name in {"pre"}:
             # handle monospace code snippets (pre).
@@ -1092,10 +1132,12 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
                 self._walk(tag, doc)
         return added_refs
 
-    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
+    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
         figure = img_tag.find_parent("figure")
         caption: AnnotatedTextList = AnnotatedTextList()
 
+        parent = self.parents[self.level]
+
         # check if the figure has a link - this is HACK:
         def get_img_hyperlink(img_tag):
             this_parent = img_tag.parent
@@ -1106,9 +1148,8 @@ def get_img_hyperlink(img_tag):
             return None
 
         if img_hyperlink := get_img_hyperlink(img_tag):
-            caption.append(
-                AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
-            )
+            img_text = img_tag.get("alt") or ""
+            caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
 
         if isinstance(figure, Tag):
             caption_tag = figure.find("figcaption", recursive=False)
@@ -1135,13 +1176,78 @@ def get_img_hyperlink(img_tag):
                 hyperlink=caption_anno_text.hyperlink,
             )
 
+        src_loc: str = self._get_attr_as_string(img_tag, "src")
+        if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
+            # Do not fetch the image, just add a placeholder
+            placeholder: PictureItem = doc.add_picture(
+                caption=caption_item,
+                parent=parent,
+                content_layer=self.content_layer,
+            )
+            return placeholder.get_ref()
+
+        src_loc = self._resolve_relative_path(src_loc)
+        img_ref = self._create_image_ref(src_loc)
+
         docling_pic = doc.add_picture(
+            image=img_ref,
             caption=caption_item,
-            parent=self.parents[self.level],
+            parent=parent,
             content_layer=self.content_layer,
         )
         return docling_pic.get_ref()
 
+    def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
+        """Load the image at ``src_url`` and wrap it in an ``ImageRef``.
+
+        Loading is best-effort: the failures listed in the ``except`` clause
+        are reported through ``warnings.warn`` instead of being raised, so one
+        bad image does not abort the whole conversion.
+
+        Args:
+            src_url: Location of the image, passed to ``_load_image_data``.
+
+        Returns:
+            The ``ImageRef`` built from the decoded image (DPI taken from the
+            image metadata, defaulting to 72), or ``None`` when no data was
+            loaded or the image could not be processed.
+        """
+        try:
+            img_data = self._load_image_data(src_url)
+            if img_data:
+                img = Image.open(BytesIO(img_data))
+                return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
+        except (
+            # RequestException is the base class of HTTPError and also covers
+            # connection failures and timeouts.
+            requests.RequestException,
+            ValidationError,
+            UnidentifiedImageError,
+            OperationNotAllowed,
+            # I/O errors while reading a local file or decoding image data.
+            OSError,
+            TypeError,
+            ValueError,
+        ) as e:
+            warnings.warn(f"Could not process an image from {src_url}: {e}")
+
+        return None
+
+    def _load_image_data(
+        self, src_loc: str, *, timeout: float = 30.0
+    ) -> Optional[bytes]:
+        """Fetch the raw bytes of an image from a URL, data URI or local file.
+
+        Args:
+            src_loc: Image location: a remote URL, a ``data:`` URI, a
+                ``file://`` URI or a local file path.
+            timeout: Seconds to wait on the remote server before giving up.
+                Only used for remote locations.
+
+        Returns:
+            The image bytes, or ``None`` for locations ending in ``.svg``,
+            which are skipped.
+
+        Raises:
+            OperationNotAllowed: If the location is remote and
+                ``options.enable_remote_fetch`` is not set, or local and
+                ``options.enable_local_fetch`` is not set.
+            requests.RequestException: If the remote request fails, times out
+                or returns an error status.
+            ValueError: If the local file does not exist or is not readable.
+        """
+        if src_loc.lower().endswith(".svg"):
+            _log.debug(f"Skipping SVG file: {src_loc}")
+            return None
+
+        if HTMLDocumentBackend._is_remote_url(src_loc):
+            if not self.options.enable_remote_fetch:
+                raise OperationNotAllowed(
+                    "Fetching remote resources is only allowed when set explicitly. "
+                    "Set options.enable_remote_fetch=True."
+                )
+            # Without a timeout an unresponsive server would block the
+            # conversion indefinitely.
+            response = requests.get(src_loc, timeout=timeout)
+            response.raise_for_status()
+            return response.content
+        elif src_loc.startswith("data:"):
+            # Strip the "data:image/<type>;base64," header and decode the rest.
+            data = re.sub(r"^data:image/.+;base64,", "", src_loc)
+            return base64.b64decode(data)
+
+        if src_loc.startswith("file://"):
+            # Drop the 7-character "file://" prefix to get a plain path.
+            src_loc = src_loc[7:]
+
+        if not self.options.enable_local_fetch:
+            raise OperationNotAllowed(
+                "Fetching local resources is only allowed when set explicitly. "
+                "Set options.enable_local_fetch=True."
+            )
+        # add check that file exists and can read
+        if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
+            with open(src_loc, "rb") as f:
+                return f.read()
+        else:
+            raise ValueError("File does not exist or it is not readable.")
+
     @staticmethod
     def get_text(item: PageElement) -> str:
         """Concatenate all child strings of a PageElement.
@@ -1238,3 +1344,12 @@ def _extract_num(s: str) -> int:
         )
 
         return int_spans
+
+    @staticmethod
+    def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
+        """Return attribute ``attr`` of ``tag`` as a single string.
+
+        A missing or empty attribute yields ``default``; a multi-valued (list)
+        attribute yields its first entry.
+        """
+        raw = tag.get(attr)
+        if isinstance(raw, list):
+            return raw[0] if raw else default
+        return raw if raw else default