Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Set, Union
from typing import TYPE_CHECKING, Union

from docling_core.types.doc import DoclingDocument

from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions

if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
Expand Down Expand Up @@ -35,7 +37,7 @@ def unload(self):

@classmethod
@abstractmethod
def supported_formats(cls) -> Set["InputFormat"]:
def supported_formats(cls) -> set["InputFormat"]:
pass


Expand All @@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
straight without a recognition pipeline.
"""

@abstractmethod
def __init__(
    self,
    in_doc: "InputDocument",
    path_or_stream: Union[BytesIO, Path],
    options: Union[BackendOptions, None] = None,
) -> None:
    """Initialize the backend and store its options on ``self.options``.

    Args:
        in_doc: The input document, forwarded to the parent initializer.
        path_or_stream: The document content as a stream or a path,
            forwarded to the parent initializer.
        options: Backend options to use. When omitted (``None``), a new
            ``DeclarativeBackendOptions`` instance is created for this
            backend.
    """
    super().__init__(in_doc, path_or_stream)
    # Build the fallback here rather than in the signature: a default written
    # as ``DeclarativeBackendOptions()`` is evaluated once at definition time
    # and would be one object shared by every backend created without options.
    self.options: BackendOptions = (
        options if options is not None else DeclarativeBackendOptions()
    )

@abstractmethod
def convert(self) -> DoclingDocument:
    """Return the ``DoclingDocument`` for this backend; subclasses must implement it."""

@classmethod
def get_default_options(cls) -> BackendOptions:
    """Return a newly created ``DeclarativeBackendOptions`` instance."""
    default_options = DeclarativeBackendOptions()
    return default_options
6 changes: 3 additions & 3 deletions docling/backend/asciidoc_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
from io import BytesIO
from pathlib import Path
from typing import Final, Set, Union
from typing import Final, Union

from docling_core.types.doc import (
DocItemLabel,
Expand All @@ -27,7 +27,7 @@


class AsciiDocBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

self.path_or_stream = path_or_stream
Expand Down Expand Up @@ -58,7 +58,7 @@ def unload(self):
return

@classmethod
def supported_formats(cls) -> Set[InputFormat]:
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.ASCIIDOC}

def convert(self) -> DoclingDocument:
Expand Down
155 changes: 135 additions & 20 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import base64
import logging
import os
import re
import traceback
import warnings
from contextlib import contextmanager
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
Expand All @@ -17,20 +20,26 @@
DocumentOrigin,
GroupItem,
GroupLabel,
PictureItem,
RefItem,
RichTableCell,
TableCell,
TableData,
TableItem,
TextItem,
)
from docling_core.types.doc.document import ContentLayer, Formatting, Script
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl, BaseModel, ValidationError
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
)
from docling.datamodel.backend_options import HTMLBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.exceptions import OperationNotAllowed

_log = logging.getLogger(__name__)

Expand All @@ -43,6 +52,7 @@
"details",
"figure",
"footer",
"img",
"h1",
"h2",
"h3",
Expand Down Expand Up @@ -186,11 +196,12 @@ def __init__(
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
original_url: Optional[AnyUrl] = None,
options: HTMLBackendOptions = HTMLBackendOptions(),
):
super().__init__(in_doc, path_or_stream)
super().__init__(in_doc, path_or_stream, options)
self.soup: Optional[Tag] = None
self.path_or_stream = path_or_stream
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
self.base_path: Optional[str] = str(options.source_uri)

# Initialize the parents for the hierarchy
self.max_levels = 10
Expand All @@ -200,7 +211,6 @@ def __init__(
for i in range(self.max_levels):
self.parents[i] = None
self.hyperlink: Union[AnyUrl, Path, None] = None
self.original_url = original_url
self.format_tags: list[str] = []

try:
Expand Down Expand Up @@ -236,6 +246,11 @@ def unload(self):
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML}

@classmethod
@override
def get_default_options(cls) -> HTMLBackendOptions:
    """Return a newly created ``HTMLBackendOptions`` instance."""
    default_options = HTMLBackendOptions()
    return default_options

@override
def convert(self) -> DoclingDocument:
_log.debug("Starting HTML conversion...")
Expand All @@ -261,7 +276,7 @@ def convert(self) -> DoclingDocument:
content_layer=ContentLayer.FURNITURE,
)
# remove script and style tags
for tag in self.soup(["script", "style"]):
for tag in self.soup(["script", "noscript", "style"]):
tag.decompose()
# remove any hidden tag
for tag in self.soup(hidden=True):
Expand Down Expand Up @@ -291,6 +306,28 @@ def convert(self) -> DoclingDocument:
self._walk(content, doc)
return doc

@staticmethod
def _is_remote_url(value: str) -> bool:
parsed = urlparse(value)
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}

def _resolve_relative_path(self, loc: str) -> str:
    """Turn ``loc`` into an absolute location using ``self.base_path``.

    Without a base path, ``loc`` is returned unchanged. Otherwise:

    * a protocol-relative ``//host/...`` location gets ``https:`` prepended;
    * a location starting with ``http://``, ``https://``, ``data:`` or
      ``file://`` is kept as is;
    * with a remote base, ``loc`` is joined onto the base URL;
    * with a local base, ``loc`` is resolved against the directory that
      contains the base file.
    """
    base = self.base_path
    has_known_prefix = loc.startswith(("http://", "https://", "data:", "file://"))

    if not base:
        resolved = loc
    elif loc.startswith("//"):
        # Protocol-relative URL - default to https
        resolved = "https:" + loc
    elif has_known_prefix:
        resolved = loc
    elif HTMLDocumentBackend._is_remote_url(base):
        # remote fetch
        resolved = urljoin(base, loc)
    else:
        # local fetch: resolve relative to the HTML file location
        resolved = str(Path(base).parent / loc)

    _log.debug(f"Resolved location {loc} to {resolved}")
    return resolved

@staticmethod
def group_cell_elements(
group_name: str,
Expand Down Expand Up @@ -520,7 +557,8 @@ def flush_buffer():
if name == "img":
flush_buffer()
im_ref3 = self._emit_image(node, doc)
added_refs.append(im_ref3)
if im_ref3:
added_refs.append(im_ref3)
elif name in _FORMAT_TAG_MAP:
with self._use_format([name]):
wk = self._walk(node, doc)
Expand Down Expand Up @@ -669,8 +707,7 @@ def _use_hyperlink(self, tag: Tag):
else:
if isinstance(this_href, str) and this_href:
old_hyperlink = self.hyperlink
if self.original_url is not None:
this_href = urljoin(str(self.original_url), str(this_href))
this_href = self._resolve_relative_path(this_href)
# ugly fix for relative links since pydantic does not support them.
try:
new_hyperlink = AnyUrl(this_href)
Expand Down Expand Up @@ -837,7 +874,8 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
im_ref = self._emit_image(img_tag, doc)
added_ref.append(im_ref)
if im_ref:
added_ref.append(im_ref)
return added_ref

def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
Expand Down Expand Up @@ -1003,7 +1041,8 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
img_tag = tag.find("img")
if isinstance(img_tag, Tag):
im_ref = self._emit_image(img_tag, doc)
added_refs.append(im_ref)
if im_ref is not None:
added_refs.append(im_ref)

elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
heading_refs = self._handle_heading(tag, doc)
Expand Down Expand Up @@ -1061,7 +1100,8 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
im_ref2 = self._emit_image(tag, doc)
added_refs.append(im_ref2)
if im_ref2 is not None:
added_refs.append(im_ref2)

elif tag_name in {"pre"}:
# handle monospace code snippets (pre).
Expand Down Expand Up @@ -1092,10 +1132,12 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
self._walk(tag, doc)
return added_refs

def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
figure = img_tag.find_parent("figure")
caption: AnnotatedTextList = AnnotatedTextList()

parent = self.parents[self.level]

# check if the figure has a link - this is HACK:
def get_img_hyperlink(img_tag):
this_parent = img_tag.parent
Expand All @@ -1106,9 +1148,8 @@ def get_img_hyperlink(img_tag):
return None

if img_hyperlink := get_img_hyperlink(img_tag):
caption.append(
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
)
img_text = img_tag.get("alt") or ""
caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))

if isinstance(figure, Tag):
caption_tag = figure.find("figcaption", recursive=False)
Expand All @@ -1135,13 +1176,78 @@ def get_img_hyperlink(img_tag):
hyperlink=caption_anno_text.hyperlink,
)

src_loc: str = self._get_attr_as_string(img_tag, "src")
if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
# Do not fetch the image, just add a placeholder
placeholder: PictureItem = doc.add_picture(
caption=caption_item,
parent=parent,
content_layer=self.content_layer,
)
return placeholder.get_ref()

src_loc = self._resolve_relative_path(src_loc)
img_ref = self._create_image_ref(src_loc)

docling_pic = doc.add_picture(
image=img_ref,
caption=caption_item,
parent=self.parents[self.level],
parent=parent,
content_layer=self.content_layer,
)
return docling_pic.get_ref()

def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Load the image at ``src_url`` and wrap it in an ``ImageRef``.

    This is best-effort: the failures listed in the ``except`` clause are
    reported with ``warnings.warn`` and the image is skipped, instead of
    aborting the conversion.

    Args:
        src_url: Location of the image, passed to ``_load_image_data``.

    Returns:
        The ``ImageRef`` built from the loaded image, or ``None`` when no
        image data was returned or one of the handled errors occurred.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            # Fall back to 72 dpi when the image metadata has no "dpi" entry.
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base class of HTTPError, so HTTP status
        # errors are still handled; it also covers connection failures and
        # timeouts, which would otherwise propagate out of the conversion.
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")

    return None

def _load_image_data(self, src_loc: str) -> Optional[bytes]:
    """Return the raw bytes of the image located at ``src_loc``.

    The location is handled according to its form:

    * locations ending in ``.svg`` (case-insensitive) are skipped;
    * remote URLs are downloaded with ``requests``;
    * ``data:`` URIs have a leading ``data:image/...;base64,`` header
      removed and the remainder base64-decoded;
    * anything else is read as a local file, after stripping a leading
      ``file://``.

    Args:
        src_loc: Location of the image.

    Returns:
        The image bytes, or ``None`` for skipped SVG locations.

    Raises:
        OperationNotAllowed: If the required ``enable_remote_fetch`` or
            ``enable_local_fetch`` option is not set.
        requests.HTTPError: If the remote server answers with an error
            status.
        ValueError: If the local file does not exist or is not readable.
    """
    if src_loc.lower().endswith(".svg"):
        _log.debug(f"Skipping SVG file: {src_loc}")
        return None

    if HTMLDocumentBackend._is_remote_url(src_loc):
        if not self.options.enable_remote_fetch:
            raise OperationNotAllowed(
                "Fetching remote resources is only allowed when set explicitly. "
                "Set options.enable_remote_fetch=True."
            )
        # NOTE(review): no timeout is passed, so a stalled server can block
        # the conversion indefinitely - consider adding one.
        # The context manager closes the streamed response even when
        # raise_for_status() raises, so the connection is always released.
        with requests.get(src_loc, stream=True) as response:
            response.raise_for_status()
            return response.content
    elif src_loc.startswith("data:"):
        data = re.sub(r"^data:image/.+;base64,", "", src_loc)
        return base64.b64decode(data)

    if src_loc.startswith("file://"):
        src_loc = src_loc[7:]

    if not self.options.enable_local_fetch:
        raise OperationNotAllowed(
            "Fetching local resources is only allowed when set explicitly. "
            "Set options.enable_local_fetch=True."
        )
    # Only read regular files that the current process is allowed to read.
    if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
        with open(src_loc, "rb") as f:
            return f.read()
    else:
        raise ValueError("File does not exist or it is not readable.")

@staticmethod
def get_text(item: PageElement) -> str:
"""Concatenate all child strings of a PageElement.
Expand Down Expand Up @@ -1238,3 +1344,12 @@ def _extract_num(s: str) -> int:
)

return int_spans

@staticmethod
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
"""Get attribute value as string, handling list values."""
value = tag.get(attr)
if not value:
return default

return value[0] if isinstance(value, list) else value
Loading