Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,31 @@ repos:
hooks:
- id: black
name: Black
entry: poetry run black docling_core
entry: poetry run black docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: isort
name: isort
entry: poetry run isort docling_core
entry: poetry run isort docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: autoflake
name: autoflake
entry: poetry run autoflake docling_core
entry: poetry run autoflake docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: mypy
name: MyPy
entry: poetry run mypy docling_core
entry: poetry run mypy docling_core test
pass_filenames: false
language: system
files: '\.py$'
Expand Down
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is

## Documentation

We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Deep Search objects.
We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects.

The documentation pages are stored in the [docs](./docs/) folder and are updated at every commit, as part of the `pre-commit` check hooks.
To generate the documentation on-demand, run:
Expand Down
16 changes: 10 additions & 6 deletions docling_core/types/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
ProvenanceTypeT = TypeVar("ProvenanceTypeT", bound=str)
CollectionNameTypeT = TypeVar("CollectionNameTypeT", bound=str)
Coordinates = Annotated[
list[float],
Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
]
T = TypeVar("T", bound=Hashable)

UniqueList = Annotated[
Expand All @@ -61,7 +65,7 @@


class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
"""Unique identifier of a Deep Search data object."""
"""Unique identifier of a Docling data object."""

type_: IdentifierTypeT = Field(
alias="type",
Expand All @@ -81,7 +85,7 @@ class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
alias="_name",
title="_Name",
description=(
"A unique identifier of the data object across Deep Search, consisting of "
"A unique identifier of the data object across Docling, consisting of "
"the concatenation of type and value in lower case, separated by hash "
"(#)."
),
Expand Down Expand Up @@ -118,7 +122,7 @@ class Log(AliasModel, extra="forbid"):
json_schema_extra=es_field(type="keyword", ignore_above=8191),
)
agent: StrictStr = Field(
description="The Deep Search agent that performed the task, e.g., CCS or CXS.",
description="The Docling agent that performed the task, e.g., CCS or CXS.",
json_schema_extra=es_field(type="keyword", ignore_above=8191),
)
type_: StrictStr = Field(
Expand All @@ -138,7 +142,7 @@ class Log(AliasModel, extra="forbid"):


class FileInfoObject(AliasModel):
"""Filing information for any data object to be stored in a Deep Search database."""
"""Filing information for any data object to be stored in a Docling database."""

filename: StrictStr = Field(
description="The name of a persistent object that created this data object",
Expand All @@ -156,15 +160,15 @@ class FileInfoObject(AliasModel):
document_hash: StrictStr = Field(
description=(
"A unique identifier of this data object within a collection of a "
"Deep Search database"
"Docling database"
),
alias="document-hash",
json_schema_extra=es_field(type="keyword", ignore_above=8191),
)


class CollectionTypeEnum(str, Enum):
"""Enumeration of valid Deep Search collection types."""
"""Enumeration of valid Docling collection types."""

generic = "Generic"
document = "Document"
Expand Down
4 changes: 2 additions & 2 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
#

"""Models for the Deep Search Document data type."""
"""Models for the Docling Document data type."""

from datetime import datetime
from typing import Generic, Optional, Union
Expand Down Expand Up @@ -352,7 +352,7 @@ class ExportedCCSDocument(
CollectionNameTypeT,
],
):
"""Document model for Deep Search."""
"""Document model for Docling."""

obj_type: StrictStr = Field(
"pdf-document",
Expand Down
9 changes: 3 additions & 6 deletions docling_core/types/rec/attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""Define the model Attribute."""
from typing import Generic, Optional

from pydantic import BaseModel, Field
from pydantic import Field
from typing_extensions import Annotated

from docling_core.search.mapping import es_field
Expand All @@ -16,23 +16,20 @@
PredicateKeyTypeT,
PredicateValueTypeT,
ProvenanceTypeT,
SubjectNameTypeT,
SubjectTypeT,
)
from docling_core.types.rec.base import ProvenanceItem
from docling_core.types.rec.predicate import Predicate
from docling_core.utils.alias import AliasModel


class Attribute(
BaseModel,
AliasModel,
Generic[
IdentifierTypeT,
PredicateValueTypeT,
PredicateKeyNameT,
PredicateKeyTypeT,
ProvenanceTypeT,
SubjectTypeT,
SubjectNameTypeT,
],
extra="forbid",
):
Expand Down
18 changes: 8 additions & 10 deletions docling_core/types/rec/predicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

"""Define the model Predicate."""
from datetime import datetime
from typing import Annotated, Generic, Optional, TypeVar
from typing import Annotated, Generic, Optional

from pydantic import (
BaseModel,
Expand All @@ -17,16 +17,14 @@
)

from docling_core.search.mapping import es_field
from docling_core.types.base import (
Coordinates,
PredicateKeyNameT,
PredicateKeyTypeT,
PredicateValueTypeT,
)
from docling_core.utils.alias import AliasModel

PredicateValueTypeT = TypeVar("PredicateValueTypeT", bound=str)
PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str)
PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
Coordinates = Annotated[
list[float],
Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
]


class NumericalValue(BaseModel, extra="forbid"):
"""Model for numerical values."""
Expand Down Expand Up @@ -117,7 +115,7 @@ class PredicateValue(AliasModel, Generic[PredicateValueTypeT], extra="forbid"):


class Predicate(
BaseModel,
AliasModel,
Generic[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT],
extra="forbid",
):
Expand Down
2 changes: 0 additions & 2 deletions docling_core/types/rec/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ class Record(
PredicateKeyNameT,
PredicateKeyTypeT,
ProvenanceTypeT,
SubjectTypeT,
SubjectNameTypeT,
]
]
] = None
Expand Down
6 changes: 5 additions & 1 deletion docling_core/types/rec/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
from docling_core.utils.alias import AliasModel


class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]):
    """Identifier of subject names.

    Specialization of ``Identifier`` parameterized by the subject-name type.
    It declares no fields or behavior of its own.
    """


class Subject(
AliasModel,
Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT],
Expand Down Expand Up @@ -53,7 +57,7 @@ class Subject(
),
json_schema_extra=es_field(type="keyword", ignore_above=8191),
)
names: list[Identifier[SubjectNameTypeT]] = Field(
names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field(
description=(
"List of given names for this subject. They may not be unique across "
"different subjects."
Expand Down
8 changes: 4 additions & 4 deletions docling_core/utils/ds_generate_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _prepare_directory(folder: str, clean: bool = False) -> None:


def generate_collection_jsonschema(folder: str):
"""Generate the JSON schema of Deep Search collections and export them to a folder.
"""Generate the JSON schema of Docling collections and export them to a folder.

Args:
folder: The name of the directory.
Expand All @@ -58,7 +58,7 @@ def generate_collection_jsonschema(folder: str):


def generate_collection_html(folder: str):
"""Generate HTML pages documenting the data model of Deep Search collections.
"""Generate HTML pages documenting the data model of Docling collections.

The JSON schemas files need to be in a folder and the generated HTML pages will be
written in the same folder.
Expand All @@ -79,7 +79,7 @@ def generate_collection_html(folder: str):


def generate_collection_markdown(folder: str):
"""Generate Markdown pages documenting the data model of Deep Search collections.
"""Generate Markdown pages documenting the data model of Docling collections.

The JSON schemas files need to be in a folder and the generated markdown pages will
be written in the same folder.
Expand All @@ -101,7 +101,7 @@ def generate_collection_markdown(folder: str):


def main() -> None:
"""Generate the JSON Schema of Deep Search collections and export documentation."""
"""Generate the JSON Schema of Docling collections and export documentation."""
argparser = argparse.ArgumentParser()
argparser.add_argument(
"directory",
Expand Down
10 changes: 5 additions & 5 deletions docs/Document.json
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@
"x-es-type": "keyword"
},
"document-hash": {
"description": "A unique identifier of this data object within a collection of a Deep Search database",
"description": "A unique identifier of this data object within a collection of a Docling database",
"title": "Document-Hash",
"type": "string",
"x-es-ignore_above": 8191,
Expand Down Expand Up @@ -1086,7 +1086,7 @@
},
"Identifier": {
"additionalProperties": false,
"description": "Unique identifier of a Deep Search data object.",
"description": "Unique identifier of a Docling data object.",
"properties": {
"type": {
"description": "A string representing a collection or database that contains this data object.",
Expand All @@ -1103,7 +1103,7 @@
"x-es-type": "keyword"
},
"_name": {
"description": "A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#).",
"description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).",
"pattern": "^.+#.+$",
"title": "_Name",
"type": "string",
Expand Down Expand Up @@ -1139,7 +1139,7 @@
"x-es-type": "keyword"
},
"agent": {
"description": "The Deep Search agent that performed the task, e.g., CCS or CXS.",
"description": "The Docling agent that performed the task, e.g., CCS or CXS.",
"title": "Agent",
"type": "string",
"x-es-ignore_above": 8191,
Expand Down Expand Up @@ -1725,7 +1725,7 @@
"type": "object"
}
},
"description": "Document model for Deep Search.",
"description": "Document model for Docling.",
"properties": {
"_name": {
"title": " Name",
Expand Down
Loading