diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c936cf6d..023c6af7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: black name: Black - entry: poetry run black docling_core + entry: poetry run black docling_core test pass_filenames: false language: system files: '\.py$' @@ -12,7 +12,7 @@ repos: hooks: - id: isort name: isort - entry: poetry run isort docling_core + entry: poetry run isort docling_core test pass_filenames: false language: system files: '\.py$' @@ -20,7 +20,7 @@ repos: hooks: - id: autoflake name: autoflake - entry: poetry run autoflake docling_core + entry: poetry run autoflake docling_core test pass_filenames: false language: system files: '\.py$' @@ -28,7 +28,7 @@ repos: hooks: - id: mypy name: MyPy - entry: poetry run mypy docling_core + entry: poetry run mypy docling_core test pass_filenames: false language: system files: '\.py$' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bd44291b..55e1fa39 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -183,7 +183,7 @@ Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is ## Documentation -We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Deep Search objects. +We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects. The documentation pages are stored in [docs](./docs/) folder and are updated at every commit, as part of the `pre-commit` check hooks. 
To generate the documentation on-demand, run: diff --git a/docling_core/types/base.py b/docling_core/types/base.py index ed73e9dd..45508ffe 100644 --- a/docling_core/types/base.py +++ b/docling_core/types/base.py @@ -39,6 +39,10 @@ PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str) ProvenanceTypeT = TypeVar("ProvenanceTypeT", bound=str) CollectionNameTypeT = TypeVar("CollectionNameTypeT", bound=str) +Coordinates = Annotated[ + list[float], + Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")), +] T = TypeVar("T", bound=Hashable) UniqueList = Annotated[ @@ -61,7 +65,7 @@ class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"): - """Unique identifier of a Deep Search data object.""" + """Unique identifier of a Docling data object.""" type_: IdentifierTypeT = Field( alias="type", @@ -81,7 +85,7 @@ class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"): alias="_name", title="_Name", description=( - "A unique identifier of the data object across Deep Search, consisting of " + "A unique identifier of the data object across Docling, consisting of " "the concatenation of type and value in lower case, separated by hash " "(#)." 
), @@ -118,7 +122,7 @@ class Log(AliasModel, extra="forbid"): json_schema_extra=es_field(type="keyword", ignore_above=8191), ) agent: StrictStr = Field( - description="The Deep Search agent that performed the task, e.g., CCS or CXS.", + description="The Docling agent that performed the task, e.g., CCS or CXS.", json_schema_extra=es_field(type="keyword", ignore_above=8191), ) type_: StrictStr = Field( @@ -138,7 +142,7 @@ class Log(AliasModel, extra="forbid"): class FileInfoObject(AliasModel): - """Filing information for any data object to be stored in a Deep Search database.""" + """Filing information for any data object to be stored in a Docling database.""" filename: StrictStr = Field( description="The name of a persistent object that created this data object", @@ -156,7 +160,7 @@ class FileInfoObject(AliasModel): document_hash: StrictStr = Field( description=( "A unique identifier of this data object within a collection of a " - "Deep Search database" + "Docling database" ), alias="document-hash", json_schema_extra=es_field(type="keyword", ignore_above=8191), @@ -164,7 +168,7 @@ class FileInfoObject(AliasModel): class CollectionTypeEnum(str, Enum): - """Enumeration of valid Deep Search collection types.""" + """Enumeration of valid Docling collection types.""" generic = "Generic" document = "Document" diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index cedc420f..a5541057 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: MIT # -"""Models for the Deep Search Document data type.""" +"""Models for the Docling Document data type.""" from datetime import datetime from typing import Generic, Optional, Union @@ -352,7 +352,7 @@ class ExportedCCSDocument( CollectionNameTypeT, ], ): - """Document model for Deep Search.""" + """Document model for Docling.""" obj_type: StrictStr = Field( "pdf-document", diff --git a/docling_core/types/rec/attribute.py 
b/docling_core/types/rec/attribute.py index 98e3c761..9639abbb 100644 --- a/docling_core/types/rec/attribute.py +++ b/docling_core/types/rec/attribute.py @@ -6,7 +6,7 @@ """Define the model Attribute.""" from typing import Generic, Optional -from pydantic import BaseModel, Field +from pydantic import Field from typing_extensions import Annotated from docling_core.search.mapping import es_field @@ -16,23 +16,20 @@ PredicateKeyTypeT, PredicateValueTypeT, ProvenanceTypeT, - SubjectNameTypeT, - SubjectTypeT, ) from docling_core.types.rec.base import ProvenanceItem from docling_core.types.rec.predicate import Predicate +from docling_core.utils.alias import AliasModel class Attribute( - BaseModel, + AliasModel, Generic[ IdentifierTypeT, PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT, ProvenanceTypeT, - SubjectTypeT, - SubjectNameTypeT, ], extra="forbid", ): diff --git a/docling_core/types/rec/predicate.py b/docling_core/types/rec/predicate.py index dce5033d..91c62960 100644 --- a/docling_core/types/rec/predicate.py +++ b/docling_core/types/rec/predicate.py @@ -5,7 +5,7 @@ """Define the model Predicate.""" from datetime import datetime -from typing import Annotated, Generic, Optional, TypeVar +from typing import Annotated, Generic, Optional from pydantic import ( BaseModel, @@ -17,16 +17,14 @@ ) from docling_core.search.mapping import es_field +from docling_core.types.base import ( + Coordinates, + PredicateKeyNameT, + PredicateKeyTypeT, + PredicateValueTypeT, +) from docling_core.utils.alias import AliasModel -PredicateValueTypeT = TypeVar("PredicateValueTypeT", bound=str) -PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str) -PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str) -Coordinates = Annotated[ - list[float], - Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")), -] - class NumericalValue(BaseModel, extra="forbid"): """Model for numerical values.""" @@ -117,7 +115,7 @@ class PredicateValue(AliasModel, 
Generic[PredicateValueTypeT], extra="forbid"): class Predicate( - BaseModel, + AliasModel, Generic[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT], extra="forbid", ): diff --git a/docling_core/types/rec/record.py b/docling_core/types/rec/record.py index 05da3f29..864e9265 100644 --- a/docling_core/types/rec/record.py +++ b/docling_core/types/rec/record.py @@ -80,8 +80,6 @@ class Record( PredicateKeyNameT, PredicateKeyTypeT, ProvenanceTypeT, - SubjectTypeT, - SubjectNameTypeT, ] ] ] = None diff --git a/docling_core/types/rec/subject.py b/docling_core/types/rec/subject.py index 75413782..69d2e886 100644 --- a/docling_core/types/rec/subject.py +++ b/docling_core/types/rec/subject.py @@ -19,6 +19,10 @@ from docling_core.utils.alias import AliasModel +class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]): + """Identifier of subject names.""" + + class Subject( AliasModel, Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT], @@ -53,7 +57,7 @@ class Subject( ), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) - names: list[Identifier[SubjectNameTypeT]] = Field( + names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field( description=( "List of given names for this subject. They may not be unique across " "different subjects." diff --git a/docling_core/utils/ds_generate_docs.py b/docling_core/utils/ds_generate_docs.py index b5ed061c..83d77148 100644 --- a/docling_core/utils/ds_generate_docs.py +++ b/docling_core/utils/ds_generate_docs.py @@ -44,7 +44,7 @@ def _prepare_directory(folder: str, clean: bool = False) -> None: def generate_collection_jsonschema(folder: str): - """Generate the JSON schema of Deep Search collections and export them to a folder. + """Generate the JSON schema of Docling collections and export them to a folder. Args: folder: The name of the directory. 
@@ -58,7 +58,7 @@ def generate_collection_jsonschema(folder: str): def generate_collection_html(folder: str): - """Generate HTML pages documenting the data model of Deep Search collections. + """Generate HTML pages documenting the data model of Docling collections. The JSON schemas files need to be in a folder and the generated HTML pages will be written in the same folder. @@ -79,7 +79,7 @@ def generate_collection_html(folder: str): def generate_collection_markdown(folder: str): - """Generate Markdown pages documenting the data model of Deep Search collections. + """Generate Markdown pages documenting the data model of Docling collections. The JSON schemas files need to be in a folder and the generated markdown pages will be written in the same folder. @@ -101,7 +101,7 @@ def generate_collection_markdown(folder: str): def main() -> None: - """Generate the JSON Schema of Deep Search collections and export documentation.""" + """Generate the JSON Schema of Docling collections and export documentation.""" argparser = argparse.ArgumentParser() argparser.add_argument( "directory", diff --git a/docs/Document.json b/docs/Document.json index fdea4f9e..ffa00c6d 100644 --- a/docs/Document.json +++ b/docs/Document.json @@ -781,7 +781,7 @@ "x-es-type": "keyword" }, "document-hash": { - "description": "A unique identifier of this data object within a collection of a Deep Search database", + "description": "A unique identifier of this data object within a collection of a Docling database", "title": "Document-Hash", "type": "string", "x-es-ignore_above": 8191, @@ -1086,7 +1086,7 @@ }, "Identifier": { "additionalProperties": false, - "description": "Unique identifier of a Deep Search data object.", + "description": "Unique identifier of a Docling data object.", "properties": { "type": { "description": "A string representing a collection or database that contains this data object.", @@ -1103,7 +1103,7 @@ "x-es-type": "keyword" }, "_name": { - "description": "A unique identifier of 
the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#).", + "description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).", "pattern": "^.+#.+$", "title": "_Name", "type": "string", @@ -1139,7 +1139,7 @@ "x-es-type": "keyword" }, "agent": { - "description": "The Deep Search agent that performed the task, e.g., CCS or CXS.", + "description": "The Docling agent that performed the task, e.g., CCS or CXS.", "title": "Agent", "type": "string", "x-es-ignore_above": 8191, @@ -1725,7 +1725,7 @@ "type": "object" } }, - "description": "Document model for Deep Search.", + "description": "Document model for Docling.", "properties": { "_name": { "title": " Name", diff --git a/docs/Document.md b/docs/Document.md index 804e74be..c5eccbe5 100644 --- a/docs/Document.md +++ b/docs/Document.md @@ -8,7 +8,7 @@ | **Required** | No | | **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") | -**Description:** Document model for Deep Search. +**Description:** Document model for Docling.
@@ -1318,9 +1318,9 @@ | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| ---------------------------------------------------- | ----------------------------------------------- | -| [Identifier](#description_references_anyOf_i0_items) | Unique identifier of a Deep Search data object. | +| Each item of this array must be | Description | +| ---------------------------------------------------- | ------------------------------------------- | +| [Identifier](#description_references_anyOf_i0_items) | Unique identifier of a Docling data object. | ##### 3.12.1.1. ExportedCCSDocument > description > references > anyOf > item 0 > Identifier @@ -1331,7 +1331,7 @@ | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -1385,7 +1385,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | @@ -1510,9 +1510,9 @@ | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| -------------------------------------------------------------------------------- | ----------------------------------------------- | -| [Identifier](#description_publication_anyOf_i0_items_identifiers_anyOf_i0_items) | Unique identifier of a Deep Search data object. | +| Each item of this array must be | Description | +| -------------------------------------------------------------------------------- | ------------------------------------------- | +| [Identifier](#description_publication_anyOf_i0_items_identifiers_anyOf_i0_items) | Unique identifier of a Docling data object. | ###### 3.13.1.1.1.1.1. ExportedCCSDocument > description > publication > anyOf > item 0 > Publication > identifiers > anyOf > item 0 > Identifier @@ -1523,7 +1523,7 @@ | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -1577,7 +1577,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | @@ -2350,7 +2350,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** The Deep Search agent that performed the task, e.g., CCS or CXS. +**Description:** The Docling agent that performed the task, e.g., CCS or CXS.
@@ -3046,7 +3046,7 @@ Must be one of: | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of this data object within a collection of a Deep Search database +**Description:** A unique identifier of this data object within a collection of a Docling database
@@ -9175,9 +9175,9 @@ Must be one of: | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| ----------------------------------------- | ----------------------------------------------- | -| [Identifier](#identifiers_anyOf_i0_items) | Unique identifier of a Deep Search data object. | +| Each item of this array must be | Description | +| ----------------------------------------- | ------------------------------------------- | +| [Identifier](#identifiers_anyOf_i0_items) | Unique identifier of a Docling data object. | #### 15.1.1. ExportedCCSDocument > identifiers > anyOf > item 0 > Identifier @@ -9188,7 +9188,7 @@ Must be one of: | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -9242,7 +9242,7 @@ Must be one of: | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | diff --git a/docs/Generic.json b/docs/Generic.json index b86d1307..6b150290 100644 --- a/docs/Generic.json +++ b/docs/Generic.json @@ -1,7 +1,7 @@ { "$defs": { "FileInfoObject": { - "description": "Filing information for any data object to be stored in a Deep Search database.", + "description": "Filing information for any data object to be stored in a Docling database.", "properties": { "filename": { "description": "The name of a persistent object that created this data object", @@ -26,7 +26,7 @@ "x-es-type": "keyword" }, "document-hash": { - "description": "A unique identifier of this data object within a collection of a Deep Search database", + "description": "A unique identifier of this data object within a collection of a Docling database", "title": "Document-Hash", "type": "string", "x-es-ignore_above": 8191, diff --git a/docs/Generic.md b/docs/Generic.md index 5c476db9..175187f3 100644 --- a/docs/Generic.md +++ b/docs/Generic.md @@ -163,7 +163,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of this data object within a collection of a Deep Search database +**Description:** A unique identifier of this data object within a collection of a Docling database
diff --git a/docs/Record.json b/docs/Record.json index f3326846..84ee5997 100644 --- a/docs/Record.json +++ b/docs/Record.json @@ -103,7 +103,7 @@ "predicates": { "description": "A list of characteristics (type, value, and name).", "items": { - "$ref": "#/$defs/Predicate__PredicateValueTypeT__PredicateKeyNameT__PredicateKeyTypeT_" + "$ref": "#/$defs/Predicate" }, "title": "Predicates", "type": "array" @@ -220,7 +220,7 @@ "type": "object" }, "FileInfoObject": { - "description": "Filing information for any data object to be stored in a Deep Search database.", + "description": "Filing information for any data object to be stored in a Docling database.", "properties": { "filename": { "description": "The name of a persistent object that created this data object", @@ -245,7 +245,7 @@ "x-es-type": "keyword" }, "document-hash": { - "description": "A unique identifier of this data object within a collection of a Deep Search database", + "description": "A unique identifier of this data object within a collection of a Docling database", "title": "Document-Hash", "type": "string", "x-es-ignore_above": 8191, @@ -297,7 +297,7 @@ }, "Identifier": { "additionalProperties": false, - "description": "Unique identifier of a Deep Search data object.", + "description": "Unique identifier of a Docling data object.", "properties": { "type": { "description": "A string representing a collection or database that contains this data object.", @@ -314,7 +314,7 @@ "x-es-type": "keyword" }, "_name": { - "description": "A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#).", + "description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).", "pattern": "^.+#.+$", "title": "_Name", "type": "string", @@ -330,40 +330,6 @@ "title": "Identifier", "type": "object" }, - "Identifier__SubjectNameTypeT_": { - 
"additionalProperties": false, - "properties": { - "type": { - "description": "A string representing a collection or database that contains this data object.", - "title": "Type", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - }, - "value": { - "description": "The identifier value of the data object within a collection or database.", - "title": "Value", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - }, - "_name": { - "description": "A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#).", - "pattern": "^.+#.+$", - "title": "_Name", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - } - }, - "required": [ - "type", - "value", - "_name" - ], - "title": "Identifier[~SubjectNameTypeT]", - "type": "object" - }, "Log": { "additionalProperties": false, "description": "Log entry to describe an ETL task on a document.", @@ -384,7 +350,7 @@ "x-es-type": "keyword" }, "agent": { - "description": "The Deep Search agent that performed the task, e.g., CCS or CXS.", + "description": "The Docling agent that performed the task, e.g., CCS or CXS.", "title": "Agent", "type": "string", "x-es-ignore_above": 8191, @@ -483,64 +449,15 @@ "title": "NumericalValue", "type": "object" }, - "PredicateKey__PredicateKeyNameT__PredicateKeyTypeT_": { - "additionalProperties": false, - "properties": { - "name": { - "description": "Name of the predicate key.", - "title": "Name", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - }, - "type": { - "description": "Type of predicate key.", - "title": "Type", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - } - }, - "required": [ - "name", - "type" - ], - "title": "PredicateKey[~PredicateKeyNameT, ~PredicateKeyTypeT]", - "type": "object" - }, - "PredicateValue__PredicateValueTypeT_": { - "additionalProperties": false, - "properties": 
{ - "name": { - "description": "Name of the predicate value (actual value).", - "title": "Name", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - }, - "type": { - "description": "Type of predicate value.", - "title": "Type", - "type": "string", - "x-es-ignore_above": 8191, - "x-es-type": "keyword" - } - }, - "required": [ - "name", - "type" - ], - "title": "PredicateValue[~PredicateValueTypeT]", - "type": "object" - }, - "Predicate__PredicateValueTypeT__PredicateKeyNameT__PredicateKeyTypeT_": { + "Predicate": { "additionalProperties": false, + "description": "Model for a predicate.", "properties": { "key": { - "$ref": "#/$defs/PredicateKey__PredicateKeyNameT__PredicateKeyTypeT_" + "$ref": "#/$defs/PredicateKey" }, "value": { - "$ref": "#/$defs/PredicateValue__PredicateValueTypeT_" + "$ref": "#/$defs/PredicateValue" }, "numerical_value": { "anyOf": [ @@ -624,7 +541,59 @@ "key", "value" ], - "title": "Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT]", + "title": "Predicate", + "type": "object" + }, + "PredicateKey": { + "additionalProperties": false, + "description": "Model for the key (unique identifier) of a predicate.", + "properties": { + "name": { + "description": "Name of the predicate key.", + "title": "Name", + "type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + }, + "type": { + "description": "Type of predicate key.", + "title": "Type", + "type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + } + }, + "required": [ + "name", + "type" + ], + "title": "PredicateKey", + "type": "object" + }, + "PredicateValue": { + "additionalProperties": false, + "description": "Model for the value of a predicate.", + "properties": { + "name": { + "description": "Name of the predicate value (actual value).", + "title": "Name", + "type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + }, + "type": { + "description": "Type of predicate value.", + "title": "Type", + 
"type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + } + }, + "required": [ + "name", + "type" + ], + "title": "PredicateValue", "type": "object" }, "ProvenanceItem": { @@ -819,7 +788,7 @@ "names": { "description": "List of given names for this subject. They may not be unique across different subjects.", "items": { - "$ref": "#/$defs/Identifier__SubjectNameTypeT_" + "$ref": "#/$defs/SubjectNameIdentifier" }, "title": "Names", "type": "array" @@ -867,6 +836,41 @@ "title": "Subject", "type": "object" }, + "SubjectNameIdentifier": { + "additionalProperties": false, + "description": "Identifier of subject names.", + "properties": { + "type": { + "description": "A string representing a collection or database that contains this data object.", + "title": "Type", + "type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + }, + "value": { + "description": "The identifier value of the data object within a collection or database.", + "title": "Value", + "type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + }, + "_name": { + "description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).", + "pattern": "^.+#.+$", + "title": "_Name", + "type": "string", + "x-es-ignore_above": 8191, + "x-es-type": "keyword" + } + }, + "required": [ + "type", + "value", + "_name" + ], + "title": "SubjectNameIdentifier", + "type": "object" + }, "TextValue": { "additionalProperties": false, "description": "Model for textual values.", diff --git a/docs/Record.md b/docs/Record.md index 8b861b7e..95f86a79 100644 --- a/docs/Record.md +++ b/docs/Record.md @@ -211,7 +211,7 @@ | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -265,7 +265,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | @@ -429,7 +429,7 @@ | **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") | | **Defined in** | #/$defs/FileInfoObject | -**Description:** Filing information for any data object to be stored in a Deep Search database. +**Description:** Filing information for any data object to be stored in a Docling database.
@@ -515,7 +515,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of this data object within a collection of a Deep Search database +**Description:** A unique identifier of this data object within a collection of a Docling database
@@ -643,7 +643,7 @@ | **Type** | `string` | | **Required** | Yes | -**Description:** The Deep Search agent that performed the task, e.g., CCS or CXS. +**Description:** The Docling agent that performed the task, e.g., CCS or CXS.
@@ -1447,22 +1447,24 @@ Must be one of: | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| ----------------------------------------------------- | ----------- | -| [Identifier__SubjectNameTypeT_](#subject_names_items) | - | +| Each item of this array must be | Description | +| --------------------------------------------- | ---------------------------- | +| [SubjectNameIdentifier](#subject_names_items) | Identifier of subject names. | -#### 5.4.1. Record > subject > names > Identifier__SubjectNameTypeT_ +#### 5.4.1. Record > subject > names > SubjectNameIdentifier | | | | ------------------------- | ------------------------------------------------------- | | **Type** | `object` | | **Required** | No | | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | -| **Defined in** | #/$defs/Identifier__SubjectNameTypeT_ | +| **Defined in** | #/$defs/SubjectNameIdentifier | + +**Description:** Identifier of subject names.
- 5.4.1.1. [Required] Property Record > subject > names > Identifier[~SubjectNameTypeT] > type + 5.4.1.1. [Required] Property Record > subject > names > SubjectNameIdentifier > type
@@ -1481,7 +1483,7 @@ Must be one of:
- 5.4.1.2. [Required] Property Record > subject > names > Identifier[~SubjectNameTypeT] > value + 5.4.1.2. [Required] Property Record > subject > names > SubjectNameIdentifier > value
@@ -1500,7 +1502,7 @@ Must be one of:
- 5.4.1.3. [Required] Property Record > subject > names > Identifier[~SubjectNameTypeT] > _name + 5.4.1.3. [Required] Property Record > subject > names > SubjectNameIdentifier > _name
@@ -1512,7 +1514,7 @@ Must be one of: | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | @@ -1566,9 +1568,9 @@ Must be one of: | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| ------------------------------------------------- | ----------------------------------------------- | -| [Identifier](#subject_identifiers_anyOf_i0_items) | Unique identifier of a Deep Search data object. | +| Each item of this array must be | Description | +| ------------------------------------------------- | ------------------------------------------- | +| [Identifier](#subject_identifiers_anyOf_i0_items) | Unique identifier of a Docling data object. | ##### 5.5.1.1. Record > subject > identifiers > anyOf > item 0 > Identifier @@ -1579,7 +1581,7 @@ Must be one of: | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -1633,7 +1635,7 @@ Must be one of: | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | @@ -2006,7 +2008,7 @@ Must be one of: | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -2060,7 +2062,7 @@ Must be one of: | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | @@ -2248,36 +2250,40 @@ Must be one of: | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| -------------------------------------------------------------------------------------------------------------------- | ----------- | -| [Predicate__PredicateValueTypeT__PredicateKeyNameT__PredicateKeyTypeT_](#attributes_anyOf_i0_items_predicates_items) | - | +| Each item of this array must be | Description | +| -------------------------------------------------------- | ---------------------- | +| [Predicate](#attributes_anyOf_i0_items_predicates_items) | Model for a predicate. | + +###### 6.1.1.3.1. Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate -###### 6.1.1.3.1. 
Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate__PredicateValueTypeT__PredicateKeyNameT__PredicateKeyTypeT_ +| | | +| ------------------------- | ------------------------------------------------------- | +| **Type** | `object` | +| **Required** | No | +| **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | +| **Defined in** | #/$defs/Predicate | -| | | -| ------------------------- | ----------------------------------------------------------------------------- | -| **Type** | `object` | -| **Required** | No | -| **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | -| **Defined in** | #/$defs/Predicate__PredicateValueTypeT__PredicateKeyNameT__PredicateKeyTypeT_ | +**Description:** Model for a predicate.
- 6.1.1.3.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > key + 6.1.1.3.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > key
-| | | -| ------------------------- | ----------------------------------------------------------- | -| **Type** | `object` | -| **Required** | Yes | -| **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | -| **Defined in** | #/$defs/PredicateKey__PredicateKeyNameT__PredicateKeyTypeT_ | +| | | +| ------------------------- | ------------------------------------------------------- | +| **Type** | `object` | +| **Required** | Yes | +| **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | +| **Defined in** | #/$defs/PredicateKey | + +**Description:** Model for the key (unique identifier) of a predicate.
- 6.1.1.3.1.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > key > name + 6.1.1.3.1.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > key > name
@@ -2296,7 +2302,7 @@ Must be one of:
- 6.1.1.3.1.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > key > type + 6.1.1.3.1.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > key > type
@@ -2318,7 +2324,7 @@ Must be one of:
- 6.1.1.3.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > value + 6.1.1.3.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > value
@@ -2328,11 +2334,13 @@ Must be one of: | **Type** | `object` | | **Required** | Yes | | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | -| **Defined in** | #/$defs/PredicateValue__PredicateValueTypeT_ | +| **Defined in** | #/$defs/PredicateValue | + +**Description:** Model for the value of a predicate.
- 6.1.1.3.1.2.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > value > name + 6.1.1.3.1.2.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > value > name
@@ -2351,7 +2359,7 @@ Must be one of:
- 6.1.1.3.1.2.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > value > type + 6.1.1.3.1.2.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > value > type
@@ -2373,7 +2381,7 @@ Must be one of:
- 6.1.1.3.1.3. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value + 6.1.1.3.1.3. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value
@@ -2394,7 +2402,7 @@ Must be one of:
-###### 6.1.1.3.1.3.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > NumericalValue` +###### 6.1.1.3.1.3.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > NumericalValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2407,7 +2415,7 @@ Must be one of:
- 6.1.1.3.1.3.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > NumericalValue > min + 6.1.1.3.1.3.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > NumericalValue > min
@@ -2424,7 +2432,7 @@ Must be one of:
- 6.1.1.3.1.3.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > NumericalValue > max + 6.1.1.3.1.3.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > NumericalValue > max
@@ -2441,7 +2449,7 @@ Must be one of:
- 6.1.1.3.1.3.1.3. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > NumericalValue > val + 6.1.1.3.1.3.1.3. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > NumericalValue > val
@@ -2458,7 +2466,7 @@ Must be one of:
- 6.1.1.3.1.3.1.4. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > NumericalValue > err + 6.1.1.3.1.3.1.4. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > NumericalValue > err
@@ -2475,7 +2483,7 @@ Must be one of:
- 6.1.1.3.1.3.1.5. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > NumericalValue > unit + 6.1.1.3.1.3.1.5. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > NumericalValue > unit
@@ -2493,7 +2501,7 @@ Must be one of:
-###### 6.1.1.3.1.3.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value > anyOf > item 1` +###### 6.1.1.3.1.3.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value > anyOf > item 1` | | | | ------------ | ------ | @@ -2509,7 +2517,7 @@ Must be one of:
- 6.1.1.3.1.4. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si + 6.1.1.3.1.4. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si
@@ -2530,7 +2538,7 @@ Must be one of:
-###### 6.1.1.3.1.4.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > NumericalValue` +###### 6.1.1.3.1.4.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > NumericalValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2543,7 +2551,7 @@ Must be one of:
- 6.1.1.3.1.4.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > NumericalValue > min + 6.1.1.3.1.4.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > NumericalValue > min
@@ -2560,7 +2568,7 @@ Must be one of:
- 6.1.1.3.1.4.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > NumericalValue > max + 6.1.1.3.1.4.1.2. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > NumericalValue > max
@@ -2577,7 +2585,7 @@ Must be one of:
- 6.1.1.3.1.4.1.3. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > NumericalValue > val + 6.1.1.3.1.4.1.3. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > NumericalValue > val
@@ -2594,7 +2602,7 @@ Must be one of:
- 6.1.1.3.1.4.1.4. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > NumericalValue > err + 6.1.1.3.1.4.1.4. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > NumericalValue > err
@@ -2611,7 +2619,7 @@ Must be one of:
- 6.1.1.3.1.4.1.5. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > NumericalValue > unit + 6.1.1.3.1.4.1.5. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > NumericalValue > unit
@@ -2629,7 +2637,7 @@ Must be one of:
-###### 6.1.1.3.1.4.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > numerical_value_si > anyOf > item 1` +###### 6.1.1.3.1.4.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > numerical_value_si > anyOf > item 1` | | | | ------------ | ------ | @@ -2645,7 +2653,7 @@ Must be one of:
- 6.1.1.3.1.5. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > nominal_value + 6.1.1.3.1.5. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > nominal_value
@@ -2666,7 +2674,7 @@ Must be one of:
-###### 6.1.1.3.1.5.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > nominal_value > anyOf > NominalValue` +###### 6.1.1.3.1.5.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > nominal_value > anyOf > NominalValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2679,7 +2687,7 @@ Must be one of:
- 6.1.1.3.1.5.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > nominal_value > anyOf > NominalValue > value + 6.1.1.3.1.5.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > nominal_value > anyOf > NominalValue > value
@@ -2697,7 +2705,7 @@ Must be one of:
-###### 6.1.1.3.1.5.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > nominal_value > anyOf > item 1` +###### 6.1.1.3.1.5.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > nominal_value > anyOf > item 1` | | | | ------------ | ------ | @@ -2713,7 +2721,7 @@ Must be one of:
- 6.1.1.3.1.6. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > text_value + 6.1.1.3.1.6. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > text_value
@@ -2734,7 +2742,7 @@ Must be one of:
-###### 6.1.1.3.1.6.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > text_value > anyOf > TextValue` +###### 6.1.1.3.1.6.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > text_value > anyOf > TextValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2747,7 +2755,7 @@ Must be one of:
- 6.1.1.3.1.6.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > text_value > anyOf > TextValue > value + 6.1.1.3.1.6.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > text_value > anyOf > TextValue > value
@@ -2765,7 +2773,7 @@ Must be one of:
-###### 6.1.1.3.1.6.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > text_value > anyOf > item 1` +###### 6.1.1.3.1.6.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > text_value > anyOf > item 1` | | | | ------------ | ------ | @@ -2781,7 +2789,7 @@ Must be one of:
- 6.1.1.3.1.7. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > boolean_value + 6.1.1.3.1.7. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > boolean_value
@@ -2802,7 +2810,7 @@ Must be one of:
-###### 6.1.1.3.1.7.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > boolean_value > anyOf > BooleanValue` +###### 6.1.1.3.1.7.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > boolean_value > anyOf > BooleanValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2815,7 +2823,7 @@ Must be one of:
- 6.1.1.3.1.7.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > boolean_value > anyOf > BooleanValue > value + 6.1.1.3.1.7.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > boolean_value > anyOf > BooleanValue > value
@@ -2833,7 +2841,7 @@ Must be one of:
-###### 6.1.1.3.1.7.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > boolean_value > anyOf > item 1` +###### 6.1.1.3.1.7.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > boolean_value > anyOf > item 1` | | | | ------------ | ------ | @@ -2849,7 +2857,7 @@ Must be one of:
- 6.1.1.3.1.8. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > datetime_value + 6.1.1.3.1.8. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > datetime_value
@@ -2870,7 +2878,7 @@ Must be one of:
-###### 6.1.1.3.1.8.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > datetime_value > anyOf > DatetimeValue` +###### 6.1.1.3.1.8.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > datetime_value > anyOf > DatetimeValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2883,7 +2891,7 @@ Must be one of:
- 6.1.1.3.1.8.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > datetime_value > anyOf > DatetimeValue > value + 6.1.1.3.1.8.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > datetime_value > anyOf > DatetimeValue > value
@@ -2902,7 +2910,7 @@ Must be one of:
-###### 6.1.1.3.1.8.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > datetime_value > anyOf > item 1` +###### 6.1.1.3.1.8.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > datetime_value > anyOf > item 1` | | | | ------------ | ------ | @@ -2918,7 +2926,7 @@ Must be one of:
- 6.1.1.3.1.9. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value + 6.1.1.3.1.9. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value
@@ -2939,7 +2947,7 @@ Must be one of:
-###### 6.1.1.3.1.9.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > GeopointValue` +###### 6.1.1.3.1.9.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > GeopointValue` | | | | ------------------------- | ------------------------------------------------------- | @@ -2952,7 +2960,7 @@ Must be one of:
- 6.1.1.3.1.9.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > GeopointValue > value + 6.1.1.3.1.9.1.1. [Required] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > GeopointValue > value
@@ -2976,7 +2984,7 @@ Must be one of: | ---------------------------------------------------------------------------------------------- | ----------- | | [value items](#attributes_anyOf_i0_items_predicates_items_geopoint_value_anyOf_i0_value_items) | - | -###### 6.1.1.3.1.9.1.1.1. Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > GeopointValue > value > value items +###### 6.1.1.3.1.9.1.1.1. Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > GeopointValue > value > value items | | | | ------------ | -------- | @@ -2988,7 +2996,7 @@ Must be one of:
- 6.1.1.3.1.9.1.2. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > GeopointValue > conf + 6.1.1.3.1.9.1.2. [Optional] Property Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > GeopointValue > conf
@@ -3011,7 +3019,7 @@ Must be one of:
-###### 6.1.1.3.1.9.1.2.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > GeopointValue > conf > anyOf > item 0` +###### 6.1.1.3.1.9.1.2.1. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > GeopointValue > conf > anyOf > item 0` | | | | ------------ | -------- | @@ -3026,7 +3034,7 @@ Must be one of:
-###### 6.1.1.3.1.9.1.2.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > GeopointValue > conf > anyOf > item 1` +###### 6.1.1.3.1.9.1.2.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > GeopointValue > conf > anyOf > item 1` | | | | ------------ | ------ | @@ -3043,7 +3051,7 @@ Must be one of:
-###### 6.1.1.3.1.9.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate[~PredicateValueTypeT, ~PredicateKeyNameT, ~PredicateKeyTypeT] > geopoint_value > anyOf > item 1` +###### 6.1.1.3.1.9.2. Property `Record > attributes > anyOf > item 0 > Attribute > predicates > Predicate > geopoint_value > anyOf > item 1` | | | | ------------ | ------ | @@ -3170,9 +3178,9 @@ Must be one of: | **Additional items** | False | | **Tuple validation** | See below | -| Each item of this array must be | Description | -| ----------------------------------------- | ----------------------------------------------- | -| [Identifier](#identifiers_anyOf_i0_items) | Unique identifier of a Deep Search data object. | +| Each item of this array must be | Description | +| ----------------------------------------- | ------------------------------------------- | +| [Identifier](#identifiers_anyOf_i0_items) | Unique identifier of a Docling data object. | #### 8.1.1. Record > identifiers > anyOf > item 0 > Identifier @@ -3183,7 +3191,7 @@ Must be one of: | **Additional properties** | [[Not allowed]](# "Additional Properties not allowed.") | | **Defined in** | #/$defs/Identifier | -**Description:** Unique identifier of a Deep Search data object. +**Description:** Unique identifier of a Docling data object.
@@ -3237,7 +3245,7 @@ Must be one of: | **Type** | `string` | | **Required** | Yes | -**Description:** A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#). +**Description:** A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#). | Restrictions | | | --------------------------------- | ------------------------------------------------------------------- | diff --git a/pyproject.toml b/pyproject.toml index 350f4446..c69694e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,5 +110,5 @@ python_version = "3.9" plugins = ["pydantic.mypy"] [[tool.mypy.overrides]] -module = ["jsonref.*", "jsonschema.*", "json_schema_for_humans.*"] +module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*"] ignore_missing_imports = true diff --git a/test/data/json_schemas/base_identifier.json b/test/data/json_schemas/base_identifier.json index 3160a225..c2cb2233 100644 --- a/test/data/json_schemas/base_identifier.json +++ b/test/data/json_schemas/base_identifier.json @@ -1,6 +1,6 @@ { "title": "Identifier", - "description": "Unique identifier of a Deep Search data object.", + "description": "Unique identifier of a Docling data object.", "type": "object", "properties": { "type": { @@ -19,7 +19,7 @@ }, "_name": { "title": "_Name", - "description": "A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#).", + "description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).", "x-es-type": "keyword", "x-es-ignore_above": 8191, "pattern": "^.+#.+$", diff --git a/test/data/json_schemas/base_log.json b/test/data/json_schemas/base_log.json index 8ace2399..6dc8cd71 100644 --- 
a/test/data/json_schemas/base_log.json +++ b/test/data/json_schemas/base_log.json @@ -18,7 +18,7 @@ "x-es-type": "keyword" }, "agent": { - "description": "The Deep Search agent that performed the task, e.g., CCS or CXS.", + "description": "The Docling agent that performed the task, e.g., CCS or CXS.", "title": "Agent", "type": "string", "x-es-ignore_above": 8191, diff --git a/test/data/rec/statement-02.json b/test/data/rec/statement-02.json index 7a332e6e..a28257d4 100644 --- a/test/data/rec/statement-02.json +++ b/test/data/rec/statement-02.json @@ -51,7 +51,7 @@ ], "type": "statement", "subtype": "mat_to_prop_to_pvls", - "model": "Deep Search Model 0.0.0", + "model": "Docling Model 0.0.0", "source": "sentence.3", "match": "89f0d4058c2483678b2cc4f515acf463", "range": [ diff --git a/test/test_base.py b/test/test_base.py index 879f77c7..89cda5a1 100644 --- a/test/test_base.py +++ b/test/test_base.py @@ -5,19 +5,19 @@ """Test the pydantic models in module data_types.base.py.""" import json +from datetime import datetime, timezone +from typing import Literal import pytest -from datetime import datetime, timezone from pydantic import BaseModel, ValidationError -from typing import Literal from docling_core.types.base import ( - Identifier, - Log, - FileInfoObject, - CollectionInfo, CollectionDocumentInfo, + CollectionInfo, CollectionRecordInfo, + FileInfoObject, + Identifier, + Log, StrictDateTime, ) from docling_core.types.doc.document import CCSDocumentDescription @@ -31,7 +31,9 @@ def test_identifier(): # dict(): important to set by_alias=True, if the model has aliases assert data.model_dump(by_alias=True) == gold_dict - assert data.model_dump_json(by_alias=True, indent=2) == json.dumps(gold_dict, indent=2) + assert data.model_dump_json(by_alias=True, indent=2) == json.dumps( + gold_dict, indent=2 + ) # schema_json(): no need to set by_alias since it is True by the default tf = open("test/data/json_schemas/base_identifier.json") @@ -71,16 +73,21 @@ def test_log(): 
Log(agent="CXS", type="annotation", date=datetime.now()) Log( - task="run 3", agent="CXS", type="annotation", comment="UCMI 3.10", - date="2021-11-03T04:42:54.844631+00:00") + task="run 3", + agent="CXS", + type="annotation", + comment="UCMI 3.10", + date="2021-11-03T04:42:54.844631+00:00", + ) data = Log( - task=None, agent="CXS", type="parsing", - date="2021-11-03T04:42:54.844631+00:00") + task=None, agent="CXS", type="parsing", date="2021-11-03T04:42:54.844631+00:00" + ) gold_dict = { "agent": "CXS", "type": "parsing", - "date": "2021-11-03T04:42:54.844631+00:00"} + "date": "2021-11-03T04:42:54.844631+00:00", + } # None values will be exported, use exclude_none=True to export clean assert data.model_dump() != gold_dict assert data.model_dump(exclude_none=True, by_alias=True) == gold_dict @@ -90,13 +97,20 @@ def test_log(): # Models that inherit from AliasModel will generate data with alias field names assert Log(**gold_dict).model_dump(exclude_unset=True) == gold_dict # ***Best practice***: exclude_unset=True, exclude_none=True, by_alias=True - assert Log(**gold_dict).model_dump(exclude_unset=True, exclude_none=True, by_alias=True) == gold_dict + assert ( + Log(**gold_dict).model_dump( + exclude_unset=True, exclude_none=True, by_alias=True + ) + == gold_dict + ) with open("test/data/json_schemas/base_log.json") as tf: gold_json_schema = json.load(tf) assert Log.model_json_schema() == gold_json_schema - with pytest.raises(ValidationError, match="Value type must be a datetime or a non-numeric string"): + with pytest.raises( + ValidationError, match="Value type must be a datetime or a non-numeric string" + ): Log(agent="CXS", type="annotation", date=123456789) @@ -105,7 +119,8 @@ def test_file_info_object(): gold_dict = { "filename": "document.pdf", "filename-prov": "http:www.ibm.com", - "document-hash": "PnNF3Fhr22nJH4a"} + "document-hash": "PnNF3Fhr22nJH4a", + } data = FileInfoObject(**gold_dict) # dictionaries and JSON exports need to explicitly use aliases, but 
children from AliasModel don't. assert data.model_dump(by_alias=True) == gold_dict @@ -113,7 +128,9 @@ def test_file_info_object(): gold_dict.pop("filename-prov") gold_json = json.dumps(gold_dict) - FileInfoObject(**gold_dict).model_dump_json(exclude_unset=True, exclude_none=True) == gold_json + FileInfoObject(**gold_dict).model_dump_json( + exclude_unset=True, exclude_none=True + ) == gold_json # creating an instance with input variables requires the use of field names. Since # document-hash is an invalid function parameter name, 'populate_by_name' needs to @@ -129,7 +146,7 @@ def test_collection_info(): "name": "patent USPTO", "type": "Document", "version": "3.2.0", - "alias": ["patent"] + "alias": ["patent"], } data = CollectionInfo(**gold_dict) assert data.model_dump(exclude_unset=True, exclude_none=True) == gold_dict @@ -139,7 +156,7 @@ def test_collection_info(): "name": "patent USPTO", "type": "experiment", "version": "3.2.0", - "alias": ["simulation"] + "alias": ["simulation"], } with pytest.raises(ValidationError, match="type"): CollectionInfo(**gold_dict) @@ -149,18 +166,23 @@ def test_collection_info(): "name": "patent USPTO", "type": "Document", "version": "3.2.0", - "alias": None - } - clean_dict = { - "name": "patent USPTO", - "type": "Document", - "version": "3.2.0" + "alias": None, } + clean_dict = {"name": "patent USPTO", "type": "Document", "version": "3.2.0"} data = CollectionInfo(**input_dict) - assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) != input_dict - assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == clean_dict + assert ( + data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) + != input_dict + ) + assert ( + data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) + == clean_dict + ) data = CollectionInfo(**clean_dict) - assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == clean_dict + assert ( + 
data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) + == clean_dict + ) def test_collection_document_info(): @@ -169,22 +191,30 @@ def test_collection_document_info(): "name": "patent USPTO", "type": "Document", "version": "3.2.0", - "alias": ["patent"] + "alias": ["patent"], } data = CollectionDocumentInfo(**gold_dict) - assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == gold_dict + assert ( + data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) + == gold_dict + ) # within dictionary desc_dict = { - "logs": [{ - "date": "2021-11-03T04:42:54.844631+00:00", - "agent": "CXS", - "type": "parsing"}], + "logs": [ + { + "date": "2021-11-03T04:42:54.844631+00:00", + "agent": "CXS", + "type": "parsing", + } + ], "collection": { "name": "patent USPTO", "type": "Document", "version": "3.2.0", - "alias": ["patent"]}} + "alias": ["patent"], + }, + } CCSDocumentDescription(**desc_dict) desc_dict["collection"]["type"] = "Record" @@ -198,22 +228,30 @@ def test_collection_record_info(): "name": "PubChem", "type": "Record", "version": "3.2.0", - "alias": ["chemical", "Material Sciences"] + "alias": ["chemical", "Material Sciences"], } data = CollectionRecordInfo(**gold_dict) - assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == gold_dict + assert ( + data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) + == gold_dict + ) # within dictionary desc_dict = { - "logs": [{ - "date": "2021-11-03T04:42:54.844631+00:00", - "agent": "CXS", - "type": "parsing"}], + "logs": [ + { + "date": "2021-11-03T04:42:54.844631+00:00", + "agent": "CXS", + "type": "parsing", + } + ], "collection": { "name": "PubChem", "type": "Record", "version": "3.2.0", - "alias": ["chemical", "Material Sciences"]}} + "alias": ["chemical", "Material Sciences"], + }, + } RecordDescription(**desc_dict) desc_dict["collection"]["type"] = "Document" diff --git a/test/test_collection.py b/test/test_collection.py index 
6e45c5bc..a50d959d 100644 --- a/test/test_collection.py +++ b/test/test_collection.py @@ -5,37 +5,35 @@ """Test the pydantic models in module types.""" import glob + import pytest from pydantic import ValidationError -from docling_core.types import Generic, Document, Record +from docling_core.types import Document, Generic, Record def test_generic(): """Test the Generic model.""" - input_generic_0 = { + input_generic_0 = { "file-info": { "filename": "abc.xml", "filename-prov": "abc.xml.zip", - "document-hash": "123457889" - }, + "document-hash": "123457889", + }, "_name": "The ABC doc", - "custom": ["The custom ABC content 1.", "The custom ABC content 2."] - } + "custom": ["The custom ABC content 1.", "The custom ABC content 2."], + } Generic.model_validate(input_generic_0) - input_generic_1 = { - "file-info": { - "filename": "abc.xml", - "document-hash": "123457889" - }, - "_name": "The ABC doc" - } + input_generic_1 = { + "file-info": {"filename": "abc.xml", "document-hash": "123457889"}, + "_name": "The ABC doc", + } Generic.model_validate(input_generic_1) - input_generic_2 = { + input_generic_2 = { "_name": "The ABC doc", - "custom": ["The custom ABC content 1.", "The custom ABC content 2."] + "custom": ["The custom ABC content 1.", "The custom ABC content 2."], } with pytest.raises(ValidationError): Generic.model_validate(input_generic_2) diff --git a/test/test_doc_base.py b/test/test_doc_base.py index 33348974..d5a48ff5 100644 --- a/test/test_doc_base.py +++ b/test/test_doc_base.py @@ -3,10 +3,11 @@ # SPDX-License-Identifier: MIT # -from typing import ByteString -from pydantic import ValidationError + import pytest -from docling_core.types.doc.base import S3Reference, Prov +from pydantic import ValidationError + +from docling_core.types.doc.base import Prov, S3Reference def test_s3_reference(): @@ -20,12 +21,18 @@ def test_s3_reference(): with pytest.raises(ValidationError, match="required"): S3Reference() + def test_prov(): prov = { - "bbox": 
[48.19645328521729, 644.2883926391602, 563.6185592651367, 737.4546043395997], - "page": 2, - "span": [0, 0] - } + "bbox": [ + 48.19645328521729, + 644.2883926391602, + 563.6185592651367, + 737.4546043395997, + ], + "page": 2, + "span": [0, 0], + } assert Prov(**prov) diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py index 15bb4223..a899b084 100644 --- a/test/test_doc_schema.py +++ b/test/test_doc_schema.py @@ -4,8 +4,10 @@ # """Test the pydantic models in module data_types.ccs.""" -import json import glob +import json +from typing import Optional + import pytest from pydantic import BaseModel, ValidationError @@ -18,7 +20,7 @@ from docling_core.types.doc.document import ( CCSDocument, CCSDocumentDescription, - Publication + Publication, ) @@ -46,21 +48,29 @@ def test_ccs_document(): except ValidationError as e: for error in e.errors(): print(type(error)) - assert all(item in error["loc"] for item in ('description', 'logs')), \ - f"Data in file {filename} should fail in logs" + assert all( + item in error["loc"] for item in ("description", "logs") + ), f"Data in file {filename} should fail in logs" # check doc-error-2 is invalid for missing page-hashes - with pytest.raises(ValidationError, match="page-hashes"), open("test/data/doc/error-2.json") as file_obj: + with ( + pytest.raises(ValidationError, match="page-hashes"), + open("test/data/doc/error-2.json") as file_obj, + ): file_json = file_obj.read() CCSDocument.model_validate_json(file_json) # check doc-error-3 is invalid for wrong types in citation_count and reference_count - with pytest.raises(ValidationError, match="count"), open("test/data/doc/error-3.json") as file_obj: + with ( + pytest.raises(ValidationError, match="count"), + open("test/data/doc/error-3.json") as file_obj, + ): file_json = file_obj.read() CCSDocument.model_validate_json(file_json) + def test_publication_journal(): - """"Validate data with Publication model.""" + """ "Validate data with Publication model.""" for filename in 
glob.glob("test/data/doc/intermediates/publication_*.json"): with open(filename) as file_obj: file_json = file_obj.read() @@ -84,12 +94,14 @@ def test_description_advanced_t(): # any dictionary is valid, since it is not parametrized CCSDocumentDescription(**desc, advanced={"serial": "CXS12345"}) CCSDocumentDescription(**desc, advanced={0: "CXS12345"}) - with pytest.raises(ValidationError, match="should be a valid dictionary or instance of BaseModel"): + with pytest.raises( + ValidationError, match="should be a valid dictionary or instance of BaseModel" + ): CCSDocumentDescription(**desc, advanced=False) class MyAdvanced(BaseModel): serial: str - comment: str | None = None + comment: Optional[str] = None # with a model and bound specification adv_inst = MyAdvanced(serial="CXS12345", comment="public document") @@ -99,13 +111,37 @@ class MyAdvanced(BaseModel): # with a model and generic type specification advanced = MyAdvanced(serial="CXS12345", comment="public document") - CCSDocumentDescription[MyAdvanced, DescriptionAnalyticsT, IdentifierTypeT, LanguageT, CollectionNameTypeT](**desc) - CCSDocumentDescription[MyAdvanced, DescriptionAnalyticsT, IdentifierTypeT, LanguageT, CollectionNameTypeT](**desc, advanced=adv_inst) + CCSDocumentDescription[ + MyAdvanced, + DescriptionAnalyticsT, + IdentifierTypeT, + LanguageT, + CollectionNameTypeT, + ](**desc) + CCSDocumentDescription[ + MyAdvanced, + DescriptionAnalyticsT, + IdentifierTypeT, + LanguageT, + CollectionNameTypeT, + ](**desc, advanced=adv_inst) with pytest.raises(ValidationError, match="Field required"): - CCSDocumentDescription[MyAdvanced, DescriptionAnalyticsT, IdentifierTypeT, LanguageT, CollectionNameTypeT](**desc, advanced = {}) + CCSDocumentDescription[ + MyAdvanced, + DescriptionAnalyticsT, + IdentifierTypeT, + LanguageT, + CollectionNameTypeT, + ](**desc, advanced={}) # deriving a new type - MyDocument = CCSDocumentDescription[MyAdvanced, DescriptionAnalyticsT, IdentifierTypeT, LanguageT, 
CollectionNameTypeT] + MyDocument = CCSDocumentDescription[ + MyAdvanced, + DescriptionAnalyticsT, + IdentifierTypeT, + LanguageT, + CollectionNameTypeT, + ] MyDocument.model_validate(desc) desc["advanced"] = advanced MyDocument.model_validate(desc) diff --git a/test/test_doc_schema_extractor.py b/test/test_doc_schema_extractor.py index 736a1889..9f1f9d93 100644 --- a/test/test_doc_schema_extractor.py +++ b/test/test_doc_schema_extractor.py @@ -4,12 +4,12 @@ # """Test the pydantic models in module data_types.ccs.""" -from pydantic import ValidationError import json -from docling_core.types.doc.document import ( - CCSDocument, -) +from pydantic import ValidationError + +from docling_core.types.doc.document import CCSDocument + def test_ccs_document_update(): """Validate data with CCSDocument extract.""" diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py index 73b85e54..413e5b2a 100644 --- a/test/test_json_schema_to_search_mapper.py +++ b/test/test_json_schema_to_search_mapper.py @@ -4,17 +4,19 @@ # """Test the methods in module search.json_schema_to_search_mapper.""" -from docling_core.types.doc.document import ExportedCCSDocument -from docling_core.types.rec.record import Record -from docling_core.search.json_schema_to_search_mapper import JsonSchemaToSearchMapper - import json import os + import jsondiff +from docling_core.search.json_schema_to_search_mapper import JsonSchemaToSearchMapper +from docling_core.types.doc.document import ExportedCCSDocument +from docling_core.types.rec.record import Record + + def _load(filename): doc = {} - with open(filename, 'r') as fid: + with open(filename, "r") as fid: doc = json.load(fid) return doc @@ -44,44 +46,39 @@ def test_json_schema_to_search_mapper_0(): assert index_def is not None filename = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - 'data/json_schemas/document-ref.json' - ) + os.path.join(os.path.dirname(__file__), 
"data/json_schemas/document-ref.json") ) index_ref = _load(filename) diff = jsondiff.diff(index_ref, index_def) - print(json.dumps(index_def,indent=2)) + print(json.dumps(index_def, indent=2)) print(diff) assert index_def == index_ref - def test_json_schema_to_search_mapper_1(): """Test the class JsonSchemaToSearchMapper.""" s = Record.model_json_schema() print(json.dumps(s, indent=2)) _meta = { - "aliases": [".production","ccc"], + "aliases": [".production", "ccc"], "created": "2022-11-03T11:22:32.432+00:00", "description": "description of the collection", "source": "https://ccc", "storage": "storage location", "display_name": "display name", "type": "Record", - "classification": ["Public","PI"], - "version": [{"name": "my-library","version": "0.1.0-post.6+ed04c14"}, - {"name": "docling-core","version": "0.1.0"}], - "document_license": { - "code": ["NO-CC CODE","CC BY"], - "text": [ ] - }, + "classification": ["Public", "PI"], + "version": [ + {"name": "my-library", "version": "0.1.0-post.6+ed04c14"}, + {"name": "docling-core", "version": "0.1.0"}, + ], + "document_license": {"code": ["NO-CC CODE", "CC BY"], "text": []}, "license": "https://www.ccc", "filename": "ccc-gs.json", "domain": ["Healthcare & Life Sciences"], - "$ref": "ccs:schemas#/Document" + "$ref": "ccs:schemas#/Document", } mapper = JsonSchemaToSearchMapper( @@ -97,10 +94,7 @@ def test_json_schema_to_search_mapper_1(): assert index_def is not None filename = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - 'data/json_schemas/dbrecord-ref.json' - ) + os.path.join(os.path.dirname(__file__), "data/json_schemas/dbrecord-ref.json") ) index_ref = _load(filename) diff --git a/test/test_nlp_qa.py b/test/test_nlp_qa.py index 45bf13aa..27c671df 100644 --- a/test/test_nlp_qa.py +++ b/test/test_nlp_qa.py @@ -4,14 +4,15 @@ # """Test the pydantic models in module data_types.nlp.qa.py""" -import unittest import glob +import unittest + import pytest -from typing import Literal from pydantic import 
ValidationError from docling_core.types.nlp.qa import QAPair + class TestQAPair(unittest.TestCase): """Test QAPair model.""" @@ -29,11 +30,17 @@ def test_qapair_read(self): def test_qapair_wrong(self): """Validates wrong format from files.""" filename = "test/data/nlp/error-qa-1.json" - with pytest.raises(ValidationError, match="Input should be a valid string"), open(filename) as file_obj: + with ( + pytest.raises(ValidationError, match="Input should be a valid string"), + open(filename) as file_obj, + ): file_json = file_obj.read() QAPair.model_validate_json(file_json) filename = "test/data/nlp/error-qa-3.json" - with pytest.raises(ValidationError, match="List must be unique"), open(filename) as file_obj: + with ( + pytest.raises(ValidationError, match="List must be unique"), + open(filename) as file_obj, + ): file_json = file_obj.read() QAPair.model_validate_json(file_json) diff --git a/test/test_rec_schema.py b/test/test_rec_schema.py index a9dd3f86..ae7bcd0f 100644 --- a/test/test_rec_schema.py +++ b/test/test_rec_schema.py @@ -4,17 +4,18 @@ # """Test the pydantic models in module data_types.cxs.""" -import unittest import glob -import pytest +import unittest from typing import Literal + +import pytest from pydantic import ValidationError -from docling_core.types.rec.predicate import Predicate from docling_core.types.rec.attribute import Attribute -from docling_core.types.rec.subject import Subject -from docling_core.types.rec.statement import Statement +from docling_core.types.rec.predicate import Predicate from docling_core.types.rec.record import Record +from docling_core.types.rec.statement import Statement +from docling_core.types.rec.subject import Subject class TestCxsModel(unittest.TestCase): @@ -31,12 +32,18 @@ def test_predicates(self): def test_predicates_wrong(self): filename = "test/data/rec/error-predicate-01.json" - with pytest.raises(ValidationError, match="invalid latitude"), open(filename) as file_obj: + with ( + pytest.raises(ValidationError, 
match="invalid latitude"), + open(filename) as file_obj, + ): file_json = file_obj.read() Predicate.model_validate_json(file_json) filename = "test/data/rec/error-predicate-02.json" - with pytest.raises(ValidationError, match="geopoint_value.conf"), open(filename) as file_obj: + with ( + pytest.raises(ValidationError, match="geopoint_value.conf"), + open(filename) as file_obj, + ): file_json = file_obj.read() Predicate.model_validate_json(file_json) @@ -72,7 +79,9 @@ def test_subjects(self): def test_subjects2(self): """Validate data with Subject schema.""" # IdentifierTypeT, SubjectTypeT, SubjectNameTypeT - subject = Subject[Literal["db"],Literal["material"],Literal["chemical_name","sum_formula"]] + subject = Subject[ + Literal["db"], Literal["material"], Literal["chemical_name", "sum_formula"] + ] for filename in glob.glob("test/data/rec/subject-*.json"): try: with open(filename) as file_obj: @@ -85,17 +94,25 @@ def test_subjects2(self): def test_subjects_wrong(self): """Validate data with Subject schema.""" # IdentifierTypeT, SubjectTypeT, SubjectNameTypeT - subject = Subject[Literal["db_"],Literal["material"],Literal["chemical_name","sum_formula"]] + subject = Subject[ + Literal["db_"], Literal["material"], Literal["chemical_name", "sum_formula"] + ] for filename in glob.glob("test/data/rec/subject-*.json"): with self.assertRaises(ValidationError), open(filename) as file_obj: file_json = file_obj.read() subject.model_validate_json(file_json) - subject = Subject[Literal["db"],Literal["material_"],Literal["chemical_name","sum_formula"]] + subject = Subject[ + Literal["db"], Literal["material_"], Literal["chemical_name", "sum_formula"] + ] for filename in glob.glob("test/data/rec/subject-*.json"): with self.assertRaises(ValidationError), open(filename) as file_obj: file_json = file_obj.read() subject.model_validate_json(file_json) - subject = Subject[Literal["db"],Literal["material"],Literal["chemical_name_","sum_formula_"]] + subject = Subject[ + Literal["db"], + 
Literal["material"], + Literal["chemical_name_", "sum_formula_"], + ] for filename in glob.glob("test/data/rec/subject-*.json"): with self.assertRaises(ValidationError), open(filename) as file_obj: file_json = file_obj.read() @@ -126,14 +143,14 @@ def test_records(self): def test_records_2(self): """Validate data with Record schema by passing type parameters.""" record = Record[ - Literal["db"], # IdentifierTypeT, - Literal["property-value"], # PredicateValueTypeT - Literal["Tc","pressure"], # PredicateKeyNameT - Literal["property"], # PredicateKeyTypeT - Literal["sentence"], # ProvenanceTypeT - Literal["material"], # SubjectTypeT - Literal["chemical_name","sum_formula"], # SubjectNameTypeT - Literal["DB", "Chemicals", "ChemDatabase"], # CollectionNameTypeT + Literal["db"], # IdentifierTypeT, + Literal["property-value"], # PredicateValueTypeT + Literal["Tc", "pressure"], # PredicateKeyNameT + Literal["property"], # PredicateKeyTypeT + Literal["sentence"], # ProvenanceTypeT + Literal["material"], # SubjectTypeT + Literal["chemical_name", "sum_formula"], # SubjectNameTypeT + Literal["DB", "Chemicals", "ChemDatabase"], # CollectionNameTypeT ] for filename in glob.glob("test/data/rec/record-01.json"): try: @@ -147,22 +164,49 @@ def test_records_2(self): def test_records_3(self): """Validate data with Record schema by passing complex type parameters.""" record = Record[ - Literal[ - "arxivid", "cid", "cod", "doi", "db", "ent_id"], # IdentifierTypeT + Literal["arxivid", "cid", "cod", "doi", "db", "ent_id"], # IdentifierTypeT Literal["property-value"], # PredicateValueTypeT Literal[ - "space group", "cell symmetry", "cell length a", "cell length b", "cell length c", - "cell angle alpha", "cell angle beta", "cell angle gamma", "molecular weight", - "melting point", "boiling point", "density", "solubility", "temperature", - "solvent", "km_value", "turnover_number", "ph_optimum", "temperature_optimum", - "material-shape", "molecular", "material-state", "triangular lattice", 
"magnetic", - "hexagonal", "multi layer", "pressure",], # PredicateKeyNameT + "space group", + "cell symmetry", + "cell length a", + "cell length b", + "cell length c", + "cell angle alpha", + "cell angle beta", + "cell angle gamma", + "molecular weight", + "melting point", + "boiling point", + "density", + "solubility", + "temperature", + "solvent", + "km_value", + "turnover_number", + "ph_optimum", + "temperature_optimum", + "material-shape", + "molecular", + "material-state", + "triangular lattice", + "magnetic", + "hexagonal", + "multi layer", + "pressure", + ], # PredicateKeyNameT Literal["property"], # PredicateKeyTypeT Literal["url", "sentence"], # ProvenanceTypeT Literal["material"], # SubjectTypeT Literal[ - "chemical_name", "iupac_name", "sum_formula", "protein_name", "organism_name", - "taxon", "enzyme_class",], # SubjectNameTypeT, + "chemical_name", + "iupac_name", + "sum_formula", + "protein_name", + "organism_name", + "taxon", + "enzyme_class", + ], # SubjectNameTypeT, Literal["DB", "Chemicals", "ChemDatabase"], # CollectionNameTypeT ] try: @@ -174,32 +218,31 @@ def test_records_3(self): print(f"Validation error in file {filename}:\n{e.json()}") raise - def test_records_wrong(self): """Validate data with Record schema.""" record = Record[ - Literal["db"], # IdentifierTypeT, - Literal["property-value"], # PredicateValueTypeT - Literal["Tc","pressure"], # PredicateKeyNameT - Literal["property"], # PredicateKeyTypeT - Literal["database"], # ProvenanceTypeT - Literal["material"], # SubjectTypeT - Literal["chemical_name","sum_formula"], # SubjectNameTypeT - Literal["DB", "Chemicals", "ChemDatabase"], # CollectionNameTypeT + Literal["db"], # IdentifierTypeT, + Literal["property-value"], # PredicateValueTypeT + Literal["Tc", "pressure"], # PredicateKeyNameT + Literal["property"], # PredicateKeyTypeT + Literal["database"], # ProvenanceTypeT + Literal["material"], # SubjectTypeT + Literal["chemical_name", "sum_formula"], # SubjectNameTypeT + Literal["DB", 
"Chemicals", "ChemDatabase"], # CollectionNameTypeT ] for filename in glob.glob("test/data/rec/record-01.json"): with self.assertRaises(ValidationError), open(filename) as file_obj: file_json = file_obj.read() record.model_validate_json(file_json) record = Record[ - Literal["db"], # IdentifierTypeT, - Literal["property-value_"], # PredicateValueTypeT - Literal["Tc","pressure"], # PredicateKeyNameT - Literal["property"], # PredicateKeyTypeT - Literal["sentence"], # ProvenanceTypeT - Literal["material"], # SubjectTypeT - Literal["chemical_name","sum_formula"], # SubjectNameTypeT - Literal["DB", "Chemicals", "ChemDatabase"], # CollectionNameTypeT + Literal["db"], # IdentifierTypeT, + Literal["property-value_"], # PredicateValueTypeT + Literal["Tc", "pressure"], # PredicateKeyNameT + Literal["property"], # PredicateKeyTypeT + Literal["sentence"], # ProvenanceTypeT + Literal["material"], # SubjectTypeT + Literal["chemical_name", "sum_formula"], # SubjectNameTypeT + Literal["DB", "Chemicals", "ChemDatabase"], # CollectionNameTypeT ] for filename in glob.glob("test/data/rec/record-01.json"): with self.assertRaises(ValidationError), open(filename) as file_obj: diff --git a/test/test_search_meta.py b/test/test_search_meta.py index 89b0584d..7f72bac6 100644 --- a/test/test_search_meta.py +++ b/test/test_search_meta.py @@ -6,16 +6,19 @@ """Test the pydantic models in module search.metadata.py.""" import glob import os -from pydantic import ValidationError from typing import Literal +from pydantic import ValidationError + from docling_core.search.meta import Meta def test_meta(): """Validate data with Meta schema.""" taxonomy = Literal["Public", "PI"] - domain = Literal["Science", "Technology", "History", "Art", "Literature", "Geography"] + domain = Literal[ + "Science", "Technology", "History", "Art", "Literature", "Geography" + ] for filename in glob.glob("test/data/search/meta-*.json"): try: @@ -43,5 +46,4 @@ def test_meta(): except ValidationError as e: errors = e.errors() 
assert len(errors) == len(gold), f"Wrong number of errors in {filename}" - assert all( - errors[zdx]["loc"][0] == gold[zdx] for zdx in range(len(errors))) + assert all(errors[zdx]["loc"][0] == gold[zdx] for zdx in range(len(errors))) diff --git a/test/test_utils.py b/test/test_utils.py index 71d3f9f5..a27bce4e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,12 +5,12 @@ """Test the pydantic models in package utils.""" import json -import pytest -from pydantic import Field, ValidationError +from pydantic import Field from docling_core.utils.alias import AliasModel + def test_alias_model(): """Test the functionality of AliasModel."""