/
document_schema.py
256 lines (229 loc) · 9.63 KB
/
document_schema.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from typing import MutableMapping, MutableSequence
import proto # type: ignore
# Module descriptor built with ``proto.module``: the protobuf package name plus
# a manifest naming ``DocumentSchema``, the only top-level message this module
# defines. ``__all__`` at the bottom of the file is derived from this manifest.
__protobuf__ = proto.module(
    package="google.cloud.documentai.v1",
    manifest={
        "DocumentSchema",
    },
)
class DocumentSchema(proto.Message):
    r"""The schema defines the output of the processed document by a
    processor.

    Attributes:
        display_name (str):
            Display name to show to users.
        description (str):
            Description of the schema.
        entity_types (MutableSequence[google.cloud.documentai_v1.types.DocumentSchema.EntityType]):
            Entity types of the schema.
        metadata (google.cloud.documentai_v1.types.DocumentSchema.Metadata):
            Metadata of the schema.
    """

    # The nested message types are declared before the fields at the bottom of
    # this class because ``entity_types`` and ``metadata`` pass the class
    # objects themselves (``message=EntityType`` / ``message=Metadata``).

    class EntityType(proto.Message):
        r"""EntityType is the wrapper of a label of the corresponding
        model with detailed attributes and limitations for entity-based
        processors. Multiple types can also compose a dependency tree to
        represent nested types.

        .. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields

        Attributes:
            enum_values (google.cloud.documentai_v1.types.DocumentSchema.EntityType.EnumValues):
                If specified, lists all the possible values for this entity.
                This should not be more than a handful of values. If the
                number of values is >10 or could change frequently use the
                ``EntityType.value_ontology`` field and specify a list of
                all possible values in a value ontology file.

                This field is a member of `oneof`_ ``value_source``.
            display_name (str):
                User defined name for the type.
            name (str):
                Name of the type. It must be unique within the schema file
                and cannot be a "Common Type". The following naming
                conventions are used:

                - Use ``snake_casing``.
                - Name matching is case-sensitive.
                - Maximum 64 characters.
                - Must start with a letter.
                - Allowed characters: ASCII letters ``[a-z0-9_-]``. (For
                  backward compatibility internal infrastructure and
                  tooling can handle any ascii character.)
                - The ``/`` is sometimes used to denote a property of a
                  type. For example ``line_item/amount``. This convention
                  is deprecated, but will still be honored for backward
                  compatibility.
            base_types (MutableSequence[str]):
                The entity type that this type is derived
                from. For now, one and only one should be set.
            properties (MutableSequence[google.cloud.documentai_v1.types.DocumentSchema.EntityType.Property]):
                Description of the nested structure, or
                composition, of an entity.
        """

        class EnumValues(proto.Message):
            r"""Defines a list of enum values.

            Attributes:
                values (MutableSequence[str]):
                    The individual values that this enum values
                    type can include.
            """

            values: MutableSequence[str] = proto.RepeatedField(
                proto.STRING,
                number=1,
            )

        class Property(proto.Message):
            r"""Defines properties that can be part of the entity type.

            Attributes:
                name (str):
                    The name of the property. Follows the same
                    guidelines as the EntityType name.
                value_type (str):
                    A reference to the value type of the property. This type is
                    subject to the same conventions as the
                    ``EntityType.base_types`` field.
                occurrence_type (google.cloud.documentai_v1.types.DocumentSchema.EntityType.Property.OccurrenceType):
                    Occurrence type limits the number of
                    instances an entity type appears in the
                    document.
            """

            class OccurrenceType(proto.Enum):
                r"""Types of occurrences of the entity type in the document. This
                represents the number of instances of an entity, not number of
                mentions of an entity. For example, a bank statement may only have
                one ``account_number``, but this account number may be mentioned in
                several places on the document. In this case the 'account_number'
                would be considered a ``REQUIRED_ONCE`` entity type. If, on the
                other hand, we expect a bank statement to contain the status of
                multiple different accounts for the customers, the occurrence type
                will be set to ``REQUIRED_MULTIPLE``.

                Values:
                    OCCURRENCE_TYPE_UNSPECIFIED (0):
                        Unspecified occurrence type.
                    OPTIONAL_ONCE (1):
                        There will be zero or one instance of this
                        entity type. The same entity instance may be
                        mentioned multiple times.
                    OPTIONAL_MULTIPLE (2):
                        The entity type will appear zero or multiple
                        times.
                    REQUIRED_ONCE (3):
                        The entity type will only appear exactly
                        once. The same entity instance may be mentioned
                        multiple times.
                    REQUIRED_MULTIPLE (4):
                        The entity type will appear once or more
                        times.
                """
                OCCURRENCE_TYPE_UNSPECIFIED = 0
                OPTIONAL_ONCE = 1
                OPTIONAL_MULTIPLE = 2
                REQUIRED_ONCE = 3
                REQUIRED_MULTIPLE = 4

            name: str = proto.Field(
                proto.STRING,
                number=1,
            )
            value_type: str = proto.Field(
                proto.STRING,
                number=2,
            )
            # The enum is named by its dotted path as a string rather than by
            # class object.
            occurrence_type: "DocumentSchema.EntityType.Property.OccurrenceType" = (
                proto.Field(
                    proto.ENUM,
                    number=3,
                    enum="DocumentSchema.EntityType.Property.OccurrenceType",
                )
            )

        # EntityType fields. The ``number`` arguments are not contiguous
        # (14, 13, 1, 2, 6) and the declarations are not in numeric order; both
        # are kept exactly as given.
        # ``enum_values`` is the only field here declared with
        # ``oneof="value_source"``.
        enum_values: "DocumentSchema.EntityType.EnumValues" = proto.Field(
            proto.MESSAGE,
            number=14,
            oneof="value_source",
            message="DocumentSchema.EntityType.EnumValues",
        )
        display_name: str = proto.Field(
            proto.STRING,
            number=13,
        )
        name: str = proto.Field(
            proto.STRING,
            number=1,
        )
        base_types: MutableSequence[str] = proto.RepeatedField(
            proto.STRING,
            number=2,
        )
        properties: MutableSequence[
            "DocumentSchema.EntityType.Property"
        ] = proto.RepeatedField(
            proto.MESSAGE,
            number=6,
            message="DocumentSchema.EntityType.Property",
        )

    class Metadata(proto.Message):
        r"""Metadata for global schema behavior.

        Attributes:
            document_splitter (bool):
                If true, a ``document`` entity type can be applied to
                subdocument (splitting). Otherwise, it can only be applied
                to the entire document (classification).
            document_allow_multiple_labels (bool):
                If true, on a given page, there can be multiple ``document``
                annotations covering it.
            prefixed_naming_on_properties (bool):
                If set, all the nested entities must be
                prefixed with the parents.
            skip_naming_validation (bool):
                If set, we will skip the naming format validation in the
                schema. So the string values in
                ``DocumentSchema.EntityType.name`` and
                ``DocumentSchema.EntityType.Property.name`` will not be
                checked.
        """

        # Field numbers jump from 2 to 6 here; they are kept exactly as given.
        document_splitter: bool = proto.Field(
            proto.BOOL,
            number=1,
        )
        document_allow_multiple_labels: bool = proto.Field(
            proto.BOOL,
            number=2,
        )
        prefixed_naming_on_properties: bool = proto.Field(
            proto.BOOL,
            number=6,
        )
        skip_naming_validation: bool = proto.Field(
            proto.BOOL,
            number=7,
        )

    # Top-level DocumentSchema fields. Unlike the nested declarations above,
    # the message types are passed here as class objects, not dotted strings.
    display_name: str = proto.Field(
        proto.STRING,
        number=1,
    )
    description: str = proto.Field(
        proto.STRING,
        number=2,
    )
    entity_types: MutableSequence[EntityType] = proto.RepeatedField(
        proto.MESSAGE,
        number=3,
        message=EntityType,
    )
    metadata: Metadata = proto.Field(
        proto.MESSAGE,
        number=4,
        message=Metadata,
    )
# Public names of this module: the manifest entries of ``__protobuf__``,
# sorted and frozen into a tuple.
__all__ = tuple(sorted(__protobuf__.manifest))