From 3dde6689132e19e261bcfe6b7331196d673c595a Mon Sep 17 00:00:00 2001 From: Miguel Grinberg Date: Fri, 30 May 2025 11:55:22 +0100 Subject: [PATCH] Fix some new type warnings from mypy (#2974) (cherry picked from commit 63efa48aabc353f806ef0a0b07add5130136fc5d) --- elasticsearch/dsl/_async/document.py | 2 +- elasticsearch/dsl/_sync/document.py | 2 +- elasticsearch/dsl/field.py | 12 ++++++- elasticsearch/dsl/query.py | 44 ++++++++++++++++++++++- elasticsearch/dsl/types.py | 54 ++++++++++++++++++++++------ 5 files changed, 100 insertions(+), 14 deletions(-) diff --git a/elasticsearch/dsl/_async/document.py b/elasticsearch/dsl/_async/document.py index 4b7654761..de6e9eecc 100644 --- a/elasticsearch/dsl/_async/document.py +++ b/elasticsearch/dsl/_async/document.py @@ -96,7 +96,7 @@ class AsyncDocument(DocumentBase, metaclass=AsyncIndexMeta): @classmethod def _get_using(cls, using: Optional[AsyncUsingType] = None) -> AsyncUsingType: - return cast(AsyncUsingType, using or cls._index._using) + return using or cls._index._using @classmethod def _get_connection( diff --git a/elasticsearch/dsl/_sync/document.py b/elasticsearch/dsl/_sync/document.py index 316ece5cb..f68be4aae 100644 --- a/elasticsearch/dsl/_sync/document.py +++ b/elasticsearch/dsl/_sync/document.py @@ -92,7 +92,7 @@ class Document(DocumentBase, metaclass=IndexMeta): @classmethod def _get_using(cls, using: Optional[UsingType] = None) -> UsingType: - return cast(UsingType, using or cls._index._using) + return using or cls._index._using @classmethod def _get_connection(cls, using: Optional[UsingType] = None) -> "Elasticsearch": diff --git a/elasticsearch/dsl/field.py b/elasticsearch/dsl/field.py index 726fbe358..e3ed5dfcd 100644 --- a/elasticsearch/dsl/field.py +++ b/elasticsearch/dsl/field.py @@ -1290,7 +1290,7 @@ def _deserialize(self, data: Any) -> Union[datetime, date]: if isinstance(data, datetime): if self._default_timezone and data.tzinfo is None: data = data.replace(tzinfo=self._default_timezone) - return data + return cast(datetime, data) if isinstance(data, date): return data if isinstance(data, int): @@ -3689,6 +3689,11 @@ class SemanticText(Field): by using the Update mapping API. Use the Create inference API to create the endpoint. If not specified, the inference endpoint defined by inference_id will be used at both index and query time. + :arg chunking_settings: Settings for chunking text into smaller + passages. If specified, these will override the chunking settings + sent in the inference endpoint associated with inference_id. If + chunking settings are updated, they will not be applied to + existing documents until they are reindexed. 
""" name = "semantic_text" @@ -3699,6 +3704,9 @@ def __init__( meta: Union[Mapping[str, str], "DefaultType"] = DEFAULT, inference_id: Union[str, "DefaultType"] = DEFAULT, search_inference_id: Union[str, "DefaultType"] = DEFAULT, + chunking_settings: Union[ + "types.ChunkingSettings", Dict[str, Any], "DefaultType" + ] = DEFAULT, **kwargs: Any, ): if meta is not DEFAULT: @@ -3707,6 +3715,8 @@ def __init__( kwargs["inference_id"] = inference_id if search_inference_id is not DEFAULT: kwargs["search_inference_id"] = search_inference_id + if chunking_settings is not DEFAULT: + kwargs["chunking_settings"] = chunking_settings super().__init__(*args, **kwargs) diff --git a/elasticsearch/dsl/query.py b/elasticsearch/dsl/query.py index 1282d3b02..06be2f7fb 100644 --- a/elasticsearch/dsl/query.py +++ b/elasticsearch/dsl/query.py @@ -1382,7 +1382,49 @@ def __init__( min_term_freq: Union[int, "DefaultType"] = DEFAULT, min_word_length: Union[int, "DefaultType"] = DEFAULT, routing: Union[str, "DefaultType"] = DEFAULT, - stop_words: Union[str, Sequence[str], "DefaultType"] = DEFAULT, + stop_words: Union[ + Literal[ + "_arabic_", + "_armenian_", + "_basque_", + "_bengali_", + "_brazilian_", + "_bulgarian_", + "_catalan_", + "_cjk_", + "_czech_", + "_danish_", + "_dutch_", + "_english_", + "_estonian_", + "_finnish_", + "_french_", + "_galician_", + "_german_", + "_greek_", + "_hindi_", + "_hungarian_", + "_indonesian_", + "_irish_", + "_italian_", + "_latvian_", + "_lithuanian_", + "_norwegian_", + "_persian_", + "_portuguese_", + "_romanian_", + "_russian_", + "_serbian_", + "_sorani_", + "_spanish_", + "_swedish_", + "_thai_", + "_turkish_", + "_none_", + ], + Sequence[str], + "DefaultType", + ] = DEFAULT, unlike: Union[ Union[str, "types.LikeDocument"], Sequence[Union[str, "types.LikeDocument"]], diff --git a/elasticsearch/dsl/types.py b/elasticsearch/dsl/types.py index 6dc9f09df..e6e19e410 100644 --- a/elasticsearch/dsl/types.py +++ b/elasticsearch/dsl/types.py @@ -142,6 +142,48 @@ def __init__( super().__init__(kwargs) +class ChunkingSettings(AttrDict[Any]): + """ + :arg strategy: (required) The chunking strategy: `sentence` or `word`. + Defaults to `sentence` if omitted. + :arg max_chunk_size: (required) The maximum size of a chunk in words. + This value cannot be higher than `300` or lower than `20` (for + `sentence` strategy) or `10` (for `word` strategy). Defaults to + `250` if omitted. + :arg overlap: The number of overlapping words for chunks. It is + applicable only to a `word` chunking strategy. This value cannot + be higher than half the `max_chunk_size` value. Defaults to `100` + if omitted. + :arg sentence_overlap: The number of overlapping sentences for chunks. + It is applicable only for a `sentence` chunking strategy. It can + be either `1` or `0`. Defaults to `1` if omitted. 
+ """ + + strategy: Union[str, DefaultType] + max_chunk_size: Union[int, DefaultType] + overlap: Union[int, DefaultType] + sentence_overlap: Union[int, DefaultType] + + def __init__( + self, + *, + strategy: Union[str, DefaultType] = DEFAULT, + max_chunk_size: Union[int, DefaultType] = DEFAULT, + overlap: Union[int, DefaultType] = DEFAULT, + sentence_overlap: Union[int, DefaultType] = DEFAULT, + **kwargs: Any, + ): + if strategy is not DEFAULT: + kwargs["strategy"] = strategy + if max_chunk_size is not DEFAULT: + kwargs["max_chunk_size"] = max_chunk_size + if overlap is not DEFAULT: + kwargs["overlap"] = overlap + if sentence_overlap is not DEFAULT: + kwargs["sentence_overlap"] = sentence_overlap + super().__init__(kwargs) + + class ClassificationInferenceOptions(AttrDict[Any]): """ :arg num_top_classes: Specifies the number of top class predictions to @@ -1561,11 +1603,7 @@ class InnerHits(AttrDict[Any]): DefaultType, ] seq_no_primary_term: Union[bool, DefaultType] - fields: Union[ - Union[str, InstrumentedField], - Sequence[Union[str, InstrumentedField]], - DefaultType, - ] + fields: Union[Sequence[Union[str, InstrumentedField]], DefaultType] sort: Union[ Union[Union[str, InstrumentedField], "SortOptions"], Sequence[Union[Union[str, InstrumentedField], "SortOptions"]], @@ -1600,11 +1638,7 @@ def __init__( DefaultType, ] = DEFAULT, seq_no_primary_term: Union[bool, DefaultType] = DEFAULT, - fields: Union[ - Union[str, InstrumentedField], - Sequence[Union[str, InstrumentedField]], - DefaultType, - ] = DEFAULT, + fields: Union[Sequence[Union[str, InstrumentedField]], DefaultType] = DEFAULT, sort: Union[ Union[Union[str, InstrumentedField], "SortOptions"], Sequence[Union[Union[str, InstrumentedField], "SortOptions"]],