Skip to content

Commit

Permalink
Allow proper definitions of multiplexer and conditional token filters
Browse files Browse the repository at this point in the history
This includes propagating the nested filter definitions into the index settings.

Fixes #1212
  • Loading branch information
honzakral committed Aug 18, 2019
1 parent e1b4653 commit d77ee3b
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 3 deletions.
64 changes: 62 additions & 2 deletions elasticsearch_dsl/analysis.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import six

from .connections import get_connection
from .utils import AttrDict, DslBase
from .utils import AttrDict, DslBase, merge

__all__ = [
'tokenizer', 'analyzer', 'char_filter', 'token_filter', 'normalizer'
Expand All @@ -18,7 +18,7 @@ def _type_shortcut(cls, name_or_instance, type=None, **kwargs):
if not (type or kwargs):
return cls.get_dsl_class('builtin')(name_or_instance)

return cls.get_dsl_class('custom')(name_or_instance, type or 'custom', **kwargs)
return cls.get_dsl_class(type, 'custom')(name_or_instance, type or 'custom', **kwargs)

class CustomAnalysis(object):
name = 'custom'
Expand Down Expand Up @@ -50,6 +50,13 @@ def get_analysis_definition(self):
if filters:
out['filter'] = filters

# any sub filter definitions like multiplexers etc?
for f in self.filter:
if hasattr(f, 'get_analysis_definition'):
d = f.get_analysis_definition()
if d:
merge(out, d, True)

char_filters = {f._name: f.get_definition()
for f in self.char_filter if hasattr(f, 'get_definition')}
if char_filters:
Expand Down Expand Up @@ -154,6 +161,59 @@ class BuiltinTokenFilter(BuiltinAnalysis, TokenFilter):
class CustomTokenFilter(CustomAnalysis, TokenFilter):
pass

class MultiplexerTokenFilter(CustomTokenFilter):
    """Token filter of type ``multiplexer``.

    Each entry of ``filters`` is either a comma-delimited string (passed
    through verbatim) or a list of filter names / ``TokenFilter`` objects
    that gets joined into one comma-delimited string.  Nested custom
    filter objects are surfaced via :meth:`get_analysis_definition` so
    their definitions propagate into the index settings.
    """
    name = 'multiplexer'

    def get_definition(self):
        # NOTE: deliberately skip CustomTokenFilter in the MRO, matching
        # the original behavior of super(CustomTokenFilter, self).
        definition = super(CustomTokenFilter, self).get_definition()

        if 'filters' in definition:
            serialized = []
            for fs in self.filters:
                if isinstance(fs, six.string_types):
                    # comma delimited string given by user
                    serialized.append(fs)
                else:
                    # list of strings or TokenFilter objects
                    names = (f.to_dict() if hasattr(f, 'to_dict') else f
                             for f in fs)
                    serialized.append(', '.join(names))
            definition['filters'] = serialized
        return definition

    def get_analysis_definition(self):
        """Collect definitions of nested custom filters for index settings."""
        if not hasattr(self, 'filters'):
            return {}

        nested = {}
        for fs in self.filters:
            # plain strings reference builtin/registered filters; nothing to emit
            if isinstance(fs, six.string_types):
                continue
            for f in fs:
                if hasattr(f, 'get_definition'):
                    nested[f._name] = f.get_definition()
        return {'filter': nested}

class ConditionalTokenFilter(CustomTokenFilter):
    """Token filter of type ``condition``.

    ``filter`` holds the filters to apply when the condition's script
    matches; any nested custom filter objects are surfaced via
    :meth:`get_analysis_definition` so their definitions propagate into
    the index settings.
    """
    name = 'condition'

    def get_definition(self):
        # NOTE: deliberately skip CustomTokenFilter in the MRO, matching
        # the original behavior of super(CustomTokenFilter, self).
        definition = super(CustomTokenFilter, self).get_definition()

        if 'filter' in definition:
            serialized = []
            for f in self.filter:
                serialized.append(f.to_dict() if hasattr(f, 'to_dict') else f)
            definition['filter'] = serialized
        return definition

    def get_analysis_definition(self):
        """Collect definitions of nested custom filters for index settings."""
        if not hasattr(self, 'filter'):
            return {}

        nested = {}
        for f in self.filter:
            if hasattr(f, 'get_definition'):
                nested[f._name] = f.get_definition()
        return {'filter': nested}


class CharFilter(AnalysisBase, DslBase):
_type_name = 'char_filter'
Expand Down
4 changes: 3 additions & 1 deletion elasticsearch_dsl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,10 +214,12 @@ class DslBase(object):
_param_defs = {}

@classmethod
def get_dsl_class(cls, name, default=None):
    """Look up a registered DSL class by its registered ``name``.

    :arg name: name under which the class was registered
    :arg default: optional fallback registered name to use when ``name``
        is not found (e.g. ``'custom'`` for unknown token filter types)

    :raises UnknownDslObject: when neither ``name`` nor ``default``
        resolves to a registered class.
    """
    # NOTE(scrape artifact): the diff showed both the old and new
    # signatures; only the new one (with `default`) is kept here.
    try:
        return cls._classes[name]
    except KeyError:
        if default is not None:
            return cls._classes[default]
        raise UnknownDslObject('DSL class `{}` does not exist in {}.'.format(name, cls._type_name))

def __init__(self, _expand__to_dot=EXPAND__TO_DOT, **params):
Expand Down
162 changes: 162 additions & 0 deletions test_elasticsearch_dsl/test_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# coding: utf-8
from elasticsearch_dsl import analysis

from pytest import raises

def test_analyzer_serializes_as_name():
a = analysis.analyzer('my_analyzer')

Expand All @@ -19,6 +21,166 @@ def test_analyzer_has_definition():
'filter': ["lowercase"],
} == a.get_definition()

def test_simple_multiplexer_filter():
    # Multiplexer defined purely with comma-delimited strings serializes
    # verbatim into the analysis settings.
    multi = analysis.token_filter(
        'my_multi',
        'multiplexer',
        filters=['lowercase', 'lowercase, stop'],
    )
    a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[multi])

    expected = {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "keyword",
                "filter": ["my_multi"],
            }
        },
        "filter": {
            "my_multi": {
                "type": "multiplexer",
                "filters": ["lowercase", "lowercase, stop"],
            }
        },
    }
    assert a.get_analysis_definition() == expected

def test_multiplexer_with_custom_filter():
    # A custom filter nested inside a multiplexer must be hoisted into the
    # top-level "filter" section of the analysis definition.
    en_snowball = analysis.token_filter('en', 'snowball', language='English')
    multi = analysis.token_filter(
        'my_multi',
        'multiplexer',
        filters=[[en_snowball], 'lowercase, stop'],
    )
    a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[multi])

    expected = {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "keyword",
                "filter": ["my_multi"],
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {
                "type": "multiplexer",
                "filters": ["en", "lowercase, stop"],
            },
        },
    }
    assert a.get_analysis_definition() == expected

def test_conditional_token_filter():
    # Custom filters nested inside a conditional filter must appear in the
    # top-level "filter" section alongside the conditional itself.
    en_snowball = analysis.token_filter('en', 'snowball', language='English')
    cond = analysis.token_filter(
        'testing',
        'condition',
        script={'source': 'return true'},
        filter=['lowercase', en_snowball],
    )
    a = analysis.analyzer(
        'my_cond',
        tokenizer=analysis.tokenizer('keyword'),
        filter=[cond, 'stop'],
    )

    expected = {
        "analyzer": {
            "my_cond": {
                "type": "custom",
                "tokenizer": "keyword",
                "filter": ["testing", "stop"],
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "testing": {
                "type": "condition",
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
            },
        },
    }
    assert a.get_analysis_definition() == expected

def test_conflicting_nested_filters_cause_error():
    # Two filters registered under the same name ('en') with different
    # definitions cannot both go into the settings; merging must fail.
    en_stemmer = analysis.token_filter('en', 'stemmer', language='english')
    en_snowball = analysis.token_filter('en', 'snowball', language='English')
    cond = analysis.token_filter(
        'testing',
        'condition',
        script={'source': 'return true'},
        filter=['lowercase', en_snowball],
    )
    a = analysis.analyzer(
        'my_cond',
        tokenizer=analysis.tokenizer('keyword'),
        filter=[en_stemmer, cond],
    )

    with raises(ValueError):
        a.get_analysis_definition()


def test_normalizer_serializes_as_name():
n = analysis.normalizer('my_normalizer')

Expand Down

0 comments on commit d77ee3b

Please sign in to comment.