In [1]:
from elasticsearch import Elasticsearch

# 参考:https://www.elastic.co/guide/en/elasticsearch/reference/7.11/analysis-normalizers.html

In [2]:
# Client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch(hosts=[dict(host='localhost', port=9200)])

In [3]:
# Index definition for the demo: a custom normalizer attached to a keyword field.

# Custom "mapping" character filter. Every rule has an empty right-hand side,
# so ASCII and full-width parentheses and the hyphen are removed from the text.
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.11/analysis-charfilters.html
quote_char_filter = {
    "type": "mapping",
    "mappings": ["( => ", ") => ", "（ => ", "） => ", "- => "],
}

# Custom normalizer: the character filter above followed by two built-in
# token filters.
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.11/analysis-tokenfilters.html
my_normalizer = {
    "type": "custom",
    "char_filter": ["quote"],  # user-defined, declared under "char_filter" below
    "filter": ["lowercase", "cjk_width"],  # both built in
}

# Normalizers apply to keyword fields: the whole value is normalized as a
# single term and is never split into tokens.
foo_field = {"type": "keyword", "normalizer": "my_normalizer"}

body = {
    "settings": {
        "analysis": {
            "char_filter": {"quote": quote_char_filter},
            "normalizer": {"my_normalizer": my_normalizer},
        }
    },
    "mappings": {"properties": {"foo": foo_field}},
}

# Recreate the demo index from scratch so the cell can be re-run safely.
# The index name is passed by keyword: elasticsearch-py 8.x no longer accepts
# it positionally, while the keyword form works on 7.x and 8.x alike.
if es.indices.exists(index='es0'):
    es.indices.delete(index='es0')
es.indices.create(index='es0', body=body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'es0'}

In [4]:
# Index one sample document. refresh='wait_for' makes the call return only
# once the document is visible to search; without it, a query issued right
# after indexing can run before the next index refresh and return no hits.
es.index(index='es0',
         id=0,
         body={"foo": "HPV42型低危型(定量)"},
         refresh='wait_for')

{'_index': 'es0',
 '_type': '_doc',
 '_id': '0',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [8]:
# Term query on the keyword field; the query string is not tokenized.
# This spelling has a hyphen and no parentheses, unlike the stored value.
term_query = {"query": {"term": {"foo": "HPV-42型低危型定量"}}}
es.search(index='es0', body=term_query)

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.2876821,
  'hits': [{'_index': 'es0',
    '_type': '_doc',
    '_id': '0',
    '_score': 0.2876821,
    '_source': {'foo': 'HPV42型低危型(定量)'}}]}}

In [6]:
# Term query on the keyword field; the query string is not tokenized.
# This spelling has a hyphen and full-width parentheses.
term_query = {"query": {"term": {"foo": "HPV-42型低危型（定量）"}}}
es.search(index='es0', body=term_query)

{'took': 0,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [7]:
# Run the index's custom normalizer over a sample string through the
# analyze API to see exactly what term it produces.
analysis = es.indices.analyze(
    index="es0",
    body={"normalizer": "my_normalizer", "text": "HPV-42型低危型（定量）"},
)

# Keep only the token text from each entry of the response.
tokens = [
    entry['token']
    for entry in analysis['tokens']
]
tokens

['hpv42型低危型定量']

In [None]:
# Clean up the demo index. The name is passed by keyword because
# elasticsearch-py 8.x rejects positional arguments; 7.x accepts both forms.
es.indices.delete(index='es0')