In [264]:
from elasticsearch import Elasticsearch
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.11/analysis-custom-analyzer.html

In [265]:
# Client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

# Custom character filter. Character filters preprocess the stream of characters
# before it is passed to the tokenizer; this one rewrites the strings "III",
# "iii" and "Ⅲ" to "3".
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.11/analysis-charfilters.html
char_filter_def = {
    "type": "mapping",
    "mappings": ["III => 3", "iii => 3", "Ⅲ => 3"],
}

# Custom token filter. Token filters accept a stream of tokens from a tokenizer
# and can modify tokens (e.g. lowercasing), delete tokens (e.g. remove stopwords)
# or add tokens (e.g. synonyms).
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.11/analysis-tokenfilters.html
stop_filter_def = {"type": "stop", "stopwords": "_english_"}

# The custom analyzer: char filters -> tokenizer -> token filters.
analyzer_def = {
    "type": "custom",
    # A built-in or customised tokenizer. (Required)
    "tokenizer": "ik_max_word",
    "char_filter": ["emoticons"],  # custom, registered under "char_filter" in body
    "filter": [
        "lowercase",  # built-in
        "english_stop",  # custom, registered under "filter" in body
    ],
}

# Field mapping that routes the "information" field through the custom analyzer.
information_field = {
    "type": "text",
    # Only text fields support the analyzer mapping parameter.
    # Applicable to: text, token-count
    "analyzer": "my_custom_analyzer",
}

# Request body for index creation: analysis settings plus field mappings.
body = {
    "settings": {
        "analysis": {
            "analyzer": {"my_custom_analyzer": analyzer_def},
            "char_filter": {"emoticons": char_filter_def},
            "filter": {"english_stop": stop_filter_def},
        }
    },
    "mappings": {"properties": {"information": information_field}},
}

# Start from a clean slate: drop any leftover 'es0' index from a previous run,
# then create it with the analysis settings and mappings defined in `body`.
# The index name is passed by keyword: positional API arguments are deprecated
# in later 7.x clients and rejected by 8.x clients.
if es.indices.exists(index='es0'):
    es.indices.delete(index='es0')
es.indices.create(index='es0', body=body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'es0'}

In [266]:
# Show the index definition as stored by the server (settings, mappings, aliases).
# Keyword argument used because positional API arguments are deprecated in
# later 7.x clients and rejected by 8.x clients.
es.indices.get(index='es0')

{'es0': {'aliases': {},
  'mappings': {'properties': {'information': {'type': 'text',
     'analyzer': 'my_custom_analyzer'}}},
  'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'number_of_shards': '1',
    'provided_name': 'es0',
    'creation_date': '1697443924867',
    'analysis': {'filter': {'english_stop': {'type': 'stop',
       'stopwords': '_english_'}},
     'analyzer': {'my_custom_analyzer': {'filter': ['lowercase',
        'english_stop'],
       'char_filter': ['emoticons'],
       'type': 'custom',
       'tokenizer': 'ik_max_word'}},
     'char_filter': {'emoticons': {'type': 'mapping',
       'mappings': ['III => 3', 'iii => 3', 'Ⅲ => 3']}}},
    'number_of_replicas': '1',
    'uuid': 'yrDWd5gCSXiDe0wxdTqxcA',
    'version': {'created': '7110299'}}}}}

In [267]:
# Index the first sample document.
# refresh='wait_for' makes the call return only once the document is visible
# to search; without it, running the notebook's cells back-to-back (Run All)
# can reach the match query before the index has refreshed and miss this document.
es.index(index='es0',
         id=0,
         body={"information": "乳腺BI-RADS2"},
         refresh='wait_for')

{'_index': 'es0',
 '_type': '_doc',
 '_id': '0',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [268]:
# Index the second sample document.
# refresh='wait_for' makes the call return only once the document is visible
# to search; without it, running the notebook's cells back-to-back (Run All)
# can reach the match query before the index has refreshed and miss this document.
es.index(index='es0',
         id=1,
         body={"information": "乳腺BI-RADS3"},
         refresh='wait_for')

{'_index': 'es0',
 '_type': '_doc',
 '_id': '1',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

In [270]:
# Full-text match query against the "information" field, using the "iii"
# spelling that the analyzer's char filter maps to "3".
match_query = {"match": {"information": {"query": "乳腺BI-RADSiii"}}}
es.search(index='es0', body={"query": match_query})

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 1.9332589,
  'hits': [{'_index': 'es0',
    '_type': '_doc',
    '_id': '1',
    '_score': 1.9332589,
    '_source': {'information': '乳腺BI-RADS3'}},
   {'_index': 'es0',
    '_type': '_doc',
    '_id': '0',
    '_score': 0.5469647,
    '_source': {'information': '乳腺BI-RADS2'}}]}}

In [271]:
# Run the custom analyzer over the query text through the index's analyze API.
analysis = es.indices.analyze(
    index="es0",
    body={"analyzer": "my_custom_analyzer", "text": "乳腺BI-RADSiii"},
)

# Keep only the token strings from the response so the result is easy to read.
tokens = [entry['token'] for entry in analysis['tokens']]
tokens

['乳腺', 'bi-rads3', 'bi', 'rads', '3']

In [272]:
# Clean up: remove the demo index.
# Keyword argument used because positional API arguments are deprecated in
# later 7.x clients and rejected by 8.x clients.
es.indices.delete(index='es0')

{'acknowledged': True}