In [11]:
from elasticsearch import Elasticsearch

In [12]:
# Client for the local single-node cluster. The host is given as a full URL
# (scheme included) because elasticsearch-py 8.x rejects host dicts that have
# no 'scheme'; the URL form is accepted by 7.x as well.
es = Elasticsearch(["http://localhost:9200"])

### 同义词配置文件参考
```text
母亲 , 妈妈 , mother
父亲 , 爸爸 , father
老虎 , 狮子 => 动物
西红柿 , 黄瓜, 香蕉 => 水果, 蔬菜
```

In [13]:
# Index settings: a custom analyzer ("synonym_analyzer") that tokenizes with
# `ik_max_word` (not a built-in tokenizer; presumably provided by the IK
# analysis plugin - verify it is installed) and then applies the inline
# synonym filter "my_synonyms".
body = {
    "settings": {
        "analysis": {
            "analyzer": {
                "synonym_analyzer": {
                    "tokenizer": "ik_max_word",
                    "filter": ["my_synonyms"]
                }
            },
            "filter": {
                "my_synonyms": {
                    "type": "synonym",
                    # Terms inside a rule are comma-separated.
                    "synonyms": [
                        # Equivalent synonyms: every term in the rule is a
                        # synonym of every other term (both directions).
                        "母亲 , 妈妈 , mother",
                        "父亲 , 爸爸 , father",

                        # One-way synonyms: terms on the left of `=>` are
                        # mapped to the terms on the right, never the reverse.
                        "老虎 , 狮子 => 动物",
                        "西红柿 , 黄瓜, 香蕉 => 水果, 蔬菜"
                    ],
                    # Note kept from the original: a synonyms file can live in
                    # the Elasticsearch config directory inside the container,
                    # i.e. xxx/elasticsearch/config/synonyms/synonym0.txt.
                    # No file is referenced by these settings.

                    # From the Elasticsearch docs: "You can use the reload
                    # search analyzers API to pick up changes to synonym files
                    # used in the synonym_graph or synonym token filter of a
                    # search analyzer. To be eligible, the token filter must
                    # have an updateable flag of true and only be used in
                    # search analyzers."
                    # See also: setting参数_同近义词过滤_search_analyzer_demo.ipynb
                    #
                    # `updateable` must not be True when synonyms are applied at
                    # index time; it is meant for search-time synonyms only.
                    # NOTE(review): the flag is set here even though the rules
                    # are inline and no field mapping uses this analyzer -
                    # confirm it is intentional before attaching the analyzer
                    # to a field as its index analyzer.
                    "updateable": True
                }
            }
        }
    }
}

# Recreate the demo index from scratch so the cell can be re-run safely.
# The index name is passed by keyword: elasticsearch-py 8.x API methods are
# keyword-only, while 7.x accepts both forms.
if es.indices.exists(index='es0'):
    es.indices.delete(index='es0')
es.indices.create(index='es0', body=body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'es0'}

In [14]:
# Equivalent synonyms are emitted at the same token position as the original term.
for sample_text in (
    "我爱妈妈",    # the other group members (母亲, mother) are emitted as well
    "我爱mother",  # the other group members (母亲, 妈妈) are emitted as well
):
    analyzed = es.indices.analyze(
        index="es0",
        body={"analyzer": "synonym_analyzer", "text": sample_text},
    )
    print(analyzed, end='\n\n')

{'tokens': [{'token': '我', 'start_offset': 0, 'end_offset': 1, 'type': 'CN_CHAR', 'position': 0}, {'token': '爱', 'start_offset': 1, 'end_offset': 2, 'type': 'CN_CHAR', 'position': 1}, {'token': '妈妈', 'start_offset': 2, 'end_offset': 4, 'type': 'CN_WORD', 'position': 2}, {'token': '母亲', 'start_offset': 2, 'end_offset': 4, 'type': 'SYNONYM', 'position': 2}, {'token': 'mother', 'start_offset': 2, 'end_offset': 4, 'type': 'SYNONYM', 'position': 2}]}

{'tokens': [{'token': '我', 'start_offset': 0, 'end_offset': 1, 'type': 'CN_CHAR', 'position': 0}, {'token': '爱', 'start_offset': 1, 'end_offset': 2, 'type': 'CN_CHAR', 'position': 1}, {'token': 'mother', 'start_offset': 2, 'end_offset': 8, 'type': 'ENGLISH', 'position': 2}, {'token': '母亲', 'start_offset': 2, 'end_offset': 8, 'type': 'SYNONYM', 'position': 2}, {'token': '妈妈', 'start_offset': 2, 'end_offset': 8, 'type': 'SYNONYM', 'position': 2}]}



In [15]:
# One-way rules: only the left-hand terms are rewritten to the right-hand terms.
for sample_text in (
    "我喜欢老虎",    # left-hand term: the token 动物 is produced
    "我喜欢动物",    # right-hand term: 老虎 / 狮子 are NOT produced (one-way)
    "我喜欢西红柿",  # left-hand term: the tokens 水果 and 蔬菜 are produced
    "我喜欢水果",    # right-hand term: 西红柿 / 黄瓜 / 香蕉 are NOT produced
):
    analyzed = es.indices.analyze(
        index="es0",
        body={"analyzer": "synonym_analyzer", "text": sample_text},
    )
    print(analyzed, end='\n\n')

{'tokens': [{'token': '我', 'start_offset': 0, 'end_offset': 1, 'type': 'CN_CHAR', 'position': 0}, {'token': '喜欢', 'start_offset': 1, 'end_offset': 3, 'type': 'CN_WORD', 'position': 1}, {'token': '动物', 'start_offset': 3, 'end_offset': 5, 'type': 'SYNONYM', 'position': 2}]}

{'tokens': [{'token': '我', 'start_offset': 0, 'end_offset': 1, 'type': 'CN_CHAR', 'position': 0}, {'token': '喜欢', 'start_offset': 1, 'end_offset': 3, 'type': 'CN_WORD', 'position': 1}, {'token': '动物', 'start_offset': 3, 'end_offset': 5, 'type': 'CN_WORD', 'position': 2}]}

{'tokens': [{'token': '我', 'start_offset': 0, 'end_offset': 1, 'type': 'CN_CHAR', 'position': 0}, {'token': '喜欢', 'start_offset': 1, 'end_offset': 3, 'type': 'CN_WORD', 'position': 1}, {'token': '水果', 'start_offset': 3, 'end_offset': 6, 'type': 'SYNONYM', 'position': 2}, {'token': '蔬菜', 'start_offset': 3, 'end_offset': 6, 'type': 'SYNONYM', 'position': 2}]}

{'tokens': [{'token': '我', 'start_offset': 0, 'end_offset': 1, 'type': 'CN_CHAR', 'position