-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
129 lines (110 loc) · 4.67 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
index.py
Jin Zhao, Xiaojing Yan, Kun Li, Erik Andersen
builds the elasticsearch index
"""
import json
import re
import string
import time
import shelve
import os
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Index, Document, Text, Keyword, Integer
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import tokenizer, analyzer, token_filter, char_filter
from elasticsearch_dsl.query import MultiMatch, Match
# Connect to the local Elasticsearch server (default port 9200); this registers
# the default connection that elasticsearch_dsl's Index/Document classes use.
connections.create_connection(hosts=['127.0.0.1'])
# Raw low-level client handle, used by helpers.bulk() during indexing.
es = Elasticsearch()

# NOTE(review): currently unused — the only field that referenced it is
# commented out in the Idiom mapping below.
list_analyzer = analyzer(name_or_instance='custom', tokenizer='standard')

# Token filters are applied in list order, so 'lowercase' must come first:
# the built-in 'stop' filter matches against a lowercase stopword list and the
# default stemmer only stems lowercase tokens. (The original order ran
# stop/stemmer before lowercase, so capitalized stopwords were kept and
# capitalized words were never stemmed.)
translate_analyzer = analyzer('translate_analyzer', tokenizer='standard', filter=['lowercase', 'stop', 'stemmer'])
english_analyzer = analyzer('english_analyzer', tokenizer='standard', filter=['lowercase', 'stemmer'])
# asciifolding strips diacritics so tone-marked pinyin matches plain ASCII input.
pinyin_analyzer = analyzer('pinyin_analyzer', tokenizer='standard', filter=['lowercase', 'asciifolding'])
class Idiom(Document):
    """Elasticsearch mapping for one chengyu (Chinese idiom) record."""

    # Chinese-text fields, indexed with the default analyzer.
    name = Text()
    english = Text(analyzer=english_analyzer)
    afterword = Text()
    riddle = Text()
    source = Text()
    story = Text()
    synonym = Text()
    antonym = Text()

    # English translations of the Chinese fields; analyzed with stopword
    # removal and stemming for free-text search.
    desc_translation = Text(analyzer=translate_analyzer)
    source_translation = Text(analyzer=translate_analyzer)
    story_translation = Text(analyzer=translate_analyzer)
    usage_translation = Text(analyzer=translate_analyzer)

    # Pre-segmented Chinese text (whitespace-separated tokens).
    desc_segmentation = Text()
    story_segmentation = Text()
    source_segmentation = Text()
    usage_segmentation = Text()

    # Romanized pronunciation; asciifolding lets plain-ASCII queries match.
    pinyin = Text(analyzer=pinyin_analyzer)

    # Categorical metadata, kept as analyzed text to match the original mapping.
    zodiac = Text()
    difficulty = Text()
    sentiment = Text()
    char_num = Integer()

    def save(self, *args, **kwargs):
        """Persist the document; hook kept so subclass fields are honored."""
        return super().save(*args, **kwargs)
# Populate the index
def buildIndex():
    """(Re)build the 'idioms_search' Elasticsearch index from local JSON files.

    Drops any existing index of the same name, registers the Idiom mapping,
    then bulk-loads every record from chengyu_addedfeatures.json joined with
    its segmentation data from translations.json. Both files are keyed by
    stringified 1-based ids ("1" .. str(len(idioms))).

    Side effects: deletes/creates the ES index; reads two files from CWD.
    """
    idiom_index = Index('idioms_search')
    # Start from a clean slate so stale documents never survive a rebuild.
    if idiom_index.exists():
        idiom_index.delete()
    idiom_index.document(Idiom)
    idiom_index.create()

    with open('chengyu_addedfeatures.json', 'r', encoding='utf-8') as data_file:
        idioms = json.load(data_file)
    size = len(idioms)
    with open('translations.json', 'r', encoding='utf-8') as translation_file:
        translations = json.load(translation_file)

    def actions():
        """Yield one bulk-indexing action dict per idiom record."""
        for mid in range(1, size + 1):
            # Hoist the per-record lookups once instead of ~20 times per doc.
            record = idioms[str(mid)]
            segmented = translations[str(mid)]
            yield {
                "_index": "idioms_search",
                # Legacy mapping type name; required pre-ES7, ignored/deprecated later.
                "_type": 'doc',
                "_id": mid,
                "name": record['Name'],
                # Strip a stray trailing double quote left over from data collection.
                "english": record['English'].rstrip("\""),
                "afterword": record['Afterword'],
                "riddle": record['Riddle'],
                "source": record['Source'],
                "story": record['Story'],
                "synonym": record['Synonym'],
                "antonym": record['Antonym'],
                "desc_translation": record['Description_Translations'],
                "source_translation": record['Source_Translations'],
                "story_translation": record['Story_Translations'],
                "usage_translation": record['Usage_Translations'],
                "desc_segmentation": segmented['Description_Segmentation'],
                "source_segmentation": segmented['Source_Segmentation'],
                "story_segmentation": segmented['Story_Segmentation'],
                "usage_segmentation": segmented['Usage_Segmentation'],
                # Space-joined pinyin syllables, e.g. "hu jia hu wei".
                "pinyin": " ".join(record['Pinyin_segmented']),
                # Comma-joined zodiac animals mentioned in the idiom.
                "zodiac": ", ".join(record['Animal']),
                "sentiment": record['Sentiment'],
                "difficulty": record['Difficulty'],
                "char_num": record['Char_num'],
            }

    helpers.bulk(es, actions())
def main():
    """Build the search index and report the elapsed wall-clock time."""
    started = time.time()
    buildIndex()
    elapsed = time.time() - started
    print("=*=*= Built index in {} seconds =*=*=".format(elapsed))
# Script entry point: rebuild the index, then print cluster health so a
# red/yellow (unhealthy) cluster state is immediately visible after the run.
if __name__ == '__main__':
    main()
    print(connections.get_connection().cluster.health())