"""
chengyu_index.py

Reads in the 13,279 chengyu from the input file, indexes name and
pinyin, and does more detailed analysis on the description, putting all
recognized categories into a dictionary that is written out as JSON.
Also segments selected fields using Stanford CoreNLP, and then provides
possible translations to be used later on.

Jin Zhao, Xiaojing Yan, Kun Li, Erik Andersen
"""
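# Expected input formats. The chengyu layout is inferred from the parsing
# code below; the dictionary file is standard CC-CEDICT, whose line format
# is documented by MDBG:
#   13279_chengyu.txt: one chengyu per line, tab-separated as
#       name<TAB>pinyin<TAB>description
#     lines beginning with '//' are comments.
#   cedict_1_0_ts_utf-8_mdbg.txt: CC-CEDICT entries, e.g.
#       中國 中国 [Zhong1 guo2] /China/
#     lines beginning with '#' are comments.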
import os, re, json
from nltk.parse import CoreNLPParser
from nltk.corpus import stopwords

chinese_stopwords = set('的了很不没有我你他')
punctuation = set(',《》。?/、【】;:!()»“”«<>*-.+\'&"#$%」「~^[][]()\'*$#%&‧‧•—~﹏﹁﹂"…@`_\\=')
# fields whose text gets segmented and translated later on
to_translate = ['Description', 'Source', 'Story', 'Usage']
def lookup(sentence, dictionary):
    """
    Look up every word of a segmented sentence in the given dictionary.

    Returns the flat list of translations for the whole sentence and a
    mapping from each word's position to its translation(s).
    """
    sent_dict = dict()
    translations = []
    for index, word in enumerate(sentence):
        if word in dictionary:
            word_translations = dictionary[word]
            translations.extend(word_translations)
            sent_dict[str(index)] = word_translations
        else:
            # no entry: keep the original word as its own "translation"
            translations.append(word)
            sent_dict[str(index)] = word
    return translations, sent_dict
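# A minimal worked example with a toy dictionary:
#   lookup(['很', '好'], {'好': ['good', 'well']})
#   -> (['很', 'good', 'well'], {'0': '很', '1': ['good', 'well']})
# '很' has no entry, so the word itself stands in for its translation.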
def corpus_from_dict(dictionary, filename):
    """
    Dump a dictionary to a pretty-printed UTF-8 JSON file.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(dictionary, file, ensure_ascii=False, indent=3)
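# For example, corpus_from_dict({'好': ['good', 'well']}, 'demo.json')
# writes (with indent=3 and the characters left unescaped):
#   {
#      "好": [
#         "good",
#         "well"
#      ]
#   }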
if __name__ == '__main__':
    filename = os.path.join(os.path.curdir, '13279_chengyu.txt')
    dict_file = os.path.join(os.path.curdir, 'cedict_1_0_ts_utf-8_mdbg.txt')
    f = open(filename, encoding='utf-8')
    chengyu_index = dict()
    chengyu_english = dict()
    # output files for the index, the translations, and the dictionaries
    chengyu_json_file = os.path.join(os.path.curdir, 'chengyu_index_r.json')
    translation_json_file = os.path.join(os.path.curdir, 'translations.json')
    zh_en_trad_json_file = os.path.join(os.path.curdir, 'zh_en_trad.json')
    zh_en_simp_json_file = os.path.join(os.path.curdir, 'zh_en_simp.json')
    shelf_file = os.path.join(os.path.curdir, 'chengyu_shelf')
    file_content = f.read().split('\n')
    all_chengyu = []
    chengyu_number = 1
    # build the index; the segmenter talks to a CoreNLP server on port 9001
    chengyu_segmenter = CoreNLPParser('http://localhost:9001')
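    # Note: CoreNLPParser only connects to an already-running server. A
    # typical way to launch one (assuming the Chinese models jar is on the
    # classpath so StanfordCoreNLP-chinese.properties is available, and the
    # port matches the URL above) is:
    #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
    #       -serverProperties StanfordCoreNLP-chinese.properties -port 9001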
    words = []
    dict_f = open(dict_file, encoding='utf-8')
    dict_file_content = dict_f.read().split('\n')
    all_chinese = []
    zh_en_trad_dict = dict()
    zh_en_simp_dict = dict()
    # go through the Chinese-English dictionary and extract translations
    for line in dict_file_content:
        if line.startswith('#') or not line:
            continue
        # pull the pieces of each CC-CEDICT line apart with regexes
        chinese_chars = re.sub(r'\s\[.+', '', line)
        trad_simplified = chinese_chars.split()
        pinyin = re.sub(r'^[^\[]+\[', '', line)
        pinyin = re.sub(r'\].+', '', pinyin)
        pinyin = pinyin.strip('[')
        meaning = re.sub(r'^[^/]+/', '', line)
        meaning = meaning.strip('/')
        definitions = meaning.split('/')
        # drop cross-reference and classifier definitions
        definitions = [d for d in definitions
                       if not d.startswith('variant')
                       and not d.startswith('CL')
                       and not d.startswith('see ')]
        zh_en_trad_dict[trad_simplified[0]] = definitions
        if trad_simplified[1] in zh_en_simp_dict:
            # several traditional forms can share one simplified form,
            # so merge and deduplicate their definitions
            previous = zh_en_simp_dict[trad_simplified[1]]
            previous.extend(definitions)
            all_definitions = list(set(previous))
            zh_en_simp_dict[trad_simplified[1]] = all_definitions
        else:
            zh_en_simp_dict[trad_simplified[1]] = definitions
        all_chinese.extend(trad_simplified)
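    # Worked example of the regex pipeline above, using the CC-CEDICT
    # entry "中國 中国 [Zhong1 guo2] /China/":
    #   chinese_chars   -> '中國 中国'
    #   trad_simplified -> ['中國', '中国']
    #   pinyin          -> 'Zhong1 guo2'
    #   definitions     -> ['China']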
    # categories that appear in the description, each followed by a full-width colon
    categories = ['歇后语', '灯迷会', '英语', '反义词', '近义词', '用法', '故事', '出处', '拼音']
    category_english = ['Afterword', 'Riddle', 'English', 'Antonym', 'Synonym', 'Usage', 'Story', 'Source', 'Pinyin']
    category_dict = dict(zip(categories, category_english))
    # go through the chengyu file
    for line in file_content:
        if line.startswith('//') or not line:
            continue
        chengyu_data = dict()
        chengyu = line.split('\t')
        all_chengyu.append(chengyu[0])
        # start with the structured data: name, pinyin, description
        chengyu_data['Name'] = chengyu[0]
        chengyu_data['Pinyin'] = chengyu[1]
        # split on the full-width colon to find the span of each category;
        # each chunk ends with the label of the field that follows it
        split_description = chengyu[2].split(':')
        next_category = ""
        for s in split_description:
            s = s.strip('】')
            # category labels are two or three characters long
            last_three = s[-3:]
            last_two = s[-2:]
            if last_two in categories:
                s = s.rstrip(last_two)
            elif last_three in categories:
                s = s.rstrip(last_three)
            if next_category:
                eng_category = category_dict[next_category]
                chengyu_data[eng_category] = s
            else:
                # text before the first label is the description itself
                chengyu_data['Description'] = s
            if last_two in categories:
                next_category = last_two
            elif last_three in categories:
                next_category = last_three
        # make sure every entry has all of the categories above as keys
        for c in category_english:
            if c not in chengyu_data:
                chengyu_data[c] = ""
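        # Worked example of the colon splitting above, on a hypothetical
        # description '比喻做事有把握出处:《左传》用法:作谓语':
        #   split -> ['比喻做事有把握出处', '《左传》用法', '作谓语']
        #   Description = '比喻做事有把握'  ("doing things with assurance")
        #   Source      = '《左传》'        (the Zuozhuan)
        #   Usage       = '作谓语'          ("used as a predicate")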
        translation_help = dict()
        # segment and translate the fields that support translation
        for field in to_translate:
            to_segment = chengyu_data[field]
            # replace the interpunct (·) with a space to help segmentation
            to_segment = re.sub(r'·', ' ', to_segment)
            if to_segment:
                new_words = list(chengyu_segmenter.tokenize(to_segment))
            else:
                new_words = []
            # record the segmentation and the per-word translations
            translation_help[field + '_Segmentation'] = new_words
            translations, sent_dict = lookup(new_words, zh_en_simp_dict)
            chengyu_data[field + '_Translations'] = translations
            translation_help[field + '_Sentence_Code'] = sent_dict
        chengyu_english[chengyu_number] = translation_help
        chengyu_index[chengyu_number] = chengyu_data
        chengyu_number += 1
    # write the JSON files (chengyu index, translations, simplified dictionary)
    corpus_from_dict(chengyu_index, chengyu_json_file)
    corpus_from_dict(chengyu_english, translation_json_file)
    corpus_from_dict(zh_en_simp_dict, zh_en_simp_json_file)
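    # Illustrative shape of chengyu_index_r.json (integer keys become JSON
    # strings; values and most category keys abridged here):
    #   {
    #      "1": {
    #         "Name": "...",
    #         "Pinyin": "...",
    #         "Description": "...",
    #         "Source": "...",
    #         "Description_Translations": ["...", "..."],
    #         ...
    #      },
    #      ...
    #   }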