forked from Dammy159753/automatic_english_scoring
-
Notifications
You must be signed in to change notification settings - Fork 0
/
highlight_words.py
167 lines (151 loc) · 8.11 KB
/
highlight_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Author : dengyu
# @Time : 2019/06/15
import operator
import codecs
import operator
import re
import itertools
class AdvancedWords:
    """Locate "advanced" (highlight-worthy) words and phrases in an essay.

    Two vocabulary tables are used, one for senior-high grades and one for
    junior-high grades.  Each table comes with a synonym file whose lines are
    comma-separated groups; when several members of one group occur in the
    essay only one representative is kept.
    """

    def __init__(self):
        # Vocabulary and synonym files, one pair per school stage.
        self.senior_highlight_word_path = r"./data/highword_data/senior_high_vocab.txt"
        self.senior_samewords_path = r"./data/highword_data/senior_samewords.txt"
        self.junior_highlight_word_path = r"./data/highword_data/junior_high_vocab.txt"
        self.junior_samewords_path = r"./data/highword_data/junior_samewords.txt"
        # Grade labels that select the senior / junior tables respectively.
        self.senior_set = ["高一", "高二", "高三"]
        self.junior_set = ["初一", "初二", "初三", "初四"]

    @staticmethod
    def _read_word_list(path):
        """Return the stripped, non-blank lines of the UTF-8 file at *path*."""
        with open(path, mode="r", encoding="utf-8") as handle:
            stripped = (line.strip() for line in handle)
            # A blank line would otherwise become an empty "word" whose
            # pattern matches almost anywhere in the essay.
            return [word for word in stripped if word]

    @staticmethod
    def _read_synonym_groups(path):
        """Return one list of stripped words per non-empty line of *path*."""
        groups = []
        with open(path, mode="r", encoding="utf-8") as handle:
            for line in handle:
                # Strip every entry so "a, b" yields "b" rather than " b".
                group = [word.strip() for word in line.split(",") if word.strip()]
                if group:
                    groups.append(group)
        return groups

    @staticmethod
    def _word_pattern(word):
        """Build the regex used to find *word* as a whole token in an essay.

        The match must be preceded by a single space and must not be followed
        by an ASCII letter, so a word at the very start of the text is not
        matched.
        """
        # NOTE(review): the dictionary entry is inserted verbatim (no
        # re.escape), as the original code did; entries containing regex
        # metacharacters are interpreted as regex syntax.
        return r'(?<= )' + word + r'(?![a-zA-Z])'

    def load_dictionary(self):
        """Read the four dictionary files.

        Returns:
            A 4-tuple ``(senior_highlight_list, senior_synonyms,
            junior_highlight_list, junior_synonyms)``.  The highlight lists
            are lists of words/phrases; the synonym values are lists of
            synonym groups (each group a list of strings).
        """
        senior_highlight_list = self._read_word_list(self.senior_highlight_word_path)
        senior_synonyms = self._read_synonym_groups(self.senior_samewords_path)
        junior_highlight_list = self._read_word_list(self.junior_highlight_word_path)
        junior_synonyms = self._read_synonym_groups(self.junior_samewords_path)
        # The fourth element must be the junior table; it used to be the
        # senior table returned a second time.
        return senior_highlight_list, senior_synonyms, junior_highlight_list, junior_synonyms

    def _select_words(self, content, highlight_list, synonyms):
        """Return the highlight words present in *content*, one per synonym group.

        Words that belong to no synonym group come first (in dictionary
        order), followed by the chosen representative of each group that
        occurs in the essay.
        """
        found = [
            word for word in highlight_list
            if re.search(self._word_pattern(word), content)
        ]

        grouped = []          # found words that belong to some synonym group
        representatives = []  # one word kept per synonym group
        for group in synonyms:
            grouped.extend(word for word in found if word in group)
            # The representative is the earliest collected word that is a
            # member of this group (it may have been collected for an earlier
            # group that shares the word).
            keep = next((word for word in grouped if word in group), None)
            if keep is not None:
                representatives.append(keep)

        # Order-preserving de-duplication keeps the result deterministic
        # (a plain set() would reorder the words between runs).
        representatives = list(dict.fromkeys(representatives))
        grouped_lookup = set(grouped)
        ungrouped = [word for word in found if word not in grouped_lookup]
        return ungrouped + representatives

    def remove_synonyms(self, content, grade):
        """Return the highlight words found in *content* for the given *grade*.

        The dictionaries are re-read from disk on every call and stored on the
        instance.  An unrecognised *grade* yields an empty list.
        """
        (self.senior_highlight_list, self.senior_synonyms,
         self.junior_highlight_list, self.junior_synonyms) = self.load_dictionary()

        if grade in self.senior_set:
            return self._select_words(
                content, self.senior_highlight_list, self.senior_synonyms)
        if grade in self.junior_set:
            return self._select_words(
                content, self.junior_highlight_list, self.junior_synonyms)
        return []

    def find_highlight_site(self, content, grade):
        """Locate every retained highlight word in *content*.

        Returns:
            ``(highlight_site, high_num)`` where ``highlight_site`` is a list
            of ``[word, [start, end]]`` entries sorted by ``start`` (the span
            of the first whole-token occurrence) and ``high_num`` is the
            number of entries, capped at 10.
        """
        candidates = list(dict.fromkeys(self.remove_synonyms(content, grade)))

        # Drop any word that is contained in a longer candidate (for example
        # "take care" when "take care of" is present).  Every pair is
        # compared, so the outcome does not depend on candidate order.
        retained = [
            word for word in candidates
            if not any(word != other and word in other for other in candidates)
        ]

        highlight_site = []
        for word in retained:
            # Same pattern as the detection step, so the reported span is the
            # occurrence that actually qualified the word.
            match = re.search(self._word_pattern(word), content)
            if match is None:
                continue
            highlight_site.append([word, list(match.span())])

        highlight_site.sort(key=lambda entry: entry[1][0])
        high_num = min(len(highlight_site), 10)
        return highlight_site, high_num

    def getNumofCommonSubStr(self, str1, str2):
        """Return the longest common contiguous substring of *str1* and *str2*.

        When several substrings share the maximum length, the one found first
        while scanning *str1* from left to right is returned.  An empty string
        is returned when there is no common character.
        """
        len1 = len(str1)
        len2 = len(str2)
        # record[i + 1][j + 1] is the length of the common run ending at
        # str1[i] and str2[j]; row/column 0 are zero padding.
        record = [[0] * (len2 + 1) for _ in range(len1 + 1)]
        best_len = 0
        best_end = 0
        for i, char1 in enumerate(str1):
            for j, char2 in enumerate(str2):
                if char1 == char2:
                    run = record[i][j] + 1
                    record[i + 1][j + 1] = run
                    if run > best_len:
                        best_len = run
                        best_end = i + 1
        return str1[best_end - best_len:best_end]
if __name__ == "__main__":
    # Sample inputs for a manual run: a junior-grade label and a learner essay.
    # The demo calls below are left disabled, so running this file as a script
    # only defines the two sample values.
    grade = '初二'
    content = "I be sick in bed take one's as just time eat. However I like take care ice-cream for dinner every much, most but is ready for my mother take care of doesn't want me to eat them, after she is filled with thinks they are as just not healthy a lot. I just do don't like vegetables, especially carrots. And I am made from like milk, too Because I don't want to be fat. I is made from sports very much . I have five rolleyballs and two basketballs . My favorite sport is volley ball . I think it's easy and fun. A fter class. I play rolleyball with my friends. I be good at basket ball, but I don't play it. I only watch it on TV. Soccer is difficult for me , So I don't play it. If you want to benealthy . You can eat healthy food and play sports now."
    # Disabled demo (needs the dictionary files under ./data/highword_data/):
    #AD = AdvancedWords()
    #highlight_res, high_num = AD.find_highlight_site(content, grade)
    #print("highlight_site:{} , high_num:{}".format(highlight_res, high_num))