-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
tokenize.py
255 lines (222 loc) · 7.12 KB
/
tokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
"""
Module with tokenize functions
"""
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy3 as pymorphy
from find_similar.calc_models import LanguageNotFoundException
# Shared morphological analyzer instance; constructing it is expensive,
# so a single module-level object is reused by get_parsed_text().
morph = pymorphy.MorphAnalyzer()
# Punctuation characters that should never survive tokenization.
PUNCTUATION_SET = {";", ",", ".", "(", ")", "*", "-", ":"}
# Domain-specific abbreviations and unit markers that carry no meaning
# for similarity matching (mixed Latin/Cyrillic on purpose).
UNUSEFUL_WORDS = {
"шт",
"уп",
"x",
"х",  # one is the Latin letter "x", the other the Cyrillic "х"
"mm",
"мм",
"сс",
"cc",
"m",
"м",
"подха",  # the source data contains "подх.для"
"тип",
"d",
"vz",
# 'мя' # 2-мя, 3-мя, ...
}
# Language-independent stop words: punctuation plus the noise words above.
STOP_WORDS_NO_LANGUAGE = PUNCTUATION_SET.union(UNUSEFUL_WORDS)
def get_stopwords_from_nltk(language: str):
    """
    Return the NLTK stop-word list for the given language.

    On first use the required NLTK corpora may be missing locally
    (LookupError); they are downloaded once and the lookup is retried.

    :param language: current text language
    :raises LanguageNotFoundException: when NLTK has no stop words
        for the requested language (OSError from NLTK)
    """
    try:
        return stopwords.words(language)
    except LookupError:
        # Corpora not present locally yet: fetch them, then retry.
        nltk.download("stopwords")
        nltk.download("punkt")
        return stopwords.words(language)
    except OSError as exc:
        raise LanguageNotFoundException(language) from exc
def add_nltk_stopwords(language: str, stop_words=None):
    """
    Combine language-specific NLTK stop words with an existing set.

    :param language: current text language
    :param stop_words: existing stop words; defaults to
        STOP_WORDS_NO_LANGUAGE when None
    :return: union of both stop-word collections
    """
    base = STOP_WORDS_NO_LANGUAGE if stop_words is None else stop_words
    return base.union(get_stopwords_from_nltk(language))
def spacing(text: str, chars: list):
    """
    Replace every occurrence of the given characters with a space.

    :param text: text to process
    :param chars: characters (or substrings) to replace
    :return: new text with each listed character turned into a space
    """
    result = text
    for symbol in chars:
        result = result.replace(symbol, " ")
    return result
def replacing(text: str, chars: list):
    """
    Delete every occurrence of the given characters from the text.

    :param text: text to process
    :param chars: characters (or substrings) to remove
    :return: new text without the listed characters
    """
    result = text
    for symbol in chars:
        result = result.replace(symbol, "")
    return result
def replace_yio(text):
    """
    Replace the Russian letter ё with е (both lower and upper case).

    Normalising ё→е lets words written with either spelling compare
    equal during tokenization.  The original version handled only
    lowercase "ё", so "Ёлка" escaped normalisation — fixed here.

    :param text: text to normalise
    :return: new text with "ё" replaced by "е" and "Ё" by "Е"
    """
    # str.translate performs both single-character replacements in one pass.
    return text.translate(str.maketrans({"ё": "е", "Ё": "Е"}))
def split_text_and_digits(text):
"""
Split words and digits
:param text: enter text
:return: list of separated texts
"""
regex = r"^\D+[0]\D+$" # so0os
match = re.search(regex, text, re.MULTILINE)
if match:
return [text]
# Проверяем на вольты и амперы 55В -> 55 v
regex = r"\d+[.]?\d?[в|а|В|А|B|A|a]{1}$"
match = re.search(regex, text, re.MULTILINE)
if match:
text = match.group()
chars = "вВB"
for char in chars:
text = text.replace(char, "v")
chars = "аАaA"
for char in chars:
text = text.replace(char, "ah")
# Делим цифры и буквы
regex = r"\D+|\d+"
texts = []
matches = re.finditer(regex, text, re.MULTILINE)
for _, match in enumerate(matches, start=1):
texts.append(match.group())
return texts
def get_normal_form(part_parse):
    """
    Extract the normal (dictionary) form from a pymorphy parse object.

    :param part_parse: pymorphy parse result
    :return: the word's normal form
    """
    normal = part_parse.normal_form
    return normal
class HashebleSet(set):
    """
    A set subclass that is hashable, so it can be used as a dict key.

    The hash is derived from ``frozenset(self)``, which is
    order-independent and therefore consistent with set equality.
    The original implementation hashed ``str(self)``, whose element
    rendering order is arbitrary, so two equal sets could produce
    different hashes and break dictionary lookups.
    """

    def __hash__(self):
        # frozenset gives an order-independent hash consistent with ==.
        return hash(frozenset(self))
def use_dictionary_multiple(tokens, dictionary):
    """
    Apply a replacement dictionary whose keys are sets of tokens.

    Whenever a key is fully contained in the current token set, that
    key's tokens are removed and the mapped tokens are added instead.
    Replacements are applied in dictionary iteration order.

    :param tokens: set of tokens to transform
    :param dictionary: mapping of token-set keys to replacement token sets
    :return: transformed token set
    """
    result = tokens
    for key, replacement in dictionary.items():
        if key.issubset(result):
            result = (result - key) | replacement
    return result
def remove_part_speech(part_parse, parts=None, dictionary=None):
    """
    Return the word's normal form, or None when its part of speech is filtered.

    Words whose normal form appears in the dictionary are always kept,
    bypassing the part-of-speech filter.

    :param part_parse: pymorphy parse object
    :param parts: set of part-of-speech tags to drop; defaults to
        {"INFN", "VERB"} (infinitives and personal verb forms).
        Tag reference:
        NOUN noun, ADJF full adjective, VERB personal verb form,
        INFN infinitive, NUMR numeral, PREP preposition,
        CONJ conjunction, PRCL particle
    :param dictionary: optional replacement dictionary keyed by
        HashebleSet of normal forms; default None
    :return: the normal form, or None when the word should be discarded
    """
    normal = get_normal_form(part_parse)
    # Dictionary entries are kept regardless of their part of speech.
    if dictionary and HashebleSet([normal]) in dictionary:
        return normal
    filtered = {"INFN", "VERB"} if parts is None else parts
    if any(tag in part_parse.tag for tag in filtered):
        return None
    return normal
def get_parsed_text(word: str):
    """
    Parse a word with the shared pymorphy analyzer.

    :param word: word to parse
    :return: the most probable pymorphy parse object (first candidate)
    """
    candidates = morph.parse(word)
    return candidates[0]
def tokenize(text: str, language: str, dictionary=None, remove_stopwords=True):
    """
    Turn a text into a set of normalised tokens.

    Pipeline: separator punctuation becomes whitespace, noise characters
    are stripped, ё is normalised, the text is split into words, each
    word is split into letter/digit parts, every part is reduced to its
    normal form (dropping verbs), and stop words are optionally removed.

    :param text: text to tokenize
    :param language: language used for stop words and word splitting
    :param dictionary: default = None.
        If you want to replace one words to others you can send the dictionary.
    :param remove_stopwords: default = True. Remove stopwords if True
    :return: set of tokens
    """
    # Separator-like punctuation becomes whitespace...
    text = spacing(text, [",", "/", "-", "=", "."])
    # ...while pure noise characters are removed outright.
    text = replacing(text, ["Ø", "¶", "”"])
    text = replace_yio(text)
    stop_words = add_nltk_stopwords(language)
    tokens = set()
    for word in word_tokenize(text, language=language):
        # A word may mix letters and digits: process each run separately.
        for part in split_text_and_digits(word):
            parsed = get_parsed_text(part)
            # Verbs are dropped; everything else is normalised.
            normal = remove_part_speech(parsed, dictionary=dictionary)
            if normal and (not remove_stopwords or normal not in stop_words):
                tokens.add(normal)
    if dictionary:
        # Apply multi-word replacements from the dictionary.
        tokens = use_dictionary_multiple(tokens, dictionary)
    return tokens
def prepare_dictionary(dictionary):
    """
    Convert a plain {str: str} dict into the internal replacement format.

    Each key and value phrase is split into words, every word is reduced
    to its normal form, and the key becomes a HashebleSet so the result
    can be matched against token sets.

    :param dictionary: mapping of source phrase to replacement phrase
    :return: dict mapping HashebleSet of normal forms to set of normal forms
    """
    def _normal_forms(phrase):
        # Reduce every word of the phrase to its normal form.
        return [get_normal_form(get_parsed_text(word)) for word in phrase.split()]

    return {
        HashebleSet(_normal_forms(source)): set(_normal_forms(target))
        for source, target in dictionary.items()
    }