-
Notifications
You must be signed in to change notification settings - Fork 247
/
text.py
152 lines (133 loc) · 4.92 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module of text utils for NLP package."""
import re
import string
import typing as t
import unicodedata
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Names exported by a star-import of this module. The helper functions
# remove_punctuation, normalize_unicode and remove_stopwords are defined in
# this module but are not listed here.
__all__ = [
    'break_to_lines_and_trim',
    'normalize_text',
    'hash_text',
    'normalize_samples',
    'hash_samples'
]
def break_to_lines_and_trim(s: str, max_lines: int = 10, min_line_length: int = 50,
                            max_line_length: int = 60) -> str:
    """Break a string to lines and trim it to a maximum number of lines.

    Each line is cut at the last whitespace delimiter found between
    ``min_line_length`` and ``max_line_length``. When no delimiter exists in
    that window the line is hard-broken and suffixed with ``'-'``; the break
    is moved back to the start of an unclosed HTML tag when possible so the
    tag is not split. If text remains after ``max_lines`` lines, ``'...'`` is
    appended to the last line.

    Parameters
    ----------
    s : str
        The string to break.
    max_lines : int, default 10
        The maximum number of lines to return.
    min_line_length : int, default 50
        The minimum length of a line.
    max_line_length : int, default 60
        The maximum length of a line.

    Returns
    -------
    str
        The produced lines joined with ``'<br>'``. An empty string is returned
        when ``max_lines`` is 0.
    """
    separating_delimiters = (' ', '\t', '\n', '\r')
    lines = []
    for _ in range(max_lines):
        if len(s) < max_line_length:
            # The remaining text fits on a single line: emit it and stop.
            lines.append(s.strip())
            break

        # Use a per-line limit so that shortening one line (below) never
        # shrinks the limit applied to the following lines.
        line_limit = min(max_line_length, len(s) - 1)

        # Look for the last delimiter inside [min_line_length, line_limit].
        for j in range(line_limit, min_line_length - 1, -1):
            if s[j] in separating_delimiters:
                lines.append(s[:j])
                s = s[j:].strip()
                break
        else:
            # No delimiter found: hard-break the line. If the cut would land
            # inside an unclosed HTML tag, cut right before the tag instead.
            # A tag starting at index 0 cannot be moved to the next line
            # without consuming nothing, so in that case the cut stays put
            # to guarantee progress.
            tag_start = re.search(r'<[^>]*$', s[:line_limit])
            if tag_start and tag_start.start() > 0:
                line_limit = tag_start.start()
            lines.append(s[:line_limit].strip() + '-')
            s = s[line_limit:].strip()
    else:
        # The line budget was exhausted; mark truncated text with an ellipsis.
        # `lines` is empty when max_lines == 0, so guard the index access.
        if s and lines:
            lines[-1] += '...'
    return '<br>'.join(lines)
def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
def normalize_unicode(text: str) -> str:
    """Return ``text`` converted to the Unicode NFKC normalization form."""
    normalization_form = 'NFKC'
    return unicodedata.normalize(normalization_form, text)
def remove_stopwords(text: str) -> str:
    """Drop English stop words from ``text``.

    The nltk 'stopwords' resource is requested first; if that request reports
    failure a warning is issued and ``text`` is returned unchanged. The nltk
    'punkt' resource is then requested: on success the text is split with
    ``word_tokenize``, otherwise with ``str.split``. Tokens are compared to
    the stop-word set in lower case and the remaining tokens are joined with
    single spaces.
    """
    if not nltk.download('stopwords', quiet=True):
        warnings.warn('nltk stopwords not found, stopwords won\'t be ignored when considering text duplicates.'
                      ' Please check your internet connection.')
        return text

    stop_words = set(stopwords.words('english'))
    tokenize = word_tokenize if nltk.download('punkt', quiet=True) else str.split
    kept_tokens = [token for token in tokenize(text) if token.lower() not in stop_words]
    return ' '.join(kept_tokens)
def normalize_text(
    text_sample: str,
    *,
    ignore_case: bool = True,
    remove_punct: bool = True,
    normalize_uni: bool = True,
    remove_stops: bool = True,
    ignore_whitespace: bool = False
) -> str:
    """Apply the enabled normalization steps to a single text sample.

    Steps run in a fixed order: lower-casing, punctuation removal, unicode
    normalization, stop-word removal and, last, removal of all whitespace.

    Parameters
    ----------
    text_sample : str
        The text to normalize.
    ignore_case : bool, default True
        Lower-case the text.
    remove_punct : bool, default True
        Strip punctuation via ``remove_punctuation``.
    normalize_uni : bool, default True
        Normalize unicode via ``normalize_unicode``.
    remove_stops : bool, default True
        Remove stop words via ``remove_stopwords``.
    ignore_whitespace : bool, default False
        Delete every whitespace run from the text.

    Returns
    -------
    str
        The normalized text.
    """
    normalized = text_sample
    if ignore_case:
        normalized = normalized.lower()
    if remove_punct:
        normalized = remove_punctuation(normalized)
    if normalize_uni:
        normalized = normalize_unicode(normalized)
    if remove_stops:
        normalized = remove_stopwords(normalized)
    if ignore_whitespace:
        # split() with no argument cuts on any whitespace run, so re-joining
        # the chunks without a separator removes all whitespace.
        chunks = normalized.split()
        normalized = ''.join(chunks)
    return normalized
def normalize_samples(
    text_samples: t.Sequence[str],
    *,
    ignore_case: bool = True,
    remove_punct: bool = True,
    normalize_uni: bool = True,
    remove_stops: bool = True,
    ignore_whitespace: bool = False
) -> t.List[str]:
    """Normalize every sample of a sequence with ``normalize_text``.

    All keyword options are forwarded unchanged to ``normalize_text`` for
    each sample; the results are returned as a list in input order.
    """
    # Collect the forwarded options once instead of spelling them per call.
    options = {
        'ignore_case': ignore_case,
        'remove_punct': remove_punct,
        'normalize_uni': normalize_uni,
        'remove_stops': remove_stops,
        'ignore_whitespace': ignore_whitespace,
    }
    return [normalize_text(sample, **options) for sample in text_samples]
def hash_text(text: str) -> int:
    """Return the built-in ``hash`` of a text sample.

    An assertion checks that ``text`` is a ``str`` instance before hashing.
    """
    assert isinstance(text, str)
    digest = hash(text)
    return digest
def hash_samples(text: t.Sequence[str]) -> t.List[int]:
    """Return the ``hash_text`` value of every sample, in input order.

    An assertion rejects a bare ``str`` argument, which would otherwise be
    iterated character by character.
    """
    assert not isinstance(text, str)
    hashes = []
    for sample in text:
        hashes.append(hash_text(sample))
    return hashes