/
preprocess.py
251 lines (212 loc) · 8.59 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# -*- coding: utf-8 -*-
"""
Functions that normalize raw text, replacing contractions, URLs, emails,
phone numbers, and currency symbols with standardized forms; each function
returns a new string (Python strings are immutable). These should be
applied before processing by spaCy, but be warned: preprocessing may affect
the interpretation of the text -- and spaCy's processing of it.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import re
import unicodedata
from . import constants
def normalize_unicode(text, form="NFC"):
    """
    Normalize the unicode characters in ``text`` to the given canonical form.

    Args:
        text (str): Raw text.
        form ({"NFC", "NFD", "NFKC", "NFKD"}): Unicode normalization form.
            For example, an "e" plus a combining acute accent "´" may be
            written decomposed as "e´" ("NFD") or composed as "é" ("NFC").
            "NFC"/"NFD" normalization never changes meaning, so "NFC" is
            usually a safe default; the "NFK*" compatibility forms may alter
            characters' meanings (e.g. an ellipsis char becomes three periods).

    Returns:
        str: ``text`` with all characters normalized to ``form``.

    See Also:
        https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
    """
    normalized = unicodedata.normalize(form, text)
    return normalized
def normalize_whitespace(text):
    """
    Collapse whitespace in ``text``: one or more linebreaks become a single
    newline, one or more (non-breaking) spaces become a single space, and
    leading/trailing whitespace is stripped.
    """
    # collapse linebreaks first, then spacing, then trim the ends
    text = constants.RE_LINEBREAK.sub(r"\n", text)
    text = constants.RE_NONBREAKING_SPACE.sub(" ", text)
    return text.strip()
def unpack_contractions(text):
    """
    Replace *English* contractions in ``text`` with their unshortened forms.

    N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
    so are left as-is.
    """
    # ordered (pattern, replacement) pairs; group 1 is a word boundary,
    # group 2 the contraction's stem. Order matches the original cascade:
    # standard forms first, then non-standard ones.
    substitutions = (
        # standard
        (r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not"),
        (r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will"),
        (r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are"),
        (r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have"),
        # non-standard
        (r"(\b)([Cc]a)n't", r"\1\2n not"),
        (r"(\b)([Ii])'m", r"\1\2 am"),
        (r"(\b)([Ll]et)'s", r"\1\2 us"),
        (r"(\b)([Ww])on't", r"\1\2ill not"),
        (r"(\b)([Ss])han't", r"\1\2hall not"),
        (r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all"),
    )
    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text)
    return text
def replace_urls(text, replace_with="*URL*"):
    """Replace all URLs in ``text`` with ``replace_with``."""
    # short-form URLs first, then the general pattern, as in the original cascade
    text = constants.RE_SHORT_URL.sub(replace_with, text)
    return constants.RE_URL.sub(replace_with, text)
def replace_emails(text, replace_with="*EMAIL*"):
    """Replace every email address in ``text`` with ``replace_with``."""
    pattern = constants.RE_EMAIL
    return pattern.sub(replace_with, text)
def replace_phone_numbers(text, replace_with="*PHONE*"):
    """Replace every phone number in ``text`` with ``replace_with``."""
    pattern = constants.RE_PHONE
    return pattern.sub(replace_with, text)
def replace_numbers(text, replace_with="*NUMBER*"):
    """Replace every number-like string in ``text`` with ``replace_with``."""
    pattern = constants.RE_NUMBERS
    return pattern.sub(replace_with, text)
def replace_currency_symbols(text, replace_with=None):
    """
    Replace all currency symbols in ``text`` with string specified by ``replace_with``.

    Args:
        text (str): Raw text.
        replace_with (str): If None (default), replace symbols with
            their standard 3-letter abbreviations (e.g. "$" with "USD", "£" with "GBP");
            otherwise, pass in a string with which to replace all symbols
            (e.g. "*CURRENCY*").

    Returns:
        str
    """
    if replace_with is not None:
        # single regex pass replaces every symbol with the given string
        return constants.RE_CURRENCY.sub(replace_with, text)
    # default: swap each symbol for its standard 3-letter currency code
    for symbol, abbrev in constants.CURRENCIES.items():
        text = text.replace(symbol, abbrev)
    return text
def remove_punct(text, marks=None):
    """
    Remove punctuation from ``text`` by replacing all instances of ``marks``
    with whitespace.

    Args:
        text (str): Raw text.
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=",;:"`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.

    Returns:
        str

    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    """
    if not marks:
        # one C-level pass over the string via the precomputed translation table
        return text.translate(constants.PUNCT_TRANSLATE_UNICODE.data)
    pattern = "[{}]+".format(re.escape(marks))
    return re.sub(pattern, " ", text, flags=re.UNICODE)
def remove_accents(text, method="unicode"):
    """
    Remove accents from any accented unicode characters in ``text``, either by
    transforming them into ascii equivalents or removing them entirely.

    Args:
        text (str): Raw text.
        method ({"unicode", "ascii"}): If "unicode", remove accented char
            for any unicode symbol with a direct ASCII equivalent; if "ascii",
            remove accented char for any unicode symbol.

            Note: The "ascii" method is notably faster than "unicode", but less good.

    Returns:
        str

    Raises:
        ValueError: If ``method`` is not in {"unicode", "ascii"}.
    """
    if method == "unicode":
        # NFKD decomposes each accented char into base char + combining
        # mark(s); dropping the combining marks leaves the unaccented base
        return "".join(
            c
            for c in unicodedata.normalize("NFKD", text)
            if not unicodedata.combining(c)
        )
    elif method == "ascii":
        # encode with errors="ignore" silently drops *every* non-ascii char,
        # accented or otherwise -- faster, but lossier than the unicode method
        return (
            unicodedata.normalize("NFKD", text)
            .encode("ascii", errors="ignore")
            .decode("ascii")
        )
    else:
        # bug fix: message previously read '"unicode" and "ascii"'; with
        # "either" the correct conjunction is "or"
        msg = '`method` must be either "unicode" or "ascii", not {}'.format(method)
        raise ValueError(msg)
def preprocess_text(
    text,
    normalize_unicode=False,
    lowercase=False,
    no_urls=False,
    no_emails=False,
    no_phone_numbers=False,
    no_numbers=False,
    no_currency_symbols=False,
    no_punct=False,
    no_contractions=False,
    no_accents=False,
):
    """
    Normalize various aspects of raw text before parsing it with spaCy.
    A convenience function for applying all other preprocessing functions in one go.

    Args:
        text (str): Raw text to preprocess.
        normalize_unicode (bool): If True, normalize unicode characters in text
            into canonical ("NFC") form.
        lowercase (bool): If True, all text is lower-cased
        no_urls (bool): If True, replace all URL strings with "*URL*"
        no_emails (bool): If True, replace all email strings with "*EMAIL*"
        no_phone_numbers (bool): If True, replace all phone number strings
            with "*PHONE*"
        no_numbers (bool): If True, replace all number-like strings
            with "*NUMBER*"
        no_currency_symbols (bool): If True, replace all currency symbols
            with their standard 3-letter abbreviations
        no_punct (bool): If True, remove all punctuation (replace with
            empty string)
        no_contractions (bool): If True, replace *English* contractions
            with their unshortened forms
        no_accents (bool): If True, replace all accented characters
            with unaccented versions
    Returns:
        str: input ``text`` processed according to function args

    Warning:
        These changes may negatively affect subsequent NLP analysis performed
        on the text, so choose carefully, and preprocess at your own risk!
    """
    if normalize_unicode is True:
        # bug fix: the ``normalize_unicode`` parameter shadows the module-level
        # function of the same name, so calling ``normalize_unicode(text, ...)``
        # here raised TypeError ('bool' object is not callable); call
        # ``unicodedata.normalize`` directly instead (equivalent behavior)
        text = unicodedata.normalize("NFC", text)
    if no_urls is True:
        text = replace_urls(text)
    if no_emails is True:
        text = replace_emails(text)
    if no_phone_numbers is True:
        text = replace_phone_numbers(text)
    if no_numbers is True:
        text = replace_numbers(text)
    if no_currency_symbols is True:
        text = replace_currency_symbols(text)
    # unpack contractions *before* stripping accents/punctuation, since both
    # of those transforms would mangle the apostrophes the patterns rely on
    if no_contractions is True:
        text = unpack_contractions(text)
    if no_accents is True:
        text = remove_accents(text, method="unicode")
    if no_punct is True:
        text = remove_punct(text)
    if lowercase is True:
        text = text.lower()
    # always normalize whitespace; treat linebreaks separately from spacing
    text = normalize_whitespace(text)
    return text