-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.py
67 lines (62 loc) · 2.38 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
class Tokenizer(object):
    """
    Tokenizes and preprocesses a sample's text, returning a list of tokens.

    Preprocessing normalises the text before it is split: escaped newlines
    and backslashes are stripped, @-mentions become ``USER``, hashtags
    become ``HASHTAG``, simple emoticons are removed, runs of three or more
    identical characters are shortened to exactly three, and URLs become
    ``URL``.

    Largely adapted from Christopher Potts' sentiment tokenizer:
    http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py
    url pattern adapted from http://stackoverflow.com/a/3809435/5818736
    """

    # The text type on both Python 2 (``unicode``) and Python 3 (``str``).
    _text_type = type(u'')

    def __init__(self):
        # Kept as a plain string (not compiled) so the attribute's type is
        # unchanged for any external code that reads it.
        self.emoticon_pattern = r'[=:;]-?\s?[\)\(D]'
        self.repeated_pattern = re.compile(r'(.)\1{2,}')
        self.user_pattern = re.compile(r'@+[\w_]+')
        self.hashtag_pattern = re.compile(r'\#+[\w_]+[\w\'_\-]*[\w_]+')
        self.url_pattern = re.compile(
            r'(https?:\/\/(www\.)?)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.'
            r'[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)')
        word_pattern = r"""
            (?:<[^>]+>)                    # HTML tags
            |
            (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
            |
            (?:[\w_\-']+)                  # Words
            |
            (?:\.(?:\s*\.){1,})            # Ellipsis dots.
            |
            (?:\S)                         # Everything else that isn't whitespace.
            """
        self.word_pattern = re.compile(word_pattern,
                                       re.VERBOSE | re.I | re.U)

    def _to_text(self, text):
        """Coerce ``text`` to a unicode string on both Python 2 and 3."""
        if isinstance(text, self._text_type):
            return text
        if isinstance(text, (bytes, bytearray)):
            # Byte strings are decoded as UTF-8; undecodable bytes are
            # replaced rather than raising, so tokenizing never fails on
            # badly encoded input.
            return text.decode('utf-8', 'replace')
        # Any other object (numbers, None, ...) is stringified.
        return self._text_type(text)

    def tokenize(self, text):
        """
        Parameters
        ----------
        text: string to tokenize (byte strings are decoded as UTF-8,
            other objects are converted to their string form)

        Returns
        -------
        tokens: list of tokens
        """
        text = self._to_text(text)
        # Turn escaped newlines (a literal backslash followed by "n") into
        # a space so the neighbouring words stay separate tokens. This must
        # run BEFORE the backslashes are stripped, otherwise there is
        # nothing left to match and a stray "n" is glued into the text.
        text = re.sub(r'\\n', ' ', text)
        # strip the remaining slashes
        text = re.sub(r'\\', '', text)
        # map usernames to USER
        text = self.user_pattern.sub('USER', text)
        # map hashtags to HASHTAG
        text = self.hashtag_pattern.sub('HASHTAG', text)
        # remove emoticons
        text = re.sub(self.emoticon_pattern, '', text)
        # map sequences of length >= 3 of the same character
        # to sequences of length 3 of the character
        text = self.repeated_pattern.sub(r'\g<1>\g<1>\g<1>', text)
        # map urls to URL
        text = self.url_pattern.sub('URL', text)
        # word_pattern has only non-capturing groups, so findall returns
        # the full text of every match.
        return self.word_pattern.findall(text)