# text.py
import re, string
from difflib import SequenceMatcher, _calculate_ratio
def full_split(text, regex):
    """
    Split the text by the regex, keeping all parts.
    The parts should re-join back into the original text.
    >>> list(full_split('word', re.compile('&.*?')))
    ['word']
    """
    remaining = text
    while remaining:
        match = regex.search(remaining)
        if match is None:
            # No further matches: the tail is one final piece.
            yield remaining
            return
        start, end = match.span()
        if start:
            # Unmatched prefix before the match.
            yield remaining[:start]
        if end > start:
            # The matched span itself (skipped if zero-width).
            yield remaining[start:end]
        remaining = remaining[end:]
def multi_split(text, regexes):
    r"""
    Split the text by the given regexes, in priority order.
    Make sure that the regex is parenthesized so that matches are returned in
    re.split().
    Splitting on a single regex works like normal split.
    >>> '|'.join(multi_split('one two three', [r'\w+']))
    'one| |two| |three'
    Splitting on digits first separates the digits from their word
    >>> '|'.join(multi_split('one234five 678', [r'\d+', r'\w+']))
    'one|234|five| |678'
    Splitting on words first keeps the word with digits intact.
    >>> '|'.join(multi_split('one234five 678', [r'\w+', r'\d+']))
    'one234five| |678'
    """
    def make_regex(s):
        # Accept both pattern strings and pre-compiled pattern objects.
        # (Python 3 fix: the original tested `isinstance(s, basestring)`,
        # which raises NameError on Python 3 — `str` is the equivalent.)
        return re.compile(s) if isinstance(s, str) else s
    regexes = [make_regex(r) for r in regexes]
    # Run the list of pieces through the regex split, splitting it into more
    # pieces. Once a piece has been matched, add it to finished_pieces and
    # don't split it again. The pieces should always join back together to form
    # the original text.
    piece_list = [text]
    finished_pieces = set()
    def apply_re(regex, piece_list):
        for piece in piece_list:
            if piece in finished_pieces:
                # Already claimed by a higher-priority regex; pass through.
                yield piece
                continue
            for s in full_split(piece, regex):
                if regex.match(s):
                    # This piece was produced by a match; freeze it so lower
                    # priority regexes cannot split it further.
                    finished_pieces.add(s)
                if s:
                    yield s
    for regex in regexes:
        piece_list = list(apply_re(regex, piece_list))
    # Invariant: splitting never loses or reorders characters.
    assert ''.join(piece_list) == text
    return piece_list
# A special case list of contractions and other words that should be grouped.
# Case insensitive.
# NOTE: these are joined into a single alternation regex below, so each entry
# must be a plain literal (it is re.escape()'d at compile time).
_word_list = [
    "i'm", "i'll", "i'd", "i've", "you're", "you'll", "you'd", "you've",
    "he's", "he'll", "he'd", "she's", "she'll", "she'd", "it's", "it'll",
    "it'd", "we're", "we'll", "we'd", "we've", "they're", "they'll", "they'd",
    "they've", "there's", "there'll", "there'd", "that's", "that'll", "that'd",
    "ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't",
    "hadn't", "hasn't", "isn't", "mustn't", "needn't", "shouldn't", "wasn't",
    "weren't", "won't", "wouldn't",
]
# Tokenization patterns in priority order: earlier patterns claim their text
# first and lower-priority patterns cannot re-split it (see multi_split).
_word_split_regexes = [
    # HTML Entities
    re.compile(r'&(\w+|#\d+);', re.IGNORECASE),
    # Special cases.
    re.compile('%s' % '|'.join(re.escape(c) for c in _word_list), re.IGNORECASE),
    # Simplified phone number pattern. Any dash-separated list of digits.
    re.compile(r'\d+(-\d+)+'),
    # Simplified date pattern. Any slash-separated list of digits.
    re.compile(r'\d+(/\d+)+'),
    # Numbers
    re.compile(r'\d+'),
    # Punctuation
    re.compile(r'[%s]' % re.escape(string.punctuation)),
    # Words
    re.compile(r'\w+', re.UNICODE),
    # Anything else that isn't whitespace
    re.compile(r'[^\s]+', re.UNICODE),
]
def split_text(text):
    """Tokenize *text* using the module's priority-ordered split patterns."""
    return multi_split(text, _word_split_regexes)
# Common English stopwords ignored when measuring text similarity.
_stopwords = set(
    'a an and as at by for if in it of or so the to'.split()
)

def is_text_junk(word):
    """Treat whitespace and stopwords as junk for text matching."""
    if word.isspace():
        return True
    return word.lower() in _stopwords
class WordMatcher(SequenceMatcher):
    """
    WordMatcher is a SequenceMatcher that can measure the similarity of
    sequences of words based on the total length of matching words.
    """

    def __init__(self, isjunk=is_text_junk, a=None, b=None):
        # None sentinels avoid sharing mutable default lists between calls.
        if a is None:
            a = []
        if b is None:
            b = []
        super().__init__(isjunk, a, b)

    def text_ratio(self):
        """Return a measure of the sequences' word similarity (float in [0,1]).
        Each word has weight equal to its length for this measure
        >>> m = WordMatcher(a=['abcdef', '12'], b=['abcdef', '34']) # 3/4 of the text is the same
        >>> '%.3f' % m.ratio() # normal ratio fails
        '0.500'
        >>> '%.3f' % m.text_ratio() # text ratio is accurate
        '0.750'
        """
        # Same formula as difflib's private _calculate_ratio(), inlined here
        # so we no longer depend on an undocumented private API:
        #   2.0 * matches / total, or 1.0 when both sequences are empty.
        total = self._text_length(self.a) + self._text_length(self.b)
        if not total:
            return 1.0
        return 2.0 * self.match_length() / total

    def match_length(self):
        """Find the total length of all words that match between the two sequences."""
        length = 0
        for a_start, _b_start, size in self.get_matching_blocks():
            length += self._text_length(self.a[a_start:a_start + size])
        return length

    def _text_length(self, word_sequence):
        # Find the length of non-junk text in the sequence.
        return sum(self._word_length(word) for word in word_sequence)

    def _word_length(self, word):
        # Junk words (per the isjunk predicate) contribute zero weight.
        if self.isjunk and self.isjunk(word):
            return 0
        return len(word)