-
Notifications
You must be signed in to change notification settings - Fork 2
/
binsearch.py
102 lines (93 loc) · 3.98 KB
/
binsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""Spell checker.
Derived from faster0.py; tuned for faster startup but slower per-word time.
>>> from binsearch import Speller
>>> import urllib, os
>>> s = Speller()
>>> s.cached_load('bigdict',
... lambda: (open('big.txt') if os.path.exists('big.txt')
... else urllib.urlopen('http://norvig.com/big.txt')).read())
>>> list(s.proofread('gort'))[0].suggestions
['got', 'sort', 'port']
"""
import bisect, collections, cPickle, heapq, operator, re
## s = Speller()
## s.train('the apple is the curse of god')
## for x in s.proofread('i am the devil'): print x
#. (at 0: i -> is)
#. (at 2: am -> of/is)
#. (at 9: devil -> )
class Speller:
def __init__(self):
self._words, self._counts = [], []
def cached_load(self, filename, get_corpus): # XXX too-fancy interface
try:
self.load(open(filename))
except IOError:
self.train(get_corpus())
self.save(open(filename, 'w'))
def train(self, text):
# TODO: make incremental training efficient
old = dict(zip(self._words, self._counts))
counter = collections.Counter(word for pos, word in words(text))
self._words.extend(counter.keys())
self._words.sort()
self._counts = [old.get(w, 0) + counter[w] for w in self._words]
def save(self, file):
cPickle.dump((self._words, self._counts), file)
def load(self, file):
self._words, self._counts = cPickle.load(file)
def proofread(self, text):
for pos, word in words(text):
i = bisect.bisect_left(self._words, word)
if 0 <= i < len(self._words) and self._words[i] == word: continue
edits = self._edits('', word, 1) or self._edits('', word, 2)
suggestions = heapq.nlargest(3, edits, key=self._get_count)
yield Mistake(self, pos, word, suggestions)
def _get_count(self, word):
i = bisect.bisect_left(self._words, word)
if i < len(self._words) and self._words[i] == word:
return self._counts[i]
else:
return 0
def _has_prefix(self, head):
i = bisect.bisect_left(self._words, head)
return i < len(self._words) and self._words[i].startswith(head)
def _edits(self, head, tail, distance):
# Return a set of the words in self._words that are at the
# given edit distance from head+tail (with edits restricted to
# the tail and to deletions on the right of the head).
# Precondition: head is '' or a prefix of a word in self._words
if 0 == distance:
return [head+tail] if self._get_count(head+tail) else []
exts = [head + c for c in alphabet if self._has_prefix(head + c)]
deletes = ([] if not head else
[self._edits(head[:-1], tail, distance - 1)])
inserts = [self._edits(p, tail, distance - 1) for p in exts]
if not tail:
return flatten(deletes + inserts)
transposes = ([self._edits(head, tail[1]+tail[0]+tail[2:], distance-1)]
if 2 <= len(tail) and tail[0] != tail[1]
else [])
replaces = [self._edits(p, tail[1:],
distance if p[-1] == tail[0] else distance-1)
for p in exts]
return flatten(deletes + transposes + replaces + inserts)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def flatten(xss):
return reduce(operator.or_, map(set, xss), set())
class Mistake:
def __init__(self, corrector, position, word, suggestions):
self.corrector = corrector
self.position = position
self.word = word
self.suggestions = suggestions
def should_have_been(self, word):
if word not in self.suggestions:
self.corrector.train(word)
def __repr__(self):
return ("(at %d: %s -> %s)"
% (self.position, self.word,
'/'.join(self.suggestions)))
def words(text):
return ((m.start(), m.group(0))
for m in re.finditer(r'[a-z]+', text.lower()))