-
Notifications
You must be signed in to change notification settings - Fork 7
/
Spelling.py
219 lines (184 loc) · 8.2 KB
/
Spelling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import operator,os,pickle
from phonetics import metaphone
from Dictionary import Dictionary
# See http://www.korokithakis.net/node/87 (a useful page about Levenshtein distance; it links to http://norvig.com/spell-correct.html, which is very useful).
# For a better, in-context algorithm, see http://l2r.cs.uiuc.edu/~danr/Papers/Abstracts/spellJ.html
class Spelling:
    """
    Suggest corrections for a word using two strategies: words that *look*
    like it (one or two character edits away) and words that *sound* like it
    (same metaphone key), both ranked by frequency in a word-count file.
    """

    def __init__(self):
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'
        self.guten = "data/gutenburg_small.txt"
        self.guten_pickle = "data/gutenburg_small.pickle"
        # word -> occurrence count (kept as the string read from the file)
        self.gutenburg = {}
        # metaphone key -> {word: occurrence count as int}
        self.learned = {}
        self.dictionary = Dictionary("usa")
        self.stopwords = Dictionary("stopwords")
        self.Load_gutenburg()
        self.Load_learned()

    def Load_gutenburg(self):
        """
        Read the word-count file at 'self.guten' into 'self.gutenburg'.

        Only lines made of exactly two space-separated fields are used; the
        first field is stored as the count and the second as the word.
        """
        with open(self.guten, encoding='utf-8') as dictionary_file:
            for line in dictionary_file:
                fields = line.strip().split(" ")
                if len(fields) == 2:
                    count, word = fields
                    self.gutenburg[word] = count

    def Load_learned(self):
        """
        Load the metaphone index, and if it doesn't exist, create it.

        Does nothing when 'self.learned' is already populated. Otherwise the
        index is read from the pickle cache when that file exists, or built
        from 'self.gutenburg' and written to the cache when non-empty.
        """
        if self.learned:
            return
        if os.path.exists(self.guten_pickle):
            # NOTE: unpickling can execute arbitrary code; this cache file
            # must only ever come from a trusted location.
            with open(self.guten_pickle, 'rb') as infile:
                self.learned = pickle.load(infile)
            return
        if not self.gutenburg:
            # Previously called a non-existent 'Load_dict' method.
            self.Load_gutenburg()
        for word, times in self.gutenburg.items():
            meta = metaphone(word.replace("'", ""))
            self.learned.setdefault(meta, {}).setdefault(word, int(times))
        if self.learned:
            with open(self.guten_pickle, 'wb') as outfile:
                pickle.dump(self.learned, outfile)

    def slight_edits(self, word):
        """
        Find all edits within one character of the word.

        Returns a set of the strings produced by deleting one character,
        swapping two adjacent characters, replacing one character with a
        letter of 'self.alphabet', or inserting one such letter.
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts = [a + c + b for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def letters_off(self, word):
        """
        Find all edits within two characters of the word.
        """
        return {e2 for e1 in self.slight_edits(word) for e2 in self.slight_edits(e1)}

    def known_guten(self, words):
        """
        Return the set of 'words' that are in the gutenburg table with a
        count greater than 100.
        """
        return {w for w in words if w in self.gutenburg and int(self.gutenburg[w]) > 100}

    def known_usa(self, words):
        """
        Return the set of 'words' that the USA dictionary reports as contained.
        """
        # 'Contains' is defined outside this file; the explicit comparison is
        # kept because its return type is not visible here.
        return {w for w in words if self.dictionary.Contains(w) == True}

    def highest_likely(self, words):
        """
        Rank 'words' by their count in 'self.gutenburg', highest first.

        Words missing from the table are dropped. Returns the ranked list, or
        False when none of the words are in the table. Used in max_look_like
        and Check.
        """
        counts = {w: int(self.gutenburg[w]) for w in words if w in self.gutenburg}
        # sorted() is stable, so equally frequent words keep their input order.
        ranked = sorted(counts, key=counts.get, reverse=True)
        return ranked or False

    def max_look_like(self, word, fast=False):
        """
        Find the words that are likely intended when the user knows how to
        spell the word but accidentally typed it wrong.

        Candidates are tried in order and the first non-empty group wins: the
        word itself, its one-edit variants, then its two-edit variants, each
        filtered through the USA dictionary. Unless 'fast' is truthy, the
        one- and two-edit variants are then tried against the gutenburg
        table. If every group is empty the word itself is the only candidate.

        Returns the candidates ranked by highest_likely, or False.
        """
        candidates = self.known_usa([word])
        if not candidates:
            # Each edit set is built at most once; the two-edit set is large.
            one_off = self.slight_edits(word)
            candidates = self.known_usa(one_off)
            if not candidates:
                two_off = self.letters_off(word)
                candidates = self.known_usa(two_off)
                if not candidates and not fast:
                    candidates = self.known_guten(one_off) or self.known_guten(two_off)
        return self.highest_likely(candidates or [word])

    def max_sound_like(self, word):
        """
        Find the words that are likely intended when the user does not know
        how to spell the word but knows how it sounds.

        Returns the indexed words sharing the word's metaphone key, most
        frequent first, or False when there are none or when the word itself
        is one of them.
        """
        # Strip apostrophes exactly as Load_learned does when building the
        # index, so lookup keys are computed the same way as stored keys.
        meta = metaphone(word.replace("'", ""))
        same_sound = self.learned.get(meta)
        if not same_sound or word in same_sound:
            return False
        return sorted(same_sound, key=same_sound.get, reverse=True)

    def Check(self, word, dictionary=False, fast=False):
        """
        Figure out what the user probably is looking for.

        word - the word to check
        dictionary - if truthy, return the list of possible corrections
                     instead of only the best one
        fast - passed through to max_look_like

        Returns the best candidate (or the candidate list), or False when the
        word is empty or there are no candidates.
        """
        if len(word) == 0:
            return False
        look = self.max_look_like(word.lower(), fast=fast)
        sound = self.max_sound_like(word)
        if not look and not sound:
            return False
        if not look:
            candidates = sound
        elif not sound:
            candidates = look
        else:
            # Prefer words proposed by both strategies; otherwise rank the
            # look-alikes together with the top ten sound-alikes that are
            # dictionary words and not stopwords.
            shared = set(look) & set(sound)
            if shared:
                candidates = self.highest_likely(shared)
            else:
                extras = [
                    ent for ent in sound[:10]
                    if self.dictionary.Contains(ent) == True
                    and self.stopwords.Contains(ent) == False
                ]
                candidates = self.highest_likely(look + extras)
        if not candidates:
            return False
        return candidates if dictionary else candidates[0]

    def Frequency(self, word):
        """
        Return the word's gutenburg count divided by the number of entries in
        the gutenburg table, or 0 when the word is not in the table.
        """
        if word in self.gutenburg:
            return int(self.gutenburg[word]) / len(self.gutenburg)
        return 0
if __name__ == '__main__':
    # Manual smoke test: print the suggested correction for a few misspellings.
    checker = Spelling()
    for sample in ("thrue", "williage", "weedend"):
        print(checker.Check(sample))
    # works: print(checker.Check("through"))
    # works: print(checker.Check("qwickly"))