In [2]:
import re
from collections import Counter

In [3]:
# function to tokenise words
def words(document):
    """Lower-case `document` and split it into word tokens.

    A token is a maximal run of regex word characters (letters, digits,
    underscore); everything else acts as a separator.

    Returns:
        list[str]: the tokens in order of appearance.
    """
    token_pattern = re.compile(r'\w+')
    lowered = document.lower()
    return token_pattern.findall(lowered)

In [4]:
# Build a frequency table (token -> count) over every word in the corpus.
# The file is opened in a `with` block so the handle is closed as soon as
# the text has been read, instead of being left to the garbage collector.
with open('big.txt') as corpus_file:
    all_words = Counter(words(corpus_file.read()))

In [5]:
# Look up how many times the token 'chair' occurs in the corpus.
# `all_words` is a Counter, so a token that never occurs yields 0 rather than a KeyError.
all_words['chair']

135

In [6]:
# Show the 10 most frequent tokens together with their counts.
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [7]:
def edits_one(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is one of: deleting a character, inserting a lowercase ASCII
    letter, replacing a character with a lowercase ASCII letter, or swapping
    two adjacent characters. Replacing a character with itself is included,
    so `word` is part of the result whenever it is non-empty.

    Returns:
        set[str]: all distinct candidate strings.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    # Walk every split point, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every split point.
        for ch in letters:
            candidates.add(head + ch + tail)
        if not tail:
            # Nothing to delete, replace or swap past the end of the word.
            continue
        # Deletion of the character right after the split.
        candidates.add(head + tail[1:])
        # Replacement of that character by each letter.
        for ch in letters:
            candidates.add(head + ch + tail[1:])
        # Transposition needs at least two characters to the right.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

In [8]:
def edits_two(word):
    """Lazily yield every string reachable from `word` by two successive edits.

    Applies `edits_one` to each result of `edits_one(word)`. The returned
    generator may produce the same string more than once; wrap it in `set()`
    to deduplicate.
    """
    first_pass = edits_one(word)
    return (twice for once in first_pass for twice in edits_one(once))

In [9]:
def known(words):
    """Keep only the entries of `words` that occur as keys in `all_words`.

    Returns:
        set[str]: the recognised words (empty if none are recognised).
    """
    return {candidate for candidate in words if candidate in all_words}

In [10]:
def possible_corrections(word):
    """Return candidate spelling corrections for `word`.

    Candidates are tried in order of increasing edit distance and the first
    non-empty group wins: the word itself if known, then known words one edit
    away, then known words two edits away. If all groups are empty the result
    is the single-element list `[word]`.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits_one(word))
    if one_away:
        return one_away

    two_away = known(edits_two(word))
    if two_away:
        return two_away

    return [word]

In [11]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens in `all_words`. The default is
    computed once, when this function is defined, so re-run this cell if
    `all_words` is rebuilt.
    """
    occurrences = all_words[word]
    return occurrences / N

In [12]:
# Size of the one-edit neighbourhood of "monney", followed by the neighbourhood itself.
# edits_one already returns a set, so the extra set() does not change the count.
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'morney', 'molnney', 'monhey', 'monnaey', 'mnney', 'moqney', 'monnel', 'monneyu', 'monneyy', 'monnhey', 'monnsey', 'monnuey', 'omonney', 'mowney', 'mgnney', 'lonney', 'monnegy', 'monneyc', 'monneoy', 'mocnney', 'monnem', 'mounney', 'monkney', 'monneyq', 'mdonney', 'monnet', 'wonney', 'eonney', 'moqnney', 'monnefy', 'tonney', 'vmonney', 'monaey', 'monnej', 'mosney', 'monnea', 'monnxey', 'fmonney', 'monnehy', 'wmonney', 'mononey', 'monwey', 'monnkey', 'monnepy', 'monley', 'monneyn', 'monnyey', 'moiney', 'monneby', 'monnecy', 'monneym', 'monneyw', 'conney', 'mojnney', 'moinney', 'gmonney', 'monnxy', 'mwnney', 'mxonney', 'molney', 'monneyt', 'mooney', 'mobney', 'mionney', 'monney', 'uonney', 'monnhy', 'mopney', 'mbonney', 'monzney', 'yonney', 'bmonney', 'monndy', 'oonney', 'muonney', 'menney', 'monnry', 'monneyb', 'xonney', 'monnez', 'monnley', 'maonney', 'monnef', 'monuney', 'mynney', 'monyney', 'monfey', 'monnbey', 'donney', 'monpey', 'monneuy', 'moniney', 'mownney', 'monnpy', 'monn

In [13]:
# Of all one-edit candidates for "monney", keep only those present in the corpus vocabulary.
print(known(edits_one("monney")))

{'money', 'monkey'}


In [15]:
# Compare candidate pools for "monney": the number of distinct strings two edits
# away, then the known words at one edit (repeated here for comparison) and the
# known words at two edits.
print(len(set(edits_two("monney"))))
print(known(edits_one("monney")))
print(known(edits_two("monney")))

51013
{'money', 'monkey'}
{'olney', 'monkeys', 'donne', 'tonne', 'bonnet', 'moines', 'morley', 'convey', 'donned', 'moaned', 'motley', 'donkey', 'bonny', 'manned', 'monday', 'bonne', 'monger', 'money', 'manner', 'honey', 'monkey'}


In [17]:
# Candidate corrections for "monney": the closest non-empty group of known words.
print(possible_corrections("monney"))

{'money', 'monkey'}


In [16]:
# Relative corpus frequency of each candidate correction; the candidate with
# the larger value is the more likely intended word.
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [17]:
def spell_check(word):
    """Return a message with the most probable spelling correction for `word`.

    The best candidate is the entry of `possible_corrections(word)` with the
    highest `prob`.

    Returns:
        str: "Did you mean <candidate>?" when a different known word is the
        best match; "Correct spelling." when `word` itself is in the
        vocabulary; "No suggestions found." when `word` is unknown and no
        known word lies within two edits of it.
    """
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    # possible_corrections falls back to [word] when nothing is recognised, so
    # getting the input back does not by itself prove the spelling is valid.
    if word not in all_words:
        return "No suggestions found."
    return "Correct spelling."

In [18]:
# Try spell_check on the misspelled input "monney".
print(spell_check("monney"))

Did you mean money?


In [38]:
import numpy as np 
  
# function to get unique values 
def unique(list1):
    """Return the sorted distinct values of `list1` as a NumPy array."""
    as_array = np.array(list1)
    distinct = np.unique(as_array)
    return distinct

In [40]:
# Number of candidate corrections for "emfasize". A result of 1 can also be the
# fallback [word] that possible_corrections returns when no known candidate exists.
len(possible_corrections("emfasize"))

1

In [42]:
import nltk
# Fetch NLTK's 'tagsets' resource (the log below shows it being saved to the local nltk_data folder).
nltk.download('tagsets')

# Print the tag descriptions and example tokens shown in the output below.
nltk.help.upenn_tagset()

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data]   Unzipping help\tagsets.zip.
