In [1]:
from textblob import Word

In [2]:
s = 'fianlly'
s

'fianlly'

In [3]:
w = Word(s)
w.spellcheck()

[('finally', 1.0)]

In [4]:
w.correct()

'finally'

In [5]:
s = 'flaot'
s

'flaot'

In [6]:
w = Word(s)
w.spellcheck()

[('flat', 0.85), ('float', 0.15)]

In [7]:
w.correct()

'flat'

In [8]:
s = 'fianlly'
s

'fianlly'

In [3]:
import re, collections

def tokens(text):
    """
    Split a text into its words.

    The text is lowercased first, then every maximal run of ASCII
    letters is extracted. Digits, punctuation and whitespace act as
    separators and are discarded.

    Returns a list of words in order of appearance (duplicates kept).
    """
    lowered = text.lower()
    word_pattern = re.compile('[a-zA-Z]+')
    return word_pattern.findall(lowered)

# Read the corpus inside a context manager so the file handle is closed
# as soon as the text has been read, instead of being left open.
with open('big.txt') as corpus_file:
    WORDS = tokens(corpus_file.read())
# Frequency table: word -> number of occurrences in the corpus.
WORD_COUNTS = collections.Counter(WORDS)
# top 10 words in corpus
WORD_COUNTS.most_common(10)

[('the', 80030),
 ('of', 40025),
 ('and', 38313),
 ('to', 28766),
 ('in', 22050),
 ('a', 21155),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [5]:
WORD_COUNTS.get('faintly')

6

In [10]:
def edits0(word):
    """
    Zero-edit candidates for a word.

    The only string at edit distance zero is the word itself, so the
    result is a one-element set containing the input.
    """
    unchanged = set()
    unchanged.add(word)
    return unchanged

In [11]:
edits0(s)

{'fianlly'}

In [12]:
def splits(word):
    """
    Cut the word at every position, including both ends.

    Returns a list of (first, rest) tuples such that first + rest
    equals the word; a word of length n yields n + 1 pairs, starting
    with ('', word) and ending with (word, '').
    """
    pieces = []
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        pieces.append((head, tail))
    return pieces

In [13]:
splits(s)

[('', 'fianlly'),
 ('f', 'ianlly'),
 ('fi', 'anlly'),
 ('fia', 'nlly'),
 ('fian', 'lly'),
 ('fianl', 'ly'),
 ('fianll', 'y'),
 ('fianlly', '')]

In [14]:
def edits1(word):
    """
    Build the set of strings reachable from the word with a single
    edit: deleting one character, swapping two adjacent characters,
    replacing one character with a lowercase letter, or inserting a
    lowercase letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    candidates = set()
    for head, tail in splits(word):
        # Insertion is possible at every split point, including the end.
        for letter in alphabet:
            candidates.add(head + letter + tail)
        if not tail:
            # Nothing after the split point to delete, replace or swap.
            continue
        # Deletion: drop the first character of the tail.
        candidates.add(head + tail[1:])
        # Replacement: substitute the first character of the tail.
        for letter in alphabet:
            candidates.add(head + letter + tail[1:])
        # Transposition: needs at least two characters in the tail.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

In [15]:
print(list(edits1(s))[:20])

['fizanlly', 'fianlby', 'fsanlly', 'fiancly', 'fianllx', 'fianslly', 'ifanlly', 'fianylly', 'fpanlly', 'fianlsly', 'fiaplly', 'fianlkly', 'fianllq', 'fiaelly', 'fianlnly', 'bianlly', 'afianlly', 'faianlly', 'fsianlly', 'fianlhly']


In [16]:
def edits2(word):
    """
    Build the set of strings obtained by applying edits1 twice:
    every single-edit variant of the word is itself edited once more
    and all results are pooled into one set.
    """
    twice_edited = set()
    for once_edited in edits1(word):
        twice_edited.update(edits1(once_edited))
    return twice_edited

In [17]:
print(list(edits2(s))[:20])

['fganoly', 'faanllxy', 'fiabully', 'jfianzly', 'bfianlnly', 'fipynlly', 'fianljlyb', 'fihaznlly', 'ficanlyl', 'fianbllw', 'wianzlly', 'fzianylly', 'fisnllyb', 'foanlley', 'fiiagnlly', 'fianldls', 'fiarnlla', 'fiaynoly', 'fianmrlly', 'fianjnlly']


In [18]:
def known(words):
    """
    Filter an iterable of candidate words down to those that appear
    as keys in the WORD_COUNTS dictionary. The result is a set, so
    duplicates collapse and order is not preserved.
    """
    recognised = set()
    for candidate in words:
        if candidate in WORD_COUNTS:
            recognised.add(candidate)
    return recognised

In [19]:
known(edits0(s))

set()

In [20]:
known(edits1(s))

{'finally'}

In [21]:
known(edits2(s))

{'faintly', 'finally', 'finely', 'frankly'}

### Your Turn: Use all the above functions to return the best possible correct spelling for a word

In [22]:
def spell_check(word):
    """
    Pick the most frequent known word closest to the input word.

    Candidate sets are tried in order of edit distance (0, then 1,
    then 2); the first non-empty set of known words wins and its
    highest-count member in WORD_COUNTS is returned. If no known
    word is found within two edits, the input word is returned as is.
    """
    # Generators are tried lazily, so edits2 is only computed when the
    # cheaper distances produce no known word.
    for generate in (edits0, edits1, edits2):
        matches = known(generate(word))
        if matches:
            return max(matches, key=WORD_COUNTS.get)
    return word

In [26]:
s = 'fianlly'

In [27]:
spell_check(s)

'finally'

### Your Turn: Call the above function and correct a sentence

Hint: tokenize the sentence, call the function on each token, and join the corrected tokens back together to form a sentence.

In [28]:
s =  'hwat a graet praogram!'
s

'hwat a graet praogram!'

In [29]:
# Correct each whitespace-separated token and re-join with single spaces.
# NOTE: str.split() leaves punctuation attached to its token (e.g. 'praogram!'),
# and the recorded output below shows the trailing '!' is lost in the correction.
' '.join(spell_check(w) for w in s.split())

'what a great program'