In [17]:
# This example shows a simple spell check based
# upon distance calculation.
#
# Author: Fabrício Galende M. de Carvalho, DSc
# Reference: Text analytics with python, Sarkar
#
# Author: Fabrício Galende Marques de Carvalho

# -*- coding: utf-8 -*-

import re
from nltk.corpus import mac_morpho


import re, collections

def tokens(text): 
    """
    Get all words from the corpus. 
    In this case, special characters such as $ that appear in
    CR$ will be discarded (entire token discarded).
    """
    return re.findall('[a-z]+', text.lower()) 

# attention: not all tokens that are joined shall
# be considered as correct words. This is a simple
# didactic example. 

def edits0(word): ## Sarkar
    """
    Return all strings that are zero edits away 
    from the input word (i.e., the word itself).
    """
    return {word}



def edits1(word): ## Sarkar
    """
    Return all strings that are one edit away 
    from the input word.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    def splits(word):
        """
        Return a list of all possible (first, rest) pairs 
        that the input word is made of.
        """
        return [(word[:i], word[i:]) 
                for i in range(len(word)+1)]
                
    pairs      = splits(word)
    deletes    = [a+b[1:]           for (a, b) in pairs if b]
    transposes = [a+b[1]+b[0]+b[2:] for (a, b) in pairs if len(b) > 1]
    replaces   = [a+c+b[1:]         for (a, b) in pairs for c in alphabet if b]
    inserts    = [a+c+b             for (a, b) in pairs for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

def known(words): 
    """
    Return the subset of words that are actually 
    in our WORD_COUNTS dictionary.
    """
    return {w for w in words if w in WORD_COUNTS}

def correct(word): 
    """
    Get the best correct spelling for the input word
    """
    # Priority is for edit distance 0, then 1, then 2
    # else defaults to the input word itself.
    candidates = (known(edits0(word)) or 
                  known(edits1(word)) or                   
                  [word])
    return max(candidates, key=WORD_COUNTS.get) # returns the most frequent word

misspelled_word = 'amenisar'

WORDS = tokens(' '.join(mac_morpho.words())) #builds a word token list from the corpus
WORD_COUNTS = collections.Counter(WORDS) # counts the frequency of each word token

# top 20 words in corpus
print("Most common word tokens: ", WORD_COUNTS.most_common(20))

print("\nMisspelled word: ", misspelled_word)
print("\nMac-Morpho word tokens after excluding non-word strings: ", WORDS[0:30])

# verifying the 1-edit distance word set:
print("\n\nGenerating 1-edit distance words from the original misspelled word: ")
print("Words that are 1 edit distance away from the original word: ", list(edits1(misspelled_word))[0:30])
print("-------------------")

print("Verifying which belong to text corpus: ")
print(known(edits1(misspelled_word)))

print("\nSelecting one which is most likely right:")

print(correct(misspelled_word))


Most common word tokens:  [('o', 100282), ('de', 88758), ('a', 73378), ('em', 34515), ('e', 22927), ('que', 20672), ('os', 18284), ('as', 11857), ('s', 11485), ('para', 11191), ('por', 10558), ('um', 9257), ('com', 9159), ('n', 8664), ('m', 7477), ('uma', 7309), ('se', 7015), ('es', 6930), ('mais', 4565), ('rio', 3960)]

Misspelled word:  amenisar

Mac-Morpho word tokens after excluding non-word strings:  ['jersei', 'atinge', 'm', 'dia', 'de', 'cr', 'milh', 'o', 'em', 'a', 'venda', 'de', 'a', 'pinhal', 'em', 's', 'o', 'paulo', 'programe', 'sua', 'viagem', 'a', 'a', 'exposi', 'o', 'nacional', 'do', 'zebu', 'que', 'come']


Generating 1-edit distance words from the original misspelled word: 
Words that are 1 edit distance away from the original word:  ['vmenisar', 'amenisao', 'dmenisar', 'imenisar', 'amsnisar', 'ameneisar', 'kmenisar', 'amenvisar', 'amenisaa', 'amtenisar', 'amenxisar', 'ahmenisar', 'ramenisar', 'vamenisar', 'hamenisar', 'amenisag', 'amonisar', 'amenisajr', 'amenosar', 'a