In [1]:
import string
import collections
import nltk
from nltk.tokenize import word_tokenize

In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eghan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
def read_corpus(filename):
    with open(file=filename) as f:
        content = f.read()
    return content


In [19]:
def tokenize(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    return words

In [20]:
dataset_1 = tokenize(read_corpus('words_alpha.txt'))
dataset_2 = tokenize(read_corpus('The Complete Works of William Shakespeare.txt'))
dataset_3 = tokenize(read_corpus('Oxford English Dictionary.txt'))


In [21]:
print(f'Number of words in dataset 1 is {len(dataset_1)}')
print(f'Number of words in dataset 2 is {len(dataset_2)}')
print(f'Number of words in dataset 3 is {len(dataset_3)}')

Number of words in dataset 1 is 370110
Number of words in dataset 2 is 1243138
Number of words in dataset 3 is 1020139


In [23]:
full_data  = dataset_1 + dataset_2 + dataset_3


In [60]:
vocabs = set(full_data)

In [24]:
word_counts = collections.Counter(full_data)

In [28]:
total_word_count = float(sum(word_counts.values()))

In [32]:
word_probas = {word: word_counts[word]/total_word_count for word in word_counts.keys()}

In [42]:
def split(word):
    return [(word[:i], word[i:]) for i in range(len(word) + 1)]

In [44]:
def delete(word):
    return [left + right[1:] for left, right in split(word)]

In [52]:
def swap(word):
    return [left + right[1] + right[0] + right[2:] for left, right in split(word) if len(right)>1]

In [53]:
def replace(word):
    letters = string.ascii_lowercase
    return [left + c + right[1:] for left, right in split(word) if right for c in letters]

In [54]:
def insert(word):
    letters = string.ascii_lowercase
    return [left + c + right for left, right in split(word) for c in letters]

In [55]:
def level_one_edits(word):
    return set(delete(word) + swap(word) + replace(word) + insert(word))

In [57]:
def level_two_edits(word):
    return set(edit2 for edit1 in level_one_edits(word) for edit2 in level_one_edits(edit1))

In [66]:
def correct_spelling(word, vocab, word_proba):
    if word in vocab:
        return f'{word} is correctly spelt'
    
    suggestion = level_one_edits(word) or level_two_edits(word) or [word]
    best_guesses = [w for w in suggestion if w in vocab]
    return [(w, word_proba[w]) for w in best_guesses]


In [75]:
correct_spelling('mashine', vocab=vocabs, word_proba=word_probas)

[('ashine', 3.797390964563887e-07),
 ('machine', 6.645434187986801e-05),
 ('mashie', 3.797390964563887e-07),
 ('mashing', 3.797390964563887e-07)]

In [74]:
correct_spelling('aearning', vocab=vocabs, word_proba=word_probas)


[('earning', 7.594781929127774e-07),
 ('learning', 2.2024867594470544e-05),
 ('yearning', 1.8986954822819434e-06)]