In [1]:
import re
import string
from collections import Counter
import numpy as np

In [2]:
# Build one flat list holding every word token found in the given text files.
def read_corpus(filenames, encoding=None):
    """Tokenize one or more text files into a flat list of lowercase words.

    Args:
        filenames: An iterable of file paths, or a single path given as
            ``str``, ``bytes`` or ``os.PathLike``. A single path is wrapped
            in a list instead of being iterated element by element.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        list: every maximal run of regex word characters, lowercased, in
        file order and then line order. Empty when no files are given.
    """
    import os  # local import: only needed for the single-path check below

    if isinstance(filenames, (str, bytes, os.PathLike)):
        filenames = [filenames]

    words = []
    for filename in filenames:
        with open(filename, "r", encoding=encoding) as file:
            # Stream the file line by line rather than loading every line
            # up front with readlines().
            for line in file:
                words.extend(re.findall(r'\w+', line.lower()))
    return words

In [3]:
# Tokenize the three book files into a single flat list of words.
words = read_corpus(
    [
        "../input/game-of-thrones-books/001ssb.txt",
        "../input/game-of-thrones-books/002ssb.txt",
        "../input/game-of-thrones-books/003ssb.txt",
    ]
)
print(f"There are {len(words)} total words in the corpus")

In [4]:
# The vocabulary is the set of distinct tokens; it is used for membership
# tests when deciding whether a candidate string is a known word.
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

In [5]:
# Count how many times each token appears in the corpus.
word_counts = Counter(words)

# Spot-check a few words (a Counter reports 0 for a word it never saw).
for sample_word in ("love", "winter", "dark"):
    print(f"the occurance of the word {sample_word} in the text is {word_counts[sample_word]}")

In [6]:
# Number of distinct words counted; matches len(vocabs) since both are built from `words`.
len(word_counts)

In [7]:
# Give every word a probability equal to its relative frequency in the corpus
# (its count divided by the total number of tokens). When several candidate
# corrections exist, the one with the highest probability is preferred.
total_word_count = float(len(words))
print(total_word_count)
word_probas = {
    word: count / total_word_count
    for word, count in word_counts.items()
}

In [8]:
# Spot-check the estimated probability of a few words.
for sample_word in ("love", "winter", "dark"):
    print(word_probas[sample_word])

In [9]:
# Enumerate every way of cutting a word into a (left, right) pair; the edit
# operations build their candidate words from these pairs.
def split(word):
    """Return all (prefix, suffix) pairs of ``word``, empty ends included.

    A word of length n yields n + 1 pairs, from ("", word) to (word, "").
    """
    pairs = []
    for cut in range(len(word) + 1):
        pairs.append((word[:cut], word[cut:]))
    return pairs
print(split("winter"))

In [10]:
# Remove exactly one character from the word, once for each position.
def delete(word):
    """Return every string obtained by dropping a single character of ``word``.

    The result is a list with one entry per character position, in order.
    """
    shortened = []
    for left, right in split(word):
        if not right:
            # The last split has an empty right half: nothing left to drop.
            continue
        shortened.append(left + right[1:])
    return shortened
print(delete("winter"))

In [11]:
# Transpose two neighbouring characters, once for each adjacent pair in the word.
def swap(word):
    """Return every string obtained by exchanging two adjacent characters of ``word``.

    The result is a list with one entry per adjacent pair, in order; words
    shorter than two characters give an empty list.
    """
    transposed = []
    for left, right in split(word):
        if len(right) > 1:
            transposed.append(left + right[1] + right[0] + right[2:])
    return transposed
print(swap("winter"))

In [12]:
# The lowercase ASCII letters a-z: the alphabet used by the replace and insert edits.
string.ascii_lowercase

In [13]:
# Overwrite the character at each position with every letter of the alphabet.
def replace(word):
    """Return every string obtained by substituting one character of ``word``.

    Each position is tried with all lowercase ASCII letters, so the list has
    26 * len(word) entries. ``word`` itself appears once for every position
    that already holds a lowercase ASCII letter.
    """
    letters = string.ascii_lowercase
    substituted = []
    for left, right in split(word):
        if right:
            for letter in letters:
                substituted.append(left + letter + right[1:])
    return substituted
print(replace("winter"))

In [14]:
# Add one extra letter at each possible position, including both ends.
def insert(word):
    """Return every string obtained by inserting one lowercase letter into ``word``.

    Every gap (before, between and after the characters) is tried with all
    lowercase ASCII letters, giving 26 * (len(word) + 1) entries.
    """
    letters = string.ascii_lowercase
    lengthened = []
    for left, right in split(word):
        lengthened.extend(left + letter + right for letter in letters)
    return lengthened
print(insert("winter"))

In [15]:
# Combine the four operations to get every string one edit away from the word.
def edit1(word):
    """Return the set of strings produced by one delete, swap, replace or insert.

    Duplicates produced by different operations collapse because the result
    is a set.
    """
    candidates = set()
    for operation in (delete, swap, replace, insert):
        candidates.update(operation(word))
    return candidates
print(len(edit1("winter")))
print(edit1("winter"))

In [16]:
# Apply the single-edit step twice to reach strings that are two edits away.
def edit2(word):
    """Return the set of strings obtained by applying ``edit1`` to every ``edit1`` result."""
    reachable = set()
    for once_edited in edit1(word):
        for twice_edited in edit1(once_edited):
            reachable.add(twice_edited)
    return reachable
print(len(edit2("winter")))

In [22]:
# Look up a possibly misspelt word: if it is not in the vocabulary, generate
# edited variants of it and keep the ones that are real words, together with
# their corpus probabilities.
def correct_spelling(word, vocabulary, word_probabilities,nbr_of_edits):
    """Suggest known words that are close to ``word``.

    Args:
        word: The word to check.
        vocabulary: Container of known words (membership is tested with ``in``).
        word_probabilities: Mapping from known word to its probability; it
            must have an entry for every suggestion found in ``vocabulary``.
        nbr_of_edits: ``1`` searches the ``edit1`` candidates; any other
            value searches the ``edit2`` candidates.

    Returns:
        ``None`` (after printing a message) when ``word`` is already in the
        vocabulary. Otherwise a list of ``(candidate, probability)`` tuples,
        most probable first with ties broken alphabetically; the list is
        empty when no candidate is a known word.

    Raises:
        KeyError: if a matching candidate is missing from ``word_probabilities``.
    """
    if word in vocabulary:
        print(f"{word} is already correctly spelt")
        return None

    suggestions = edit1(word) if nbr_of_edits == 1 else edit2(word)
    scored = [(w, word_probabilities[w]) for w in suggestions if w in vocabulary]
    # `suggestions` is a set, whose iteration order is not stable across
    # interpreter runs. Sorting makes the result reproducible, so callers that
    # pick the first maximum always resolve ties the same way.
    scored.sort(key=lambda guess: (-guess[1], guess[0]))
    return scored

In [23]:
# Try the corrector on one misspelt word, searching the two-edit candidates.
word = "wintter"
corrections = correct_spelling(word, vocabs, word_probas, 2)

# Nothing to report when the word is already known (None) or no candidate matched ([]).
if corrections:
    print(corrections)
    probs = np.array([prob for _, prob in corrections])
    best_ix = probs.argmax()
    correct = corrections[best_ix][0]
    print(f"{correct} is suggested for {word}")

In [27]:
# Run a batch of misspelt words through the two-edit search and print the
# most probable suggestion for each one.
test_words = ["happi","understnad","magik","darc","smiile"]
for word in test_words:
    corrections = correct_spelling(word, vocabs, word_probas, 2)
    if not corrections:
        # Known word (None) or no matching candidate ([]): nothing to print.
        continue
    probs = np.array([prob for _, prob in corrections])
    best_ix = probs.argmax()
    correct = corrections[best_ix][0]
    print(f"{correct} is suggested for {word}")

In [28]:
# Two edits are not always the best choice: rerun the same words with the
# one-edit search to compare the suggestions.
test_words = ["happi","understnad","magik","darc","smiile"]
for word in test_words:
    corrections = correct_spelling(word, vocabs, word_probas, 1)
    if not corrections:
        # Known word (None) or no matching candidate ([]): nothing to print.
        continue
    probs = np.array([prob for _, prob in corrections])
    best_ix = probs.argmax()
    correct = corrections[best_ix][0]
    print(f"{correct} is suggested for {word}")