In [None]:
#importing libraries
import io                       #for input output
import re                       #for regular expression matching
from collections import Counter #for counting elements
import nltk                     #for tokenization
from nltk.tokenize import word_tokenize

In [None]:
# Download the tokenizer models that word_tokenize depends on.
# 'punkt' is the resource used by older NLTK releases; newer releases look up
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
def read_corpus_file(file_path):
    """Return the full contents of the text file at ``file_path`` as one string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with io.open(file_path, mode='r', encoding='utf-8') as corpus_handle:
        return corpus_handle.read()

In [None]:
def get_corpus_words(plain_text):
    """Extract the words of ``plain_text`` and return them as a list.

    The text is split into rough sentences on '.', forward slashes are treated
    as word separators, and each sentence is lower-cased and tokenized with
    ``word_tokenize``. Every character other than ASCII letters, hyphens,
    apostrophes and whitespace is then removed from each token.

    Args:
        plain_text: the raw corpus text.

    Returns:
        A list of cleaned, lower-cased tokens in order of appearance. Tokens
        that contain no letter after cleaning (for example "''" or "-") are
        dropped, so punctuation never ends up in the vocabulary.
    """
    # Raw string: '\-' and '\s' are invalid escape sequences in a normal string
    # literal. Compiled once here instead of being re-parsed for every token.
    non_word_chars = re.compile(r"[^a-zA-Z\-'\s]")
    corpus_words = []

    for sentence in plain_text.split('.'):
        # Treat "/" as a word boundary ("and/or" -> "and or") and collapse
        # whitespace runs before handing the sentence to the tokenizer.
        sentence_text = ' '.join(sentence.replace('/', ' ').split())

        for token in word_tokenize(sentence_text.lower()):
            cleaned = non_word_chars.sub('', token)
            # Only letters, hyphens, apostrophes and whitespace can remain
            # here, so requiring one alphabetic character filters out both
            # empty strings and punctuation-only leftovers.
            if any(ch.isalpha() for ch in cleaned):
                corpus_words.append(cleaned)

    return corpus_words


In [None]:
# Load the raw corpus text from disk into a single string
file_path = "corpus 3.txt"
plain_text = read_corpus_file(file_path)

# Sanity check: total number of characters, then a preview of the first 100
print(len(plain_text))
plain_text[:100]

In [None]:
# Extract all the words from the plain-text corpus as one flat list
list_of_words = get_corpus_words(plain_text)

# Sanity check: number of tokens, then the first and the last 20 of them
print(len(list_of_words))
print(list_of_words[:20])
print(list_of_words[-20:])

In [None]:
# Count the occurrences of each word; this Counter is the word dictionary
# that the spell checker is built from.
WORDS_collections = Counter(list_of_words)

# Preview only the most frequent entries rather than dumping the whole Counter
WORDS_collections.most_common(20)

In [None]:
class SpellChecker:
    """Frequency-based spelling corrector.

    Holds the relative frequency of every known word and provides probability
    lookup, dictionary filtering, generation of one- and two-edit variants,
    candidate generation and correction of single words or whole sentences.
    """

    def __init__(self, term_freq):
        """Build the word -> relative-frequency table.

        Args:
            term_freq: mapping of word -> occurrence count (e.g. a ``Counter``).
        """
        # Alphabet used when generating replacement and insertion edits.
        self.letters = 'abcdefghijklmnopqrstuvwxyz'

        total = sum(term_freq.values())
        # Each word's share of all occurrences; an empty mapping gives an
        # empty table (no division takes place).
        self.w_rank = {term: count / total for term, count in term_freq.items()}

    def probability(self, word):
        """Return the relative frequency of ``word``, or 0 if it is unknown."""
        return self.w_rank.get(word, 0)

    def words_in_dictionary(self, words):
        """Return the set of those ``words`` that appear in the dictionary.

        ``words`` may be any iterable of strings, including a generator.
        """
        return set(w for w in words if w in self.w_rank)

    def correction_1(self, word):
        """Return the set of all strings exactly one edit away from ``word``.

        An edit is a single-character deletion, a transposition of two adjacent
        characters, a replacement by a letter of ``self.letters`` or an
        insertion of such a letter.
        """
        # Every way of cutting the word in two, including the empty halves.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

        # Deletion: drop the first character of the right half.
        deletes = [head + tail[1:] for head, tail in splits if tail]

        # Transposition: swap the first two characters of the right half.
        transposes = [head + tail[1] + tail[0] + tail[2:]
                      for head, tail in splits if len(tail) > 1]

        # Replacement: substitute the first character of the right half.
        replaces = [head + c + tail[1:]
                    for head, tail in splits if tail for c in self.letters]

        # Insertion: add a letter between the two halves.
        inserts = [head + c + tail for head, tail in splits for c in self.letters]

        return set(deletes + transposes + replaces + inserts)

    def correction_2(self, word):
        """Lazily yield every string two edits away from ``word``.

        Applies ``correction_1`` to each result of ``correction_1(word)``. The
        result is a generator and may yield the same string more than once.
        """
        return (e2 for e1 in self.correction_1(word) for e2 in self.correction_1(e1))

    def correction(self, word):
        """Return the most probable spelling correction for ``word``.

        Candidates are sorted before the maximum is taken so that ties in
        probability always resolve to the alphabetically first candidate
        instead of depending on set iteration order, which can differ between
        interpreter runs.
        """
        return max(sorted(self.candidates(word)), key=self.probability)

    def candidates(self, word):
        """Return the possible spelling corrections for ``word``.

        The first non-empty group wins, in this order: the word itself if it is
        known, known words one edit away, known words two edits away, and
        finally a one-element list holding the unchanged word.
        """
        return (self.words_in_dictionary([word])
                or self.words_in_dictionary(self.correction_1(word))
                or self.words_in_dictionary(self.correction_2(word))
                or [word])

    @staticmethod
    def _split_token(token):
        """Split ``token`` into (leading punctuation, core, trailing punctuation).

        Characters that are neither alphanumeric nor an apostrophe or hyphen are
        treated as punctuation when they occur at either end of the token.
        """
        def is_word_char(ch):
            return ch.isalnum() or ch in "'-"

        start, end = 0, len(token)
        while start < end and not is_word_char(token[start]):
            start += 1
        while end > start and not is_word_char(token[end - 1]):
            end -= 1
        return token[:start], token[start:end], token[end:]

    def correct_sentence(self, sentence):
        """Correct the misspelled words of ``sentence``.

        The sentence is split on whitespace and lower-cased. Punctuation
        attached to either end of a token is set aside before the lookup and
        re-attached afterwards, so it neither consumes one of the two available
        edits nor gets lost. Tokens without any letter (numbers, lone
        punctuation) are passed through unchanged.

        Returns:
            The corrected tokens joined by single spaces.
        """
        corrected_tokens = []
        for token in sentence.split():
            prefix, core, suffix = self._split_token(token.lower())
            if any(ch.isalpha() for ch in core):
                core = self.correction(core)
            corrected_tokens.append(prefix + core + suffix)
        return ' '.join(corrected_tokens)

    def count_changed_words(self, corrected_words, error_words):
        """Count the items of ``corrected_words`` that are not in ``error_words``."""
        return sum(1 for corrected_word in corrected_words
                   if corrected_word not in error_words)

In [None]:
# Create the spell-checker model from the corpus word counts
sp_model = SpellChecker(WORDS_collections)

In [None]:
# Relative frequency of the word 'the' in the corpus (0 if the word is absent)
sp_model.probability('the')

In [None]:
# Raw counts of two words. Only the last expression of a cell is displayed,
# so both lookups are returned together as a tuple. A Counter yields 0 for a
# key it has never seen.
WORDS_collections['denied'], WORDS_collections['Thiss']

In [None]:
# Get probability of the word 'denied'
sp_model.probability('denied')

In [None]:
# Keep only those of the given words that exist in the model's dictionary
sp_model.words_in_dictionary(['the', 'unmentioned'])

In [None]:

# Most probable correction for the input 'siter'
sp_model.correction('siter')

In [None]:
# Candidate corrections for 'wlak': known words within one or two edits,
# or the word itself when none is found
sp_model.candidates('wlak')

In [None]:
# Run the corrector over a sample text that contains several misspellings
sp_model.correct_sentence('Thiss is a samplee textt with some incorect speling and gramatical mistkes. I am testiing the spelling corction function. Hopfully, it will corect the errrors and imrove the accuracy of the texxt.')

##### Before going to the accuracy part, some clarification is needed. Calculating the accuracy requires a ground truth: without one, I cannot judge whether a corrected word is right. I also cannot compare the corrected words against the corpus file, because the corrected words are themselves taken from the corpus file; doing so would always give an accuracy of 100%. So, to calculate the accuracy, I took a misspelled sentence and a corrected sentence, which is the fully corrected version of the misspelled sentence.

##### So, to calculate the accuracy, I used another English dictionary as the ground truth (the link is given below). For each corrected word, I check whether it exists in the ground truth: if it does, the word counts as successfully corrected; otherwise it does not.

English dictionary used as the ground truth: https://www-personal.umich.edu/~jlawler/wordlist
I renamed the downloaded file to `ground_truth_from_online.txt`.

In [None]:
# Read the ground-truth file (the external English word list) into a string
ground_truth_file_path = "ground_truth_from_online.txt"
ground_truth_text = read_corpus_file(ground_truth_file_path)


In [None]:
# Lower-case, tokenize and de-duplicate the ground-truth text. The result is
# sorted because binary_search needs an ascending list.
ground_truth_words = sorted(set(word_tokenize(ground_truth_text.lower())))
print(len(ground_truth_words))


In [None]:
def binary_search(word, ground_truth_words):
    """Report whether ``word`` occurs in the list ``ground_truth_words``.

    The list must be sorted in ascending order; the search repeatedly halves
    the index range that can still contain ``word``.

    Returns:
        True if an element equal to ``word`` is found, False otherwise.
    """
    left, right = 0, len(ground_truth_words) - 1

    while left <= right:
        middle = (left + right) // 2
        candidate = ground_truth_words[middle]

        if candidate == word:
            return True
        if candidate < word:
            # The target can only lie to the right of the probe.
            left = middle + 1
            continue
        # Otherwise continue in the left half.
        right = middle - 1

    return False


In [None]:
# Checking binary search on the ground-truth word list
binary_search('hello', ground_truth_words)

In [None]:
# Read the file that contains the misspellings
error_file_path = "error_file.txt"
error_file = read_corpus_file(error_file_path)

# Tokenize it; every token that is missing from the corpus vocabulary counts as
# an error word. A set gives constant-time lookups and leaves the order of
# list_of_words untouched, so earlier cells show the same output when re-run.
error_words = get_corpus_words(error_file)
known_words = set(list_of_words)

unknown_words = [word for word in error_words if word not in known_words]
total_error_count = len(unknown_words)
error_words_string = " ".join(unknown_words)

print("Number of error words:", total_error_count)

# Correct the full text and, separately, just the error words
corrected_text = sp_model.correct_sentence(error_file)
error_string_c = sp_model.correct_sentence(error_words_string)

# Save the corrected text; written as UTF-8 to match how the input was read
corrected_file_path = "corrected_file.txt"
with open(corrected_file_path, "w", encoding="utf-8") as corrected_file:
    corrected_file.write(corrected_text)

# correct_sentence joins its output with single spaces, so split() yields
# exactly one token per error word. A tokenizer could break one corrected word
# into several tokens and push the count above the number of error words.
error_string_c_tokenize = error_string_c.split()

# Accuracy: share of error words whose correction exists in the ground truth
correct_count = sum(
    1 for word in error_string_c_tokenize if binary_search(word, ground_truth_words)
)
# With no error words there is nothing to score; report 0.0 instead of
# raising ZeroDivisionError.
accuracy = (correct_count / total_error_count) * 100 if total_error_count else 0.0

print("Error text: \n" + str(error_file))
print()
print("Corrected text: \n" + str(corrected_text))
print()
print('Accuracy = ' + str(accuracy) + '%')
