In [3]:
#importing libraries

import re                       #for regular expression matching
from collections import Counter #for counting elements

In [4]:
'''
extract all the word from the corpus file 
and return as a list 
'''

def extract_words(text):
    return re.findall(r"[\w'-]+", text.lower())

In [5]:
#reading the corpus file and 
#count the occurrence of each word
#building the dictionary of words

WORDS_collections = Counter(extract_words(open('corpus 3.txt').read()))

In [6]:
'''
 showing the word collection and displaying the 
 the word and occurrrence as ket value pair
'''

for key, value in WORDS_collections.items():
    print(str(key) + ': '+ str(value))


the: 158806
projet: 111
gutenberg: 224
ebook: 175
of: 80233
golden: 34
bough: 6
third: 323
edition: 29
vol: 167
8: 113
2: 415
by: 12410
james: 119
george: 255
frazer: 3
this: 7952
is: 19529
for: 13672
use: 713
nyone: 91
anywhere: 67
at: 12289
no: 4634
cost: 109
and: 66665
with: 19461
almost: 557
restritions: 11
whatsoever: 15
you: 10655
may: 4451
copy: 73
it: 20358
give: 1039
away: 1411
or: 10697
re-use: 9
under: 1645
terms: 287
license: 68
included: 75
online: 31
t: 1328
http: 59
www: 41
org: 53
title: 79
golen: 18
12: 45
author: 49
jmes: 44
relese: 19
date: 83
march: 191
14: 56
2013: 1
42336: 1
lnguage: 43
english: 414
start: 113
project: 464
a: 38260
stuy: 59
in: 43614
magic: 16
religion: 70
d: 59
c: 203
l: 61
ll: 1075
litt: 1
fellow: 457
trinity: 7
college: 67
cmbridge: 5
professor: 17
soial: 35
anthropology: 1
university: 63
liverpool: 13
viii: 79
xii: 58
prt: 184
v: 103
spirits: 88
corn: 78
wild: 59
new: 2394
york: 389
nd: 8534
london: 135
mcmillan: 1
co: 9
912: 9
contents: 79
ch

In [7]:
#probability of occurance of a given word based on its frequency

def probability(word):

    #total number of occurance
    N = sum(WORDS_collections.values())
    
    #probability = number of occurrence / total number of occurrence
    return WORDS_collections[word]/N

In [8]:
'''
takes a list of word and return a set of words that appear
in the WORDS_collections dictionary
'''
# def words_in_dictionary(words):

    
#     return set(words) & WORDS_collections

def words_in_dictionary(words):

    #return the common words between the words and WORDS_collections
    return set(w for w in words if w in WORDS_collections)


In [9]:
'''
the follwoing function generates all possible correction_set that
are one correction away from given the wrong word
then it create all the correction 
''' 

def correction_1(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    correction_set = set()

    # peforming deletion
    deletes = [word[:i] + word[i+1:] for i in range(len(word))]
    correction_set.update(deletes)

    #performing transposition
    transposes = [word[:i] + word[i+1] + word[i] + word[i+2:] for i in range(len(word)-1)]
    correction_set.update(transposes)

    #performing Replacement
    replaces = [word[:i] + c + word[i+1:] for i in range(len(word)) for c in letters]
    correction_set.update(replaces)

    #performing Insertion
    inserts = [word[:i] + c + word[i:] for i in range(len(word)+1) for c in letters]
    correction_set.update(inserts)

    return correction_set

In [10]:
'''
the follwoing function generates all possible edits that
are two correction away from given the wrong word
then it create all the correction using correction_1()
'''

def correction_2(word): 
    "All edits that are two edits away from `word`."
    return (c2 for c1 in correction_1(word) for c2 in correction_1(c1))

In [11]:
'''
generate list of possible correction for given word
it use words_in_dictionary function to weather the word is already in the
dictionary otherwise it use correction_1, correction_2
'''

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (words_in_dictionary([word]) or words_in_dictionary(correction_1(word)) or words_in_dictionary(correction_2(word)) or [word])

In [12]:
'''
select the correction word with the highest probability
using probability()
'''

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=probability)

In [66]:
def text_pair(text):
    words = text.split()
    pair_of_text = []
    
    for word in words:
        if word.isalpha():
            correct_word = word
            pair_of_text.append((correct_word, word))
    
    return pair_of_text

# Generate test pairs for the error text
text_with_error = "Thiss is a sample text with some incorrect speling and gramatical mistakes. I am testing the spelling correction function. Hopfully, it will corect the errors and improve the accuracy of the text."
pair_of_text = text_pair(text_with_error)

total_tests = len(pair_of_text)
correct_count = 0
fail_count = 0

for correct_word, incorrect_word in pair_of_text:
    corrected_word = correction(incorrect_word)
    if corrected_word == correct_word:
        correct_count += 1
    else:
        fail_count += 1

print("Text with errors: ")
print(text_with_error)
print()

print("Corrected Text:")
corrected_text = ' '.join(correction(word) if word.isalpha() else word for word in text_with_error.split())
print(corrected_text.lower())



Text with errors: 
Thiss is a sample text with some incorrect speling and gramatical mistakes. I am testing the spelling correction function. Hopfully, it will corect the errors and improve the accuracy of the text.

Corrected Text:
hiss is a sample text with some incorrect speking and grmmatical mistakes. a am testing the spelling correction function. hopfully, it will correct the errors and improve the accuracy of the text.


In [65]:

accuracy = correct_count / total_tests
fail_rate = fail_count / total_tests

print("Experimental Results:")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Fail Rate: {fail_rate * 100:.2f}%")

Experimental Results:
Accuracy: 82.14%
Fail Rate: 17.86%
