In [1]:
import nltk
import string
from nltk.corpus import words
from nltk.metrics import edit_distance

nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Get a list of valid words and normalize casing
valid_words = set(words.words())
valid_words_lower = {word.lower() for word in valid_words}

# Step 2: Look at the first 20 words in the list
first_20_words = list(valid_words_lower)[:20]
print("First 20 words in the normalized list:", first_20_words)

# Step 3: Create a list of stop words including NLTK stopwords and punctuation
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(set(string.punctuation))

# Step 4: Define a function to correct a single term
def get_correct_term(term):
    if term.lower() in valid_words_lower:
        return term
    else:
        distances = {word: edit_distance(term, word) for word in list(valid_words_lower)[:20000]}
        sorted_distances = sorted(distances, key=distances.get)
        return sorted_distances[0]

# Step 8: Define a function for spelling correction in any given input sentence
def correct_spelling(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    corrected_sentence = [get_correct_term(token) if token not in stop_words else token for token in tokens]
    return ' '.join(corrected_sentence)

# Testing the function
input_sentence = "The new abacos is great"
corrected_sentence = correct_spelling(input_sentence)
print("Corrected sentence:", corrected_sentence)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\phani\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phani\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


First 20 words in the normalized list: ['foregate', 'ahimsa', 'unshapen', 'gamophagy', 'dissent', 'autocorrosion', 'plastrum', 'playbill', 'picus', 'featherway', 'presumptious', 'newsboy', 'khaiki', 'kha', 'brachydodrome', 'odontomous', 'bradyphemia', 'facient', 'unrevertible', 'combing']
Corrected sentence: the new abaton is great
