# Build a NLP Language model to detect the sentence/word error in the text corpus.

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import words

In [8]:
# Download the words corpus (if not already downloaded)
nltk.download('words')

# Sample text with intentional errors
text = input("Enter the text :")

# Tokenize the text into words
tokens = word_tokenize(text)

# Get the set of English words from the nltk corpus
english_vocab = set(words.words())

# Check for misspelled words
misspelled_words = [word for word in tokens if word.lower() not in english_vocab]

print()

# Print misspelled words
if len(misspelled_words) > 0:
    print("Misspelled Words:")
    print(misspelled_words)
else:
    print("No misspelled words found.")


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\fahee\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Enter the text :I woke up this morning feeling an overwhelming sense of joy. The sun streamed through the window, casting a warm glow that filled me with happiness. As I stepped outside, a gentle breeze brushed against my skin, evoking a sense of calm and contentment. However, as the day progressed, a wave of nostalgia washed over me, reminding me of cherished memories from the past. I found myself smiling at old photographs, feeling a mix of joy and longing.  Suddenly, a pang of sadness hit me as I remembered missed opportunities and lost connections. Yet, hope flickered within me, like a small flame refusing to be extinguished. Determination surged through my veins, propelling me forward despite the obstacles. Later, an unexpected surprise lifted my spirits, filling me with excitement and anticipation for what lay ahead. Ultimately, today has been a whirlwind of emotions, each one leaving its mark on my heart.

Misspelled Words:
['.', 'streamed', ',', '.', ',', ',', 'evoking', '.', '

# Build a Language model to correct the error in the tex

In [12]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [16]:
# Sample data - input sentences and their corrected versions
sentences = [
    "I is going to the park.",
    "He have a blue car.",
    "She do not like ice cream."
]

corrected_sentences = [
    "I am going to the park.",
    "He has a blue car.",
    "She does not like ice cream."
]

# Tokenizing the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences + corrected_sentences)

sequences = tokenizer.texts_to_sequences(sentences)
corrected_sequences = tokenizer.texts_to_sequences(corrected_sentences)

vocab_size = len(tokenizer.word_index) + 1
max_len = max(max(len(seq) for seq in sequences), max(len(seq) for seq in corrected_sequences))

# Padding sequences to have uniform length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
padded_corrected_sequences = pad_sequences(corrected_sequences, maxlen=max_len, padding='post')

# Model architecture
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(padded_sequences, np.expand_dims(padded_corrected_sequences, -1), epochs=10, batch_size=32)

# Now, you can use this trained model to correct sentences by predicting the corrected sequence.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1ce36e183d0>