<a href="https://colab.research.google.com/github/dinuka-kasun-medis/mycolab/blob/main/nlp_with_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import words
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Data preparation
nltk.download('words')
english_words = words.words()

def generate_misspelled_words(words_list, num_misspelled=1):
    misspelled_words = []
    for word in words_list:
        for _ in range(num_misspelled):
            random_index = np.random.randint(len(word))
            misspelled_word = word[:random_index] + word[random_index + 1:]
            misspelled_words.append((misspelled_word, word))
    return misspelled_words

misspelled_data = generate_misspelled_words(english_words, num_misspelled=1)
misspelled_words, correct_words = zip(*misspelled_data)

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(correct_words)

misspelled_sequences = tokenizer.texts_to_sequences(misspelled_words)
correct_sequences = tokenizer.texts_to_sequences(correct_words)

max_seq_length = max(max(len(seq) for seq in misspelled_sequences),
                    max(len(seq) for seq in correct_sequences))

misspelled_sequences = pad_sequences(misspelled_sequences, maxlen=max_seq_length, padding='post')
correct_sequences = pad_sequences(correct_sequences, maxlen=max_seq_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1



[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [21]:
# Verify data shapes and types
print("Shape of misspelled_sequences:", misspelled_sequences.shape)
print("Shape of correct_sequences:", correct_sequences.shape)
print("Data type of correct_sequences:", correct_sequences.dtype)

Shape of misspelled_sequences: (236736, 24)
Shape of correct_sequences: (236736, 24)
Data type of correct_sequences: int32


In [22]:
# Define the model
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, 16, input_length=max_seq_length),
#     tf.keras.layers.LSTM(64),
#     tf.keras.layers.Dense(vocab_size, activation='softmax')  # Output units equal to vocab_size
# ])

# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16, input_length=max_seq_length),
    tf.keras.layers.LSTM(64, return_sequences=True),  # Set return_sequences=True for TimeDistributed
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
# Convert the target labels to integer type (required for sparse_categorical_crossentropy)
correct_sequences = correct_sequences.astype(np.int32)


In [24]:
# Training the model
model.fit(misspelled_sequences, correct_sequences, epochs=1, batch_size=32, validation_split=0.2) #me line eka.



<keras.callbacks.History at 0x7e6d0c139ab0>

In [26]:
def preprocess_word(word, tokenizer, max_seq_length):
    # Tokenize the input word into character-level sequence
    word_sequence = tokenizer.texts_to_sequences([word])
    # Pad the sequence to match the model's input length
    padded_sequence = pad_sequences(word_sequence, maxlen=max_seq_length, padding='post')
    return padded_sequence

def sequences_to_text(sequence, tokenizer):
    # Convert the sequence back to the word
    word = tokenizer.sequences_to_texts([sequence])[0]
    return word

def spell_check(word, tokenizer, max_seq_length):
    # Preprocess the input word
    input_sequence = preprocess_word(word, tokenizer, max_seq_length)
    # Predict the corrected word using the model
    predicted_sequence = model.predict(input_sequence)
    # Convert the sequence back to the word
    corrected_word = sequences_to_text(predicted_sequence[0], tokenizer)
    return corrected_word

# Example usage
input_word = "bannna"
corrected_word = spell_check(input_word, tokenizer, max_seq_length)
print(f"Original word: {input_word}")
print(f"Corrected word: {corrected_word}")




TypeError: ignored