<a href="https://colab.research.google.com/github/deemalvidarshana/Sinhala-spell-checker/blob/main/spellchecker%20website/Sinhala_Spelling_Checker_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write their
# files under /content/drive/MyDrive/AI.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# train_model.py
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pickle
import json
import os

# Define paths
# Every artifact of this notebook lives under one Google Drive folder.
DRIVE_BASE_PATH = '/content/drive/MyDrive/AI'
DICT_PATH = os.path.join(DRIVE_BASE_PATH, 'Sinhala_Dictionary.text')  # word list, read as whitespace-separated tokens
MODEL_PATH = os.path.join(DRIVE_BASE_PATH, 'sinhala_spell_model.h5')  # where the trained model is saved
TOKENIZER_PATH = os.path.join(DRIVE_BASE_PATH, 'tokenizer.pickle')  # pickled character-level tokenizer
PARAMS_PATH = os.path.join(DRIVE_BASE_PATH, 'params.json')  # JSON holding max_len and vocab_size

# Optimized parameters
max_len = 12  # Reduced from 15; training prefixes are padded to this length, longer ones are skipped
embedding_dim = 32  # Increased from 16; output dimension of the Embedding layer
lstm_units = 64  # Increased from 16; units of the first LSTM layer (the second uses half)
batch_size = 256  # Increased from 128; batch size passed to model.fit
epochs = 5  # Increased from 3; maximum number of epochs passed to model.fit

# Mount Google Drive when running inside Colab; anywhere else the configured
# paths are assumed to exist on the local filesystem.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside a Colab runtime. Only this case is
    # treated as "not in Colab"; a failed mount is no longer silently reported
    # as a local run and instead surfaces its own error.
    print("Not running in Colab, assuming local paths")
else:
    drive.mount('/content/drive')

# Read the dictionary file, split it on whitespace and de-duplicate the words.
print(f"Loading dictionary from {DICT_PATH}...")
with open(DICT_PATH, encoding='utf-8') as dict_file:
    raw_text = dict_file.read()
dictionary = set(raw_text.split())
print(f"Loaded {len(dictionary)} words")

# Fit a character-level tokenizer on all dictionary words joined by spaces.
print("Creating tokenizer...")
tokenizer = Tokenizer(char_level=True)
all_text = ' '.join(dictionary)
tokenizer.fit_on_texts([all_text])

# Pickle the fitted tokenizer so the inference cell can reload it.
print("Saving tokenizer...")
with open(TOKENIZER_PATH, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Store the values the inference cell needs to rebuild its inputs.
# vocab_size is the number of known characters plus one.
params = dict(
    max_len=max_len,
    vocab_size=len(tokenizer.word_index) + 1,
)
with open(PARAMS_PATH, 'w') as f:
    f.write(json.dumps(params))

def create_optimized_model(vocab_size):
    """Build and compile the next-character prediction network.

    The layers are, in order: an embedding, an LSTM returning the full
    sequence, dropout (0.2), a second LSTM with half as many units, dropout
    (0.2), a ReLU dense layer of ``vocab_size // 2`` units and a softmax
    output layer of ``vocab_size`` units.

    Reads the module-level ``embedding_dim``, ``lstm_units`` and ``max_len``.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.

    Returns:
        The model, compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    layer_stack = (
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        LSTM(lstm_units, return_sequences=True),
        Dropout(0.2),
        LSTM(lstm_units//2),
        Dropout(0.2),
        Dense(vocab_size//2, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Build the training pairs: every proper prefix of a word is an input and the
# character that follows it is the target.
print("Preparing training data...")
X, y = [], []

for word in dictionary:
    # Words shorter than 3 or longer than 20 characters are not used.
    if not 3 <= len(word) <= 20:
        continue

    for i in range(1, len(word)):
        seq, next_char = word[:i], word[i]

        seq_num = tokenizer.texts_to_sequences([seq])[0]
        next_char_num = tokenizer.texts_to_sequences([[next_char]])[0][0]

        # Prefixes longer than max_len are dropped instead of being truncated.
        if len(seq_num) > max_len:
            continue

        seq_num = pad_sequences([seq_num], maxlen=max_len)[0]
        X.append(seq_num)
        y.append(next_char_num)

X = np.array(X)
y = np.array(y)

# Callback watching validation accuracy with a patience of 2 epochs and
# restoration of the best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

# Build the network for the tokenizer's vocabulary (known characters + 1).
print("Training model...")
vocab_size = len(tokenizer.word_index) + 1
model = create_optimized_model(vocab_size)

# First 90% of the samples are used for training, the remainder for validation.
split_idx = int(len(X) * 0.9)
X_train, y_train = X[:split_idx], y[:split_idx]
X_val, y_val = X[split_idx:], y[split_idx:]

history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Write the trained network to disk.
print(f"Saving model to {MODEL_PATH}...")
model.save(MODEL_PATH)

# Report the accuracy values recorded for the last epoch that ran.
metrics_by_epoch = history.history
final_acc = metrics_by_epoch['accuracy'][-1]
final_val_acc = metrics_by_epoch['val_accuracy'][-1]
print(f"\nFinal Training Accuracy: {final_acc:.4f}")
print(f"Final Validation Accuracy: {final_val_acc:.4f}")

# List every file this cell has written.
print("\nTraining complete! Files saved:")
print(f"1. Model: {MODEL_PATH}")
print(f"2. Tokenizer: {TOKENIZER_PATH}")
print(f"3. Parameters: {PARAMS_PATH}")

In [None]:
!pip install flask flask-cors

In [None]:
import numpy as np
import pickle
import json
import os
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import ipywidgets as widgets
from IPython.display import display, HTML

# Locations of the saved model, tokenizer and parameter files.
DRIVE_BASE_PATH = '/content/drive/MyDrive/AI'
MODEL_PATH = os.path.join(DRIVE_BASE_PATH, 'sinhala_spell_model.h5')
TOKENIZER_PATH = os.path.join(DRIVE_BASE_PATH, 'tokenizer.pickle')
PARAMS_PATH = os.path.join(DRIVE_BASE_PATH, 'params.json')

# Restore the pickled tokenizer.
# NOTE: unpickling executes code from the file; only load a pickle you created.
with open(TOKENIZER_PATH, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Restore the stored sequence length and vocabulary size.
with open(PARAMS_PATH, 'r') as f:
    params = json.load(f)
max_len, vocab_size = params['max_len'], params['vocab_size']

# Restore the trained network.
model = load_model(MODEL_PATH)

# Define functions
def predict_next_char(seq, top_k=5):
    """Return the most likely next characters for a character prefix.

    The prefix is encoded with the module-level ``tokenizer``, padded to
    ``max_len`` and passed to the module-level ``model``.

    Args:
        seq: Prefix string whose next character should be predicted.
        top_k: Maximum number of candidates to return. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A list of ``(character, probability)`` tuples ordered from most to
        least likely. Output indices that have no entry in
        ``tokenizer.index_word`` are skipped, so the list can be shorter than
        ``top_k`` when fewer characters are known.
    """
    seq_num = tokenizer.texts_to_sequences([seq])[0]
    seq_num = pad_sequences([seq_num], maxlen=max_len)
    pred = model.predict(seq_num, verbose=0)
    probabilities = pred[0]
    sorted_indices = np.argsort(probabilities)[::-1]  # Highest probability first

    top_predictions = []
    for idx in sorted_indices:
        # The output layer has one more unit than there are characters: index 0
        # is never assigned to a character, so a direct index_word[idx] lookup
        # raised KeyError whenever that unit ranked among the top candidates.
        char = tokenizer.index_word.get(int(idx))
        if char is None:
            continue
        top_predictions.append((char, probabilities[idx]))
        if len(top_predictions) >= top_k:
            break
    return top_predictions

def correct_sentence(input_text):
    """Return ``input_text`` with every word checked character by character.

    The text is split on whitespace. For each word, the character at every
    position ``i >= 1`` is compared with the most likely next character that
    ``predict_next_char`` returns for the prefix ``word[:i]``; on a mismatch
    that one character is replaced by the prediction. The first character of a
    word is never changed and the word keeps its length.

    Previously each mismatch overwrote the whole result with "prefix +
    predicted character", which cut off the rest of the word and discarded
    corrections made at earlier positions.

    Args:
        input_text: Text to check.

    Returns:
        The checked words joined by single spaces (the original whitespace
        layout is not preserved).
    """
    corrected_words = []

    for word in input_text.split():
        chars = list(word)
        for i in range(1, len(word)):
            # Predictions are conditioned on the prefix as the user typed it.
            top_predictions = predict_next_char(word[:i])
            if not top_predictions:
                continue
            next_char = top_predictions[0][0]  # Most likely next character

            if next_char != word[i]:
                # Replace only this position so the remainder of the word and
                # earlier replacements are kept.
                chars[i] = next_char
        corrected_words.append(''.join(chars))

    return ' '.join(corrected_words)

# UI Functions
def on_check_button_clicked(change):
    """Click handler: correct the text in the input box and show the result.

    Reads ``text_area.value``, runs it through ``correct_sentence`` and writes
    the outcome to ``corrected_output.value``.

    Args:
        change: Argument supplied by the click callback; not used.
    """
    corrected_output.value = correct_sentence(text_area.value)

# UI Components
# NOTE: `border_radius`, `background_color` and `font_weight` are not traits of
# ipywidgets' Layout, and `font_weight` is not a style trait of the Text
# widget. Passing them had no visual effect and only produced "unrecognized
# arguments" deprecation warnings, so they are no longer passed.
header = widgets.HTML("""
<h1 style="text-align: center; color: #4a90e2; font-family: Arial, sans-serif;">Sinhala Spelling Checker</h1>
""")

# Multi-line input box for the text to check
text_area = widgets.Textarea(
    value='',
    placeholder='Enter your text...',
    description='',
    layout=widgets.Layout(width='40%', height='150px', margin='0 auto', padding='10px')
)

# Button that triggers the check
check_button = widgets.Button(
    description='Check Text',
    button_style='primary',
    tooltip='Click to check spelling',
    style={'button_color': '#4a90e2', 'font_weight': 'bold'},
    icon='check',
    layout=widgets.Layout(margin='20px auto', width='15%')
)
check_button.on_click(on_check_button_clicked)

# Single-line box that receives the corrected text
corrected_output = widgets.Text(
    value='',
    placeholder='Corrected text will appear here...',
    description='Corrected Text:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='40%', margin='0 auto', padding='10px', border='1px solid #4a90e2', height='50px')
)

# Layout: stack the widgets vertically, centered
app_layout = widgets.VBox([
    header,
    text_area,
    check_button,
    corrected_output
], layout=widgets.Layout(align_items='center', justify_content='center', padding='20px'))

# Display UI
display(app_layout)
