In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout
import pickle
import os
import json
from difflib import SequenceMatcher

In [None]:
class SinhalaGrammarChecker:
    """Sinhala sentence checker that blends two classifiers and suggests fixes.

    A random-forest classifier over hand-crafted text statistics and a
    bidirectional-LSTM model over token sequences each estimate the
    probability that a sentence is correct; the two estimates are blended
    into one score. For sentences judged incorrect, corrections are proposed
    from memorised (incorrect -> correct) sentence pairs and from
    character-level substitution patterns learned from those pairs.
    """

    def __init__(self, max_features=10000, max_length=100):
        """Create an untrained checker.

        Args:
            max_features: Vocabulary size passed to the tokenizer and used as
                the embedding input dimension.
            max_length: Length every token sequence is padded/truncated to.
        """
        self.max_features = max_features
        self.max_length = max_length
        self.tokenizer = Tokenizer(num_words=max_features)
        self.classifier = RandomForestClassifier(n_estimators=200, max_depth=20)
        self.neural_model = None
        # Maps a known-incorrect sentence to its corrected form.
        self.correction_pairs = {}
        # Maps "before|char|after" context keys to {replacement_char: count}.
        self.char_patterns = {}

    def extract_linguistic_features(self, text):
        """Compute the hand-crafted features for one sentence.

        Args:
            text: The sentence to analyse.

        Returns:
            dict: Feature name -> numeric value. The insertion order of the
            keys defines the column order used by ``create_feature_matrix``.
        """
        features = {}

        # 1. Basic text statistics
        features['text_length'] = len(text)
        features['word_count'] = len(text.split())

        # 2. Character patterns (specific to Sinhala)
        features['vowel_modifier_count'] = len(re.findall(r'[්ා-ෟ]', text))
        features['consonant_count'] = len(re.findall(r'[ක-ෆ]', text))

        # 3. Structural features: 1 when the text ends with '.', '?' or '!'
        features['proper_ending'] = int(bool(re.search(r'[.?!]$', text)))

        # 4. N-gram features
        char_bigrams = [text[i:i+2] for i in range(len(text)-1)]
        char_trigrams = [text[i:i+3] for i in range(len(text)-2)]

        # The "+ 1" keeps the division safe when the text is too short to
        # contain any n-gram.
        features['unique_bigram_ratio'] = len(set(char_bigrams)) / (len(char_bigrams) + 1)
        features['unique_trigram_ratio'] = len(set(char_trigrams)) / (len(char_trigrams) + 1)

        # 5. Syllable patterns: a consonant optionally followed by one sign
        features['syllable_count'] = len(re.findall(r'[ක-ෆ][්ා-ෟ]?', text))

        return features

    def create_feature_matrix(self, texts):
        """Build the classifier input for a collection of sentences.

        Args:
            texts: Iterable of sentences.

        Returns:
            numpy.ndarray: One row per sentence, one column per feature, in
            the key order produced by ``extract_linguistic_features``.
        """
        return np.array([
            list(self.extract_linguistic_features(text).values())
            for text in texts
        ])

    def build_neural_model(self):
        """Build and compile the sequence classifier.

        Returns:
            A compiled Keras ``Model``: embedding -> two stacked bidirectional
            LSTM layers with dropout -> dense -> single sigmoid output,
            optimised with Adam on binary cross-entropy and tracking accuracy.
        """
        input_layer = Input(shape=(self.max_length,))
        x = Embedding(self.max_features, 256)(input_layer)
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        x = Dropout(0.2)(x)
        x = Bidirectional(LSTM(64))(x)
        x = Dropout(0.2)(x)
        x = Dense(64, activation='relu')(x)
        output = Dense(1, activation='sigmoid')(x)

        model = Model(input_layer, output)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def analyze_issues(self, text):
        """Run rule-based checks and describe any problems found.

        Args:
            text: The sentence to inspect.

        Returns:
            list[str]: Human-readable issue descriptions (possibly empty).
        """
        issues = []
        features = self.extract_linguistic_features(text)

        # 'proper_ending' is computed with the same end-of-text punctuation
        # regex this check needs, so reuse it instead of matching twice.
        if not features['proper_ending']:
            issues.append("Missing sentence ending punctuation")

        if features['syllable_count'] < 2:
            issues.append("Sentence seems too short")

        # NOTE(review): the ratio is unique/(total + 1), so any text with ten
        # or more all-distinct trigrams already exceeds 0.9. This check
        # therefore fires on ordinary sentences; the threshold probably needs
        # calibrating against real data.
        if features['unique_trigram_ratio'] > 0.9:
            issues.append("Unusual character combinations detected")

        return issues

    def train(self, X_train, X_seq_train, y_train, epochs=6):
        """Fit the feature classifier and a freshly built neural model.

        Args:
            X_train: Feature matrix from ``create_feature_matrix``.
            X_seq_train: Padded token sequences for the same samples.
            y_train: Binary labels (1 = correct sentence).
            epochs: Number of epochs for the neural model.

        Returns:
            The object returned by the neural model's ``fit`` call.
        """
        # Train the feature-based classifier
        self.classifier.fit(X_train, y_train)

        # Train the neural model
        self.neural_model = self.build_neural_model()
        return self.neural_model.fit(
            X_seq_train, y_train,
            epochs=epochs,
            batch_size=32,
            validation_split=0.2
        )

    def prepare_correction_data(self, df):
        """Memorise correction pairs and learn character patterns from them.

        Args:
            df: DataFrame with ``input`` and ``target`` columns; rows where
                the two differ are treated as (incorrect, correct) pairs.
        """
        incorrect_samples = df[df['input'] != df['target']]
        self.correction_pairs = dict(zip(incorrect_samples['input'], incorrect_samples['target']))

        # correction_pairs is replaced wholesale above, so rebuild the
        # character patterns from scratch as well; otherwise calling this
        # twice would double-count every pattern.
        self.char_patterns = {}
        self.learn_char_patterns(incorrect_samples)

    @staticmethod
    def _pattern_key(text, i):
        """Return the "before|char|after" context key for position ``i``."""
        return f"{text[max(0, i-2):i]}|{text[i]}|{text[i+1:i+3]}"

    def learn_char_patterns(self, incorrect_samples):
        """Count same-position character substitutions with their context.

        Only positions present in both strings are compared, so insertions
        and deletions are not modelled. Counts are added to
        ``self.char_patterns``.

        Args:
            incorrect_samples: Table with ``input`` (incorrect) and ``target``
                (correct) columns.
        """
        pairs = zip(incorrect_samples['input'], incorrect_samples['target'])
        for incorrect, correct in pairs:
            # zip() stops at the shorter string, which is the bounds check.
            for i, (wrong_char, right_char) in enumerate(zip(incorrect, correct)):
                if wrong_char == right_char:
                    continue
                counts = self.char_patterns.setdefault(self._pattern_key(incorrect, i), {})
                counts[right_char] = counts.get(right_char, 0) + 1

    def find_similar_sentences(self, text, n=3, threshold=0.6):
        """Find memorised incorrect sentences that resemble ``text``.

        Args:
            text: The sentence to match.
            n: Maximum number of matches to return.
            threshold: Matches must have a similarity ratio strictly greater
                than this value.

        Returns:
            list[tuple]: ``(incorrect, correct, similarity)`` tuples sorted by
            descending similarity, at most ``n`` long.
        """
        similar_pairs = []
        for incorrect, correct in self.correction_pairs.items():
            similarity = SequenceMatcher(None, text, incorrect).ratio()
            if similarity > threshold:
                similar_pairs.append((incorrect, correct, similarity))

        return sorted(similar_pairs, key=lambda x: x[2], reverse=True)[:n]

    def apply_char_patterns(self, text):
        """Generate candidates by applying learned single-character fixes.

        Args:
            text: The sentence to rewrite.

        Returns:
            list[str]: The original ``text`` first, followed by each distinct
            variant produced by replacing one character with one of the two
            most frequent replacements learned for its context.
        """
        suggestions = [text]

        for i in range(len(text)):
            replacements = self.char_patterns.get(self._pattern_key(text, i))
            if not replacements:
                continue

            ranked = sorted(replacements.items(), key=lambda x: x[1], reverse=True)
            for correct_char, _ in ranked[:2]:
                new_text = text[:i] + correct_char + text[i+1:]
                if new_text not in suggestions:
                    suggestions.append(new_text)

        return suggestions

    def _collect_corrections(self, text):
        """Gather distinct correction candidates that differ from ``text``."""
        candidates = [correct for _, correct, _ in self.find_similar_sentences(text)]
        candidates.extend(self.apply_char_patterns(text))

        corrections = []
        for candidate in candidates:
            # apply_char_patterns always lists the unchanged input first;
            # echoing the input back is not a correction, so drop it.
            if candidate != text and candidate not in corrections:
                corrections.append(candidate)
        return corrections

    def predict_and_correct(self, text):
        """Score a sentence and, if judged incorrect, propose corrections.

        Args:
            text: The sentence to check.

        Returns:
            dict: ``is_correct`` (bool), ``confidence`` (float, the blended
            probability that the sentence is correct), ``issues`` (list of
            rule-based findings) and ``corrections`` (list of suggested
            sentences; empty when the sentence is judged correct).

        Raises:
            RuntimeError: If no neural model has been trained or loaded.
        """
        if self.neural_model is None:
            raise RuntimeError(
                "Neural model is not available; call train() or load_models() first."
            )

        features = self.create_feature_matrix([text])
        sequence = self.tokenizer.texts_to_sequences([text])
        sequence_padded = pad_sequences(sequence, maxlen=self.max_length)

        feature_pred = self.classifier.predict_proba(features)[0][1]
        neural_pred = self.neural_model.predict(sequence_padded)[0][0]

        final_score = 0.4 * feature_pred + 0.6 * neural_pred
        # Coerce to a plain bool so the result stays JSON-serialisable even
        # when the models hand back NumPy scalars.
        is_correct = bool(final_score > 0.5)

        return {
            'is_correct': is_correct,
            'confidence': float(final_score),
            'issues': self.analyze_issues(text),
            'corrections': [] if is_correct else self._collect_corrections(text)
        }

    def save_models(self, save_dir='saved_models'):
        """Persist both models, the tokenizer, correction data and config.

        Args:
            save_dir: Directory to write into; created if missing.
        """
        os.makedirs(save_dir, exist_ok=True)

        # Save Random Forest classifier
        with open(os.path.join(save_dir, 'classifier.pkl'), 'wb') as f:
            pickle.dump(self.classifier, f)

        # Save Neural model with .keras extension (skipped when untrained)
        if self.neural_model is not None:
            self.neural_model.save(os.path.join(save_dir, 'neural_model.keras'))

        # Save tokenizer
        with open(os.path.join(save_dir, 'tokenizer.pkl'), 'wb') as f:
            pickle.dump(self.tokenizer, f)

        # Save correction data
        correction_data = {
            'correction_pairs': self.correction_pairs,
            'char_patterns': self.char_patterns
        }
        with open(os.path.join(save_dir, 'correction_data.pkl'), 'wb') as f:
            pickle.dump(correction_data, f)

        # Save configuration
        config = {
            'max_features': self.max_features,
            'max_length': self.max_length
        }
        with open(os.path.join(save_dir, 'config.json'), 'w') as f:
            json.dump(config, f)

        print(f"Models and configuration saved to {save_dir}")

    def load_models(self, save_dir='saved_models'):
        """Restore everything written by ``save_models``.

        Args:
            save_dir: Directory previously passed to ``save_models``.

        Returns:
            bool: True on success. On any failure the error is printed and
            False is returned; attributes loaded before the failure keep
            their new values.
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only point this at directories produced by a trusted save_models().
        try:
            # Load configuration
            with open(os.path.join(save_dir, 'config.json'), 'r') as f:
                config = json.load(f)
                self.max_features = config['max_features']
                self.max_length = config['max_length']

            # Load Random Forest classifier
            with open(os.path.join(save_dir, 'classifier.pkl'), 'rb') as f:
                self.classifier = pickle.load(f)

            # Load Neural model with .keras extension
            self.neural_model = load_model(os.path.join(save_dir, 'neural_model.keras'))

            # Load tokenizer
            with open(os.path.join(save_dir, 'tokenizer.pkl'), 'rb') as f:
                self.tokenizer = pickle.load(f)

            # Load correction data
            with open(os.path.join(save_dir, 'correction_data.pkl'), 'rb') as f:
                correction_data = pickle.load(f)
                self.correction_pairs = correction_data['correction_pairs']
                self.char_patterns = correction_data['char_patterns']

            print("Models loaded successfully")
            return True
        except Exception as e:
            # Deliberate best-effort: callers branch on the boolean result.
            print(f"Error loading models: {str(e)}")
            return False

In [None]:
# Colab-only step: files.upload() prompts for local files and stores them in
# the runtime's working directory (the recorded output shows dataset.csv being
# saved there). `uploaded` keeps whatever files.upload() returns.
from google.colab import files
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [None]:
def train_model(data_path='dataset.csv', epochs=6, test_size=0.2, random_state=42):
    """Train both models, report held-out accuracy, and save everything.

    Args:
        data_path: CSV file with ``input`` (sentence as written) and
            ``target`` (corrected sentence) columns.
        epochs: Number of epochs for the neural model.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        str: A fixed success message.

    Raises:
        ValueError: If the CSV lacks the ``input`` or ``target`` column.
    """
    checker = SinhalaGrammarChecker()

    # Load data
    df = pd.read_csv(data_path)
    missing_columns = {'input', 'target'} - set(df.columns)
    if missing_columns:
        raise ValueError(f"{data_path} is missing required columns: {sorted(missing_columns)}")

    # Rows without a sentence cannot be featurised (len() of NaN fails), and
    # NaN never equals NaN, so such rows would also be mislabelled.
    df = df.dropna(subset=['input', 'target']).reset_index(drop=True)
    df['is_correct'] = (df['input'] == df['target']).astype(int)

    # Prepare correction data (lookup memory; not part of the evaluation below)
    checker.prepare_correction_data(df)

    # Split first so nothing fitted below ever sees the held-out rows.
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Extract features
    X_train = checker.create_feature_matrix(train_df['input'])
    X_test = checker.create_feature_matrix(test_df['input'])
    y_train = train_df['is_correct'].values
    y_test = test_df['is_correct'].values

    # Prepare sequence data; the vocabulary comes from the training split only
    checker.tokenizer.fit_on_texts(train_df['input'])
    X_seq_train = pad_sequences(
        checker.tokenizer.texts_to_sequences(train_df['input']),
        maxlen=checker.max_length
    )
    X_seq_test = pad_sequences(
        checker.tokenizer.texts_to_sequences(test_df['input']),
        maxlen=checker.max_length
    )

    # Train models
    checker.train(X_train, X_seq_train, y_train, epochs=epochs)

    # Evaluate on the held-out split, which was previously computed but unused
    feature_accuracy = checker.classifier.score(X_test, y_test)
    _, neural_accuracy = checker.neural_model.evaluate(X_seq_test, y_test, verbose=0)
    print(f"Held-out accuracy - feature classifier: {feature_accuracy:.3f}, "
          f"neural model: {neural_accuracy:.3f}")

    # Save models
    checker.save_models()

    return "Model trained and saved successfully!"

def test_sentence(text):
    """Load model and test a single sentence"""
    checker = SinhalaGrammarChecker()

    if checker.load_models():
        result = checker.predict_and_correct(text)
        return result
    else:
        return "Error: Could not load models. Please train the model first."


In [None]:
# Run the training pipeline defined above; the status string it returns is
# displayed as this cell's result.
train_model()

Epoch 1/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 780ms/step - accuracy: 0.6299 - loss: 0.6609 - val_accuracy: 0.6386 - val_loss: 0.6625
Epoch 2/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 588ms/step - accuracy: 0.6180 - loss: 0.6730 - val_accuracy: 0.6386 - val_loss: 0.6420
Epoch 3/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 762ms/step - accuracy: 0.6329 - loss: 0.6469 - val_accuracy: 0.6386 - val_loss: 0.6407
Epoch 4/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 789ms/step - accuracy: 0.6652 - loss: 0.6164 - val_accuracy: 0.6627 - val_loss: 0.6319
Epoch 5/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 580ms/step - accuracy: 0.7115 - loss: 0.5759 - val_accuracy: 0.5542 - val_loss: 0.7145
Epoch 6/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 572ms/step - accuracy: 0.8481 - loss: 0.4193 - val_accuracy: 0.5783 - val_loss: 1.0045
Models and configuration saved

'Model trained and saved successfully!'

In [None]:
# Interactive loop: check sentences typed by the user until they enter 'quit'.
if __name__ == "__main__":
    while True:
        sentence = input("Enter a Sinhala sentence (or 'quit' to exit): ")
        if sentence.strip().lower() == 'quit':
            break
        if not sentence.strip():
            # Nothing to analyse; ask again instead of scoring an empty string.
            continue

        result = test_sentence(sentence)
        # test_sentence returns an error *string* when the saved models cannot
        # be loaded; indexing it like a dict would raise TypeError.
        if isinstance(result, str):
            print(result)
            break

        print("\nResults:")
        print(f"Is correct: {result['is_correct']}")
        print(f"Confidence: {result['confidence']:.2f}")

        if result['issues']:
            print("\nIssues found:")
            for issue in result['issues']:
                print(f"- {issue}")

        if not result['is_correct'] and result['corrections']:
            print("\nSuggested corrections:")
            # Only the top-ranked suggestion is shown.
            print(result['corrections'][0])
        print("\n")

Enter a Sinhala sentence (or 'quit' to exit): අපි නිවසට ගමන් කරති
Models loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 520ms/step

Results:
Is correct: False
Confidence: 0.46

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected

Suggested corrections:
අපි නිවසට ගමන් කරති


Enter a Sinhala sentence (or 'quit' to exit): තාත්තා වාහනය එලවති
Models loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 542ms/step

Results:
Is correct: False
Confidence: 0.19

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected

Suggested corrections:
තාත්තා වාහනය එලවයි


Enter a Sinhala sentence (or 'quit' to exit): මල්ලි අම්මා සමඟ පොළට යමි
Models loaded successfully




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 519ms/step

Results:
Is correct: True
Confidence: 0.80

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected


Enter a Sinhala sentence (or 'quit' to exit): නිවාඩු දවසේ, මම යහළුවන් සමග සෙල්ලම් කිරීමට යමු
Models loaded successfully




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 539ms/step

Results:
Is correct: True
Confidence: 0.68

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected


Enter a Sinhala sentence (or 'quit' to exit): දකුණු පළාතේ නාකියාදෙණීය තෙල්ලඹුරේ ග්‍රාමයේ දී රත්න ශ්‍රී විජේසිංහ උපන්නේය
Models loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 540ms/step

Results:
Is correct: True
Confidence: 0.78

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected


Enter a Sinhala sentence (or 'quit' to exit): ගොවියෝ ගොයම් කපා කමතට අදියි
Models loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 541ms/step

Results:
Is correct: False
Confidence: 0.20

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected

Suggested corrections:
ගොවියෝ ගොයම් කපා කමතට අදිති


Enter a Sinhala sentence (or 'quit' to exit): අක්කා මල් න

In [None]:
# Menu cell: either retrain the models or check sentences interactively.
if __name__ == "__main__":
    mode = input("Enter 'train' to train new model or 'test' to test sentences: ").strip().lower()

    if mode == 'train':
        print(train_model())
    elif mode == 'test':
        while True:
            sentence = input("Enter a Sinhala sentence (or 'quit' to exit): ")
            if sentence.strip().lower() == 'quit':
                break
            if not sentence.strip():
                # Nothing to analyse; ask again instead of scoring an empty string.
                continue

            result = test_sentence(sentence)
            # test_sentence returns an error *string* when the saved models
            # cannot be loaded; indexing it like a dict would raise TypeError.
            if isinstance(result, str):
                print(result)
                break

            print("\nResults:")
            print(f"Is correct: {result['is_correct']}")
            print(f"Confidence: {result['confidence']:.2f}")

            if result['issues']:
                print("\nIssues found:")
                for issue in result['issues']:
                    print(f"- {issue}")

            if not result['is_correct'] and result['corrections']:
                print("\nSuggested corrections:")
                # Print only the first correction
                print(f"1. {result['corrections'][0]}")
            print("\n")
    else:
        # Previously an unrecognised mode ended the cell without any feedback.
        print(f"Unknown mode {mode!r}; expected 'train' or 'test'.")

Enter 'train' to train new model or 'test' to test sentences: test
Enter a Sinhala sentence (or 'quit' to exit): මම ඒ පිළිබඳ කතා කරාවි
Models loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 541ms/step

Results:
Is correct: True
Confidence: 0.72

Issues found:
- Missing sentence ending punctuation
- Unusual character combinations detected


Enter a Sinhala sentence (or 'quit' to exit): quit
