<a href="https://colab.research.google.com/github/geek770/RIO125-Automate-detection-and-recognition-of-grammatical-errors/blob/main/Grammer_Identification_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import AdamWeightDecay
from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Function to load data from a CSV file
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Function to preprocess data, extracting sentences and labels
def preprocess_data(data):
    """Split a labelled frame into two parallel Python lists.

    Returns a ``(sentences, labels)`` tuple built from the ``'input'`` and
    ``'labels'`` columns of ``data``, in row order.
    """
    return data['input'].tolist(), data['labels'].tolist()

# Function to preprocess test data, tokenizing sentences
def preprocess_test_data(data, tokenizer, max_length):
    """Tokenize sentences and merge the encodings into one batch per key.

    Args:
        data: Either a DataFrame with an ``'input'`` column or an iterable
            of sentences. Null entries are tokenized as the empty string.
        tokenizer: Object exposing ``encode_plus`` (the callers in this file
            pass a ``BertTokenizer``).
        max_length: Length every sentence is truncated / padded to.

    Returns:
        dict mapping each key of the tokenizer output to a single tensor,
        obtained by concatenating the per-sentence tensors along axis 0.

    Raises:
        ValueError: if ``data`` contains no sentences (previously this
            surfaced as an opaque ``IndexError``).
    """
    if isinstance(data, pd.DataFrame):
        sentences = data['input'].tolist()
    else:
        sentences = list(data)

    if not sentences:
        raise ValueError("preprocess_test_data() received no sentences to tokenize")

    # One encoding per sentence; nulls (e.g. NaN cells) become "" so the
    # tokenizer always receives a string.
    encodings = [
        tokenizer.encode_plus(
            "" if pd.isnull(sentence) else sentence,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='tf',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_overflowing_tokens=False,
        )
        for sentence in sentences
    ]

    # Every encoding carries the same keys, so the first one defines them.
    return {
        key: tf.concat([encoding[key] for encoding in encodings], axis=0)
        for key in encodings[0].keys()
    }

# Function to train the BERT model
def train_bert_model(data_path, max_length=128, batch_size=32, epochs=3,
                     model_name='bert-base-uncased', save_path="bert_model"):
    """Fine-tune a BERT sequence classifier on a labelled CSV and save it.

    The CSV is loaded with ``load_data`` and must provide the ``'input'``
    and ``'labels'`` columns read by ``preprocess_data``. 20% of the rows are
    held out for the final evaluation, whose accuracy and classification
    report are printed.

    Args:
        data_path: Path of the labelled CSV file.
        max_length: Token length each sentence is truncated / padded to.
        batch_size: Batch size used for both training and evaluation.
        epochs: Number of training epochs.
        model_name: Pretrained checkpoint used for the tokenizer and model.
        save_path: Where the trained model is written by ``model.save``.
    """
    # Load and preprocess the data
    data = load_data(data_path)
    sentences, labels = preprocess_data(data)

    # Hold out 20% for evaluation; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        sentences, labels, test_size=0.2, random_state=42
    )

    # Tokenize both splits with the tokenizer matching the checkpoint
    tokenizer = BertTokenizer.from_pretrained(model_name)
    X_train_tokenized = preprocess_test_data(X_train, tokenizer, max_length)
    X_test_tokenized = preprocess_test_data(X_test, tokenizer, max_length)

    # Load pre-trained BERT model for sequence classification
    model = TFBertForSequenceClassification.from_pretrained(model_name)

    # The model outputs raw logits, hence from_logits=True on the loss
    optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Train, keeping a further 20% of the training split for validation
    model.fit(
        X_train_tokenized,
        np.array(y_train),
        validation_split=0.2,
        batch_size=batch_size,
        epochs=epochs
    )

    # Evaluate on the held-out split, honouring the requested batch size
    y_pred_logits = model.predict(X_test_tokenized, batch_size=batch_size).logits
    y_pred = np.argmax(y_pred_logits, axis=1)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the trained model
    model.save(save_path)

# Function to test the BERT model on new data
def test_bert_model(test_data_path, model_path, max_length=128, batch_size=32,
                    output_file="output_labelled.csv",
                    model_name='bert-base-uncased'):
    """Label the sentences of an Excel sheet with a fine-tuned BERT model.

    Args:
        test_data_path: Excel file with an ``'input'`` column of sentences.
        model_path: Location of the trained weights passed to
            ``model.load_weights``.
        max_length: Token length each sentence is truncated / padded to.
        batch_size: Batch size used for prediction (previously accepted but
            silently ignored).
        output_file: CSV file the labelled data is written to.
        model_name: Pretrained checkpoint used to build the model and
            tokenizer before the trained weights are loaded.

    The predictions are stored in a new ``'predicted_label'`` column and the
    whole frame is saved to ``output_file``.
    """
    # Build the architecture/tokenizer, then overwrite with trained weights.
    # Loading the weights first makes a wrong model_path fail before the
    # comparatively slow tokenization step.
    model = TFBertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model.load_weights(model_path)

    # Load and tokenize the test data
    test_data = pd.read_excel(test_data_path)
    X_test_tokenized = preprocess_test_data(test_data, tokenizer, max_length)

    # Predicted class = index of the largest logit
    y_pred_logits = model.predict(X_test_tokenized, batch_size=batch_size).logits
    y_pred = np.argmax(y_pred_logits, axis=1)

    # Add predictions to the test data and save to CSV
    test_data['predicted_label'] = y_pred
    test_data.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")


In [2]:

# Fine-tune BERT on the labelled sentences in input_data.csv
train_bert_model(data_path="input_data.csv")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3
Accuracy: 0.8417882788973824
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.76      0.82      2028
           1       0.81      0.91      0.86      2289

    accuracy                           0.84      4317
   macro avg       0.85      0.84      0.84      4317
weighted avg       0.85      0.84      0.84      4317



In [4]:


# Load the trained weights and label the sentences in test_data.xlsx
test_bert_model(test_data_path="test_data.xlsx", model_path="bert_model")



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to output_labelled.csv


In [None]:
import nltk
# Download NLTK's 'punkt' resource; presumably the tokenizer data that
# word_tokenize (used by check_grammar_errors) needs -- confirm in NLTK docs.
nltk.download('punkt')


In [None]:
import nltk
# Download NLTK's 'averaged_perceptron_tagger' resource; presumably the model
# behind pos_tag (used by check_grammar_errors) -- confirm in NLTK docs.
nltk.download('averaged_perceptron_tagger')


In [10]:
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def _collect_grammar_errors(pos_tags):
    """Run the rule-based checks over one POS-tagged sentence.

    Args:
        pos_tags: Sequence of ``(word, tag)`` pairs for a single sentence.

    Returns:
        List of human-readable error messages, in rule order per token.
    """
    errors = []
    will_shall = ('will', 'shall')

    # Each token is inspected together with its successor, so the final
    # token is only ever seen as "next" (same traversal as range(len - 1)).
    # NOTE(review): single-token rules therefore never fire on the last
    # token -- confirm whether that is intended.
    for (word, tag), (next_word, next_tag) in zip(pos_tags, pos_tags[1:]):
        # Subject-verb agreement: noun directly followed by a verb.
        # Reported once; the original ran this identical rule twice and so
        # duplicated every hit.
        # NOTE(review): the message suggests the very same word as the fix.
        if tag.startswith('N') and next_tag.startswith('VB'):
            errors.append(f"Subject-Verb Agreement Error: '{next_word}' should be '{next_word}'")

        # Article usage (a/an): only an error when the article actually
        # differs from the one the following word calls for. Previously
        # every correct article was reported too ("'a' should be 'a'").
        if tag == 'DT' and word.lower() in ('a', 'an') and next_tag.startswith(('N', 'JJ')):
            expected_article = 'an' if next_word.lower()[0] in 'aeiou' else 'a'
            if word.lower() != expected_article:
                errors.append(f"Article Usage Error: '{word}' should be '{expected_article}'")

        # Pluralization: noun followed by a possessive marker
        if tag.startswith('NN') and next_word == "'s":
            errors.append(
                f"Pluralization Error: Possessive form used for plural noun ('{word}' should be '{word + 's'}')"
            )

        # Past tense agreement: any verb tag other than VBD / VBN
        if tag.startswith('V') and 'VBD' not in tag and 'VBN' not in tag:
            errors.append(f"Past Tense Agreement Error: '{word}' should be in past tense")

        # Present tense agreement: any VB* tag other than VBG
        if tag.startswith('VB') and 'VBG' not in tag:
            errors.append(f"Present Tense Agreement Error: '{word}' should be in present tense")

        # Rule 2: Present Simple Tense
        if tag == 'VBP' and word.endswith('s'):
            errors.append(f"Present Simple Tense Error: '{word}' should be '{word[:-1]}'")

        # Rule 3: Present Continuous Tense
        if tag == 'VBG' and word != 'am':
            errors.append(f"Present Continuous Tense Error: '{word}' should be '{'am ' + word}'")

        # Rule 4: Present Perfect Tense
        if tag == 'VBN' and word != 'been':
            errors.append(f"Present Perfect Tense Error: '{word}' should be '{'have ' + word}'")

        # Rule 5: Present Perfect Continuous Tense
        if tag == 'VBG' and word == 'been':
            errors.append(f"Present Perfect Continuous Tense Error: '{word}' should be '{'been ' + word}'")

        # Rule 6: Past Simple Tense
        if tag == 'VBD' and not word.endswith('ed'):
            errors.append(f"Past Simple Tense Error: '{word}' should be '{word + 'ed'}'")

        # Rule 7: Past Continuous Tense
        if tag == 'VBD' and word == 'were':
            errors.append(f"Past Continuous Tense Error: '{word}' should be '{'were ' + next_word}'")

        # Rule 8: Past Perfect Tense
        if tag == 'VBN' and word != 'had':
            errors.append(f"Past Perfect Tense Error: '{word}' should be '{'had ' + word}'")

        # Rule 9: Past Perfect Continuous Tense
        if tag == 'VBN' and word == 'had':
            errors.append(f"Past Perfect Continuous Tense Error: '{word}' should be '{'had been ' + next_word}'")

        # Rules 10-13 all key off a modal (MD) token.
        if tag == 'MD':
            if word not in will_shall:
                # Rule 10: Future Simple Tense
                errors.append(f"Future Simple Tense Error: '{word}' should be '{'will ' + next_word}'")
            else:
                # NOTE(review): rules 11-13 share one condition, so every
                # 'will'/'shall' yields all three messages -- kept as-is.
                # Rule 11: Future Continuous Tense
                errors.append(f"Future Continuous Tense Error: '{word}' should be '{word + ' be ' + next_word}'")
                # Rule 12: Future Perfect Tense
                errors.append(f"Future Perfect Tense Error: '{word}' should be '{word + ' have ' + next_word}'")
                # Rule 13: Future Perfect Continuous Tense
                errors.append(
                    f"Future Perfect Continuous Tense Error: '{word}' should be '{word + ' have been ' + next_word}'"
                )

    return errors


def check_grammar_errors(input_file_path, output_file_path, *, verbose=True):
    """Annotate model-labelled sentences with rule-based grammar errors.

    Reads a CSV with ``'input'`` and ``'predicted_label'`` columns and writes
    a CSV with the columns ``input``, ``label`` and ``error_type``.

    * Rows whose ``'input'`` is NaN are skipped and do not appear in the
      output.
    * Rows with ``predicted_label == 0`` are tokenized, POS-tagged and run
      through the rule checks; ``error_type`` is the list of messages, or
      ``"No Error"`` when no rule fires.
    * All other rows are written with label ``1`` and ``"No Error"``; they
      are no longer tokenized/tagged since that result was never used.

    Args:
        input_file_path: CSV file to read.
        output_file_path: CSV file to write.
        verbose: When true (the default, matching the original behaviour),
            print each sentence before it is processed.
    """
    input_df = pd.read_csv(input_file_path)
    results = []

    for _, row in input_df.iterrows():
        sentence = row['input']

        # Skip NaN values
        if pd.isna(sentence):
            continue

        if verbose:
            print("Sentence before tokenization:", sentence)

        # Only sentences labelled 0 get an error analysis.
        if row['predicted_label'] != 0:
            results.append((sentence, 1, "No Error"))
            continue

        errors = _collect_grammar_errors(pos_tag(word_tokenize(sentence)))
        results.append((sentence, 0, errors if errors else "No Error"))

    # Create a DataFrame from the results and save it
    output_df = pd.DataFrame(results, columns=['input', 'label', 'error_type'])
    output_df.to_csv(output_file_path, index=False)

# Example usage: add rule-based error types to the labelled predictions
check_grammar_errors(
    input_file_path="output_labelled.csv",
    output_file_path="final_predictions_with_errortype.csv",
)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentence before tokenization: today i woke up at o ' clock and took a shower .
Sentence before tokenization: It is an enjoyment !
Sentence before tokenization: While waiting for them , I wrote a journal in lang - .
Sentence before tokenization: As soon as possible , I have to go to bed .
Sentence before tokenization: FINALLY , I made it !
Sentence before tokenization: After i finished my militery service , 
Sentence before tokenization: At p .
Sentence before tokenization: I am very fun !
Sentence before tokenization: T means quiet .
Sentence before tokenization: Be aware that customers ca not go to the th and th floors .
Sentence before tokenization: I want to go abroad but I do not have enough money and I do not have enough time .
Sentence before tokenization: I always write an essay after I come home from my part - time job and then I have no free time .
Sentence before tokenization: I came to visit a friend that is st