# Grammar Fix Bot

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import torch
import sqlite3

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Report whether PyTorch can see a CUDA-capable GPU; the message states
# which device type will be used.
print(
    "CUDA is available. GPU will be used."
    if torch.cuda.is_available()
    else "CUDA is not available. Training will use CPU."
)

CUDA is available. GPU will be used.


In [None]:
# Record the Python interpreter version this notebook was executed with.
import sys
print(sys.version)

3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]


## Data preparation

### Read scraped sentences

In [None]:
# Load every row of the `sentences` table from the local SQLite database and
# keep only the two text columns needed for training.
db_path = 'parser/sentences.db'
connection = sqlite3.connect(db_path)
try:
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM sentences;")
    sentences = cursor.fetchall()
finally:
    # Always release the database handle, even if the query raises.
    connection.close()

# Column names are assigned positionally to the four values of each fetched row.
df = pd.DataFrame(sentences, columns=['id', 'Original', 'Altered', 'category'])
df = df.drop(['id','category'], axis=1)

In [None]:
# The altered sentence is what the model reads ('input'); the original
# sentence is what it should produce ('target').
df = df.rename(columns={'Altered': 'input', 'Original': 'target'})
df = df.loc[:, ['input', 'target']]
df.head()

Unnamed: 0,input,target
0,"You may copy it, give it away or re-use it und...","You may copy it, give it away or re-use it und..."
1,"Down the Rabbit-Hole CHAPETR II, The Pool of ...","Down the Rabbit-Hole CHAPTER II,The Pool of T..."
2,"In another moment down went Alice after it, ne...","In another moment down went Alice after it, ne..."
3,After a time she herd a little pattering of fe...,After a time she heard a little pattering of f...
4,I wish I hadn't cried so much!,I wish I hadn’t cried so much!


## Data preprocessing

In [None]:
# Wrap the (input, target) DataFrame in a Hugging Face `Dataset` so it can be
# tokenized with `.map` and split in the cells that follow.
dataset = Dataset.from_pandas(df)

In [None]:
# Base checkpoint: https://huggingface.co/pszemraj/grammar-synthesis-small
# Load the tokenizer and the seq2seq model that will be fine-tuned below.
# NOTE(review): `model_path` is expected to be a local directory holding a
# copy of that checkpoint — confirm it exists next to the notebook.
model_path = "grammar-synthesis-small"  # Path from local folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Preprocess the data
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: A batch mapping with an "input" list (sentences to correct)
            and a "target" list (reference corrections) of equal length.

    Returns:
        The tokenizer output for the "input" sentences with an added
        "labels" entry: the token ids of the "target" sentences, where every
        padding position is replaced by -100.
    """
    model_inputs = tokenizer(examples["input"], truncation=True, padding="max_length", max_length=128)
    target_encodings = tokenizer(examples["target"], truncation=True, padding="max_length", max_length=128)

    # Targets are padded to a fixed length. Without masking, the loss would be
    # computed on the padding positions as well; -100 is the label value the
    # loss ignores.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in target_encodings["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset; `batched=True` passes lists of examples to
# `preprocess_function` instead of one example at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/80938 [00:00<?, ? examples/s]

Map: 100%|██████████| 80938/80938 [00:08<00:00, 9145.74 examples/s] 


In [None]:
# Split into training and validation sets (90% / 10%).
# The result is bound to `dataset_splits` rather than `train_test_split` so
# that the scikit-learn function of that name imported at the top of the
# notebook is not overwritten. The fixed seed makes the split identical on
# every Restart & Run All.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

## Fine-tune the model

In [None]:
# Switch off Weights & Biases reporting for this run via an environment variable.
import wandb
import os
# NOTE(review): the recorded output of the TrainingArguments cell reports that
# WANDB_DISABLED is deprecated and recommends the `report_to` option instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Fine-tuning configuration: 3 epochs, batch size 16 per device, evaluation at
# the end of every epoch, a loss log entry every 500 steps.
training_args = TrainingArguments(
    output_dir="./results",
    run_name="my_custom_run_name",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=500,
    # Disable external experiment trackers explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Initialize the Trainer with the model, the training configuration and the
# train/validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
# Fine-tune the model
# Runs the training loop configured in `training_args`; the recorded run took
# about three hours for 13,659 optimisation steps.
# NOTE(review): no visible cell calls `trainer.save_model()` — confirm the
# final weights are persisted somewhere other than the periodic checkpoints.
trainer.train()

  4%|▎         | 500/13659 [06:45<2:55:38,  1.25it/s]

{'loss': 4.5828, 'grad_norm': 0.3901728093624115, 'learning_rate': 4.816970495643898e-05, 'epoch': 0.11}


  7%|▋         | 1000/13659 [13:28<2:50:33,  1.24it/s]

{'loss': 0.2211, 'grad_norm': 0.4652789533138275, 'learning_rate': 4.633940991287796e-05, 'epoch': 0.22}


 11%|█         | 1500/13659 [20:14<2:43:19,  1.24it/s]

{'loss': 0.2005, 'grad_norm': 0.38605397939682007, 'learning_rate': 4.4509114869316936e-05, 'epoch': 0.33}


 15%|█▍        | 2000/13659 [26:59<2:37:14,  1.24it/s]

{'loss': 0.1915, 'grad_norm': 0.2563535273075104, 'learning_rate': 4.2678819825755914e-05, 'epoch': 0.44}


 18%|█▊        | 2500/13659 [33:40<2:28:54,  1.25it/s]

{'loss': 0.1922, 'grad_norm': 0.36243537068367004, 'learning_rate': 4.084852478219489e-05, 'epoch': 0.55}


 22%|██▏       | 3000/13659 [40:22<2:23:23,  1.24it/s]

{'loss': 0.1807, 'grad_norm': 0.33316799998283386, 'learning_rate': 3.901822973863387e-05, 'epoch': 0.66}


 26%|██▌       | 3500/13659 [47:04<2:16:21,  1.24it/s]

{'loss': 0.1855, 'grad_norm': 0.31471121311187744, 'learning_rate': 3.718793469507285e-05, 'epoch': 0.77}


 29%|██▉       | 4000/13659 [53:45<2:10:08,  1.24it/s]

{'loss': 0.1776, 'grad_norm': 0.3814409673213959, 'learning_rate': 3.5357639651511826e-05, 'epoch': 0.88}


 33%|███▎      | 4500/13659 [1:00:28<2:03:12,  1.24it/s]

{'loss': 0.1774, 'grad_norm': 0.23864635825157166, 'learning_rate': 3.3527344607950804e-05, 'epoch': 0.99}


                                                        
 33%|███▎      | 4553/13659 [1:02:10<2:00:47,  1.26it/s]

{'eval_loss': 0.15592795610427856, 'eval_runtime': 59.3418, 'eval_samples_per_second': 136.396, 'eval_steps_per_second': 8.527, 'epoch': 1.0}


 37%|███▋      | 5000/13659 [1:08:09<1:55:41,  1.25it/s] 

{'loss': 0.175, 'grad_norm': 0.4109521210193634, 'learning_rate': 3.169704956438978e-05, 'epoch': 1.1}


 40%|████      | 5500/13659 [1:14:52<1:49:34,  1.24it/s]

{'loss': 0.1695, 'grad_norm': 0.30836573243141174, 'learning_rate': 2.986675452082876e-05, 'epoch': 1.21}


 44%|████▍     | 6000/13659 [1:21:34<1:42:19,  1.25it/s]

{'loss': 0.1709, 'grad_norm': 0.4615520238876343, 'learning_rate': 2.8036459477267734e-05, 'epoch': 1.32}


 48%|████▊     | 6500/13659 [1:28:16<1:35:49,  1.25it/s]

{'loss': 0.1626, 'grad_norm': 0.21574218571186066, 'learning_rate': 2.6206164433706715e-05, 'epoch': 1.43}


 51%|█████     | 7000/13659 [1:34:58<1:28:50,  1.25it/s]

{'loss': 0.1639, 'grad_norm': 0.2500581443309784, 'learning_rate': 2.4375869390145693e-05, 'epoch': 1.54}


 55%|█████▍    | 7500/13659 [1:41:40<1:23:12,  1.23it/s]

{'loss': 0.1666, 'grad_norm': 0.4270612895488739, 'learning_rate': 2.2545574346584668e-05, 'epoch': 1.65}


 59%|█████▊    | 8000/13659 [1:48:22<1:15:55,  1.24it/s]

{'loss': 0.1661, 'grad_norm': 0.2851518392562866, 'learning_rate': 2.071527930302365e-05, 'epoch': 1.76}


 62%|██████▏   | 8500/13659 [1:55:03<1:09:10,  1.24it/s]

{'loss': 0.1612, 'grad_norm': 0.28403419256210327, 'learning_rate': 1.8884984259462627e-05, 'epoch': 1.87}


 66%|██████▌   | 9000/13659 [2:01:45<1:03:40,  1.22it/s]

{'loss': 0.1621, 'grad_norm': 0.2436876893043518, 'learning_rate': 1.7054689215901605e-05, 'epoch': 1.98}


                                                        
 67%|██████▋   | 9106/13659 [2:04:09<59:38,  1.27it/s]

{'eval_loss': 0.14702457189559937, 'eval_runtime': 59.3102, 'eval_samples_per_second': 136.469, 'eval_steps_per_second': 8.531, 'epoch': 2.0}


 70%|██████▉   | 9500/13659 [2:09:26<55:19,  1.25it/s]   

{'loss': 0.1579, 'grad_norm': 0.1674908995628357, 'learning_rate': 1.522439417234058e-05, 'epoch': 2.09}




{'loss': 0.1664, 'grad_norm': 0.2247813642024994, 'learning_rate': 1.339409912877956e-05, 'epoch': 2.2}


 77%|███████▋  | 10500/13659 [2:22:51<42:44,  1.23it/s]  

{'loss': 0.1578, 'grad_norm': 0.3120589852333069, 'learning_rate': 1.1563804085218538e-05, 'epoch': 2.31}


 81%|████████  | 11000/13659 [2:29:32<35:27,  1.25it/s]

{'loss': 0.1569, 'grad_norm': 0.24390892684459686, 'learning_rate': 9.733509041657515e-06, 'epoch': 2.42}


 84%|████████▍ | 11500/13659 [2:36:13<28:50,  1.25it/s]

{'loss': 0.159, 'grad_norm': 0.2793842852115631, 'learning_rate': 7.903213998096494e-06, 'epoch': 2.53}


 88%|████████▊ | 12000/13659 [2:42:54<22:11,  1.25it/s]

{'loss': 0.156, 'grad_norm': 0.2732395827770233, 'learning_rate': 6.072918954535471e-06, 'epoch': 2.64}


 92%|█████████▏| 12500/13659 [2:49:34<15:26,  1.25it/s]

{'loss': 0.1636, 'grad_norm': 0.2520428001880646, 'learning_rate': 4.242623910974449e-06, 'epoch': 2.75}


 95%|█████████▌| 13000/13659 [2:56:14<08:50,  1.24it/s]

{'loss': 0.1607, 'grad_norm': 0.2811274528503418, 'learning_rate': 2.412328867413427e-06, 'epoch': 2.86}


 99%|█████████▉| 13500/13659 [3:02:54<02:07,  1.25it/s]

{'loss': 0.1557, 'grad_norm': 0.26778683066368103, 'learning_rate': 5.82033823852405e-07, 'epoch': 2.97}


                                                       
100%|██████████| 13659/13659 [3:06:02<00:00,  1.22it/s]

{'eval_loss': 0.14532490074634552, 'eval_runtime': 59.0261, 'eval_samples_per_second': 137.126, 'eval_steps_per_second': 8.572, 'epoch': 3.0}
{'train_runtime': 11162.3307, 'train_samples_per_second': 19.578, 'train_steps_per_second': 1.224, 'train_loss': 0.3328577863130626, 'epoch': 3.0}





TrainOutput(global_step=13659, training_loss=0.3328577863130626, metrics={'train_runtime': 11162.3307, 'train_samples_per_second': 19.578, 'train_steps_per_second': 1.224, 'total_flos': 1.0155755943493632e+16, 'train_loss': 0.3328577863130626, 'epoch': 3.0})

## Evaluate the model on a sample sentence

In [None]:
# Example input text
text = "I has an bad grammar in this sentense."
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in inference mode so dropout cannot perturb the output.
model.eval()
# Use the same 128-token limit as in preprocessing.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
# Give generation an explicit length budget so longer corrections are not cut
# short by whatever default length limit the generation config carries.
outputs = model.generate(**inputs, max_new_tokens=128)
corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Corrected Sentence:", corrected_sentence)

Corrected Sentence: I have an bad grammar in this sentence.
