# Grammar Fix Bot

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import torch
import sqlite3

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Report up front whether a CUDA-capable GPU is visible to PyTorch,
# or whether training will fall back to the CPU.
print(
    "CUDA is available. GPU will be used."
    if torch.cuda.is_available()
    else "CUDA is not available. Training will use CPU."
)

CUDA is available. GPU will be used.


In [None]:
# Print the full interpreter version string so the Python build this
# notebook was run with is recorded alongside its outputs.
import sys
print(sys.version)

3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]


## Data preparation

### Read scraped sentences

In [None]:
# Load every row of the `sentences` table from the local SQLite database.
db_path = 'parser/sentences.db'
connection = sqlite3.connect(db_path)
try:
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM sentences;")
    sentences = cursor.fetchall()
finally:
    # Always release the database file, even if the query fails; otherwise the
    # handle stays open for the lifetime of the kernel.
    connection.close()

# Name the four columns returned by the query, then keep only the sentence
# pair: 'id' and 'category' are dropped here.
df = (
    pd.DataFrame(sentences, columns=['id', 'Original', 'Altered', 'category'])
    .drop(['id', 'category'], axis=1)
)

In [None]:
# Relabel the sentence pair and put the columns in (input, target) order:
# 'Altered' becomes 'input' and 'Original' becomes 'target'.
df = df.rename(columns={'Original': 'target', 'Altered': 'input'})[['input', 'target']]
df.head()

Unnamed: 0,input,target
0,"You may copy it, give it away or re-use it und...","You may copy it, give it away or re-use it und..."
1,"Down the Rabbit-Hole CHAPETR II, The Pool of ...","Down the Rabbit-Hole CHAPTER II,The Pool of T..."
2,"In another moment down went Alice after it, ne...","In another moment down went Alice after it, ne..."
3,After a time she herd a little pattering of fe...,After a time she heard a little pattering of f...
4,I wish I hadn't cried so much!,I wish I hadn’t cried so much!


## Data preprocessing

In [None]:
# Wrap the pandas DataFrame in a Hugging Face `datasets.Dataset` so it can be
# processed with `Dataset.map` and split later on.
dataset = Dataset.from_pandas(df)

In [None]:
# Load the tokenizer and model
# Both are loaded from the same `model_path`; AutoModelForSeq2SeqLM gives a
# sequence-to-sequence (encoder-decoder) language-model head.
model_path = "grammar-synthesis-small"  # Path from local folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Preprocess the data
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        examples: Batch mapping with "input" and "target" keys, each holding
            a list of strings.

    Returns:
        The tokenizer output for the "input" texts, extended with a "labels"
        entry containing the token ids of the "target" texts. Padding
        positions in the labels are set to -100.
    """
    inputs = tokenizer(examples["input"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target"], truncation=True, padding="max_length", max_length=128)

    # Targets are padded to max_length, so most label positions are padding.
    # Replace pad ids with -100, the index the loss ignores, so the model is
    # not trained (and scored) on predicting padding tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs

# Tokenize the whole dataset; with batched=True, preprocess_function receives
# a batch of examples (lists of strings per column) on each call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/80938 [00:00<?, ? examples/s]

Map: 100%|██████████| 80938/80938 [00:08<00:00, 9145.74 examples/s] 


In [None]:
# Split into training and validation sets (90% / 10%).
# The split is seeded so a fresh kernel run reproduces the same partition.
# The result is stored as `dataset_splits` so it does not rebind the name
# `train_test_split`, which the import cell binds to scikit-learn's function.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]