In [None]:
from datasets import load_dataset

dataset = load_dataset("jfleg")
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

def preprocess_function(examples):
    inputs = [f"grammar: {text}" for text in examples["sentence"]]
    targets = [corrections[0] for corrections in examples["corrections"]]  
    return {"input_text": inputs, "target_text": targets}

tokenized_dataset = dataset.map(preprocess_function, batched=True)


model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=512, truncation=True, padding="max_length")
    
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]  

    return model_inputs



tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset["validation"][0])

{'sentence': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'corrections': ['So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'So I think we could not live if older people did not develop science and technologies . ', 'So I think we can not live if old people could not find science and technologies and they did not develop . ', 'So I think we can not live if old people can not find the science and technology that has not been developed . '], 'input_text': 'grammar: So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'target_text': 'So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'input_ids': [19519, 10, 264, 27, 317, 62, 54, 59, 619, 3, 99, 625, 151, 228, 59, 253, 108, 1433, 7, 11, 3, 5822, 29, 4137, 7, 11, 79, 410, 59, 1344, 3138, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

In [1]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./writing-coach-t5",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
)

In [4]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["validation"],  
    eval_dataset=tokenized_dataset["test"],         
    tokenizer=tokenizer
)

  trainer = Trainer(


In [5]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.832429
2,No log,0.773596
3,No log,0.759046


TrainOutput(global_step=285, training_loss=1.1567524157072369, metrics={'train_runtime': 6412.4547, 'train_samples_per_second': 0.353, 'train_steps_per_second': 0.044, 'total_flos': 306549180334080.0, 'train_loss': 1.1567524157072369, 'epoch': 3.0})

In [8]:
model.save_pretrained("./t5-grammar-correction")
tokenizer.save_pretrained("./t5-grammar-correction")

('./t5-grammar-correction\\tokenizer_config.json',
 './t5-grammar-correction\\special_tokens_map.json',
 './t5-grammar-correction\\spiece.model',
 './t5-grammar-correction\\added_tokens.json')

In [9]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.7590463161468506, 'eval_runtime': 485.4272, 'eval_samples_per_second': 1.541, 'eval_steps_per_second': 0.194, 'epoch': 3.0}


In [32]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


model_path = "./t5-grammar-correction"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)



In [7]:
def correct_grammar(text):
    try:
        model.eval()  
        input_text = f"grammar :{text}"
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True
        )
        
        print("Decoded Input:", tokenizer.decode(inputs["input_ids"][0]))  # Debug

        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=100,  
            num_beams=5,
            num_return_sequences=1,
            early_stopping=True
        )
        
        if outputs is None or len(outputs) == 0:
            return "No correction generated."

        corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return corrected_text

    except Exception as e:
        print(f"Error: {e}")
        return text  


In [15]:
correct_grammar("He have a bike")

Decoded Input: grammar :He have a bike</s>


'He has a bike'

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "gpt2"  
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [29]:
from transformers import TrainingArguments
# Preprocess dataset
def preprocess_function(examples):
    inputs = [f"Correct the following sentences: {text}" for text in examples["sentence"]]
    targets = [corrections[0] for corrections in examples["corrections"]]  # Use the first correction
    return {"input_text": inputs, "target_text": targets}

tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(tokenized_dataset["validation"][0])  
def tokenize_function(examples):
    # Tokenize input text
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            max_length=512,
            truncation=True,
            padding="max_length"
        )
    
  
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset["validation"][0])  

Map:   0%|          | 0/755 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

{'sentence': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'corrections': ['So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'So I think we could not live if older people did not develop science and technologies . ', 'So I think we can not live if old people could not find science and technologies and they did not develop . ', 'So I think we can not live if old people can not find the science and technology that has not been developed . '], 'input_text': 'Correct the following sentences: So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'target_text': 'So I think we would not be alive if our ancestors did not develop sciences and technologies . '}


Map:   0%|          | 0/755 [00:00<?, ? examples/s]



Map:   0%|          | 0/748 [00:00<?, ? examples/s]

{'sentence': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'corrections': ['So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'So I think we could not live if older people did not develop science and technologies . ', 'So I think we can not live if old people could not find science and technologies and they did not develop . ', 'So I think we can not live if old people can not find the science and technology that has not been developed . '], 'input_text': 'Correct the following sentences: So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'target_text': 'So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'input_ids': [42779, 262, 1708, 13439, 25, 1406, 314, 892, 356, 460, 407, 2107, 611, 1468, 661, 714, 407, 1064, 264, 10035, 290, 573, 31522, 5823, 290, 484, 750, 407

In [44]:

training_args = TrainingArguments(
    output_dir="./gpt2-grammar-correction1",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=10,
    push_to_hub=False
)

In [113]:
from transformers import Trainer
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["validation"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

In [31]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.278,0.257194
2,0.2504,0.252608
3,0.2666,0.251805


TrainOutput(global_step=285, training_loss=0.35968790347116036, metrics={'train_runtime': 16212.574, 'train_samples_per_second': 0.14, 'train_steps_per_second': 0.018, 'total_flos': 591826452480000.0, 'train_loss': 0.35968790347116036, 'epoch': 3.0})

In [33]:
model.save_pretrained("./gpt2-grammar-correction")
tokenizer.save_pretrained("./gpt2-grammar-correction")

('./gpt2-grammar-correction\\tokenizer_config.json',
 './gpt2-grammar-correction\\special_tokens_map.json',
 './gpt2-grammar-correction\\vocab.json',
 './gpt2-grammar-correction\\merges.txt',
 './gpt2-grammar-correction\\added_tokens.json')

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_path = "./gpt2-grammar-correction"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token


In [3]:
def correct_grammar(text):
    input_text = f"Correct the sentence:{text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512,padding=True, truncation=True)
    leng = len(input_sentence)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=8,
        min_length=5,  
        num_beams=3,  
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id

    )
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    corrected_text = corrected_text.replace(input_text, "").strip()
    corrected_text = corrected_text.split(".")[0] + "."

    return corrected_text if corrected_text else "[No output generated]"


input_sentence = "We is not strong."
corrected_sentence = correct_grammar(input_sentence)
print(f"Input: {input_sentence}")
print(f"Corrected: {corrected_sentence}")

Input: We is not strong.
Corrected: We are not strong.


In [2]:

from transformers import T5Tokenizer, T5ForConditionalGeneration


model_path = "./t5-grammar-correction"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)


In [3]:
def preprocess_function(examples):
    inputs = [f"grammar: {text}" for text in examples["sentence"]]
    targets = [corrections[0] for corrections in examples["corrections"]]  
    return {"input_text": inputs, "target_text": targets}

tokenized_dataset = dataset.map(preprocess_function, batched=True)
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=512, truncation=True, padding="max_length")
    
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]  

    return model_inputs



tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset["validation"][0])

{'sentence': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'corrections': ['So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'So I think we could not live if older people did not develop science and technologies . ', 'So I think we can not live if old people could not find science and technologies and they did not develop . ', 'So I think we can not live if old people can not find the science and technology that has not been developed . '], 'input_text': 'grammar: So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'target_text': 'So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'input_ids': [19519, 10, 264, 27, 317, 62, 54, 59, 619, 3, 99, 625, 151, 228, 59, 253, 108, 1433, 7, 11, 3, 5822, 29, 4137, 7, 11, 79, 410, 59, 1344, 3138, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

In [4]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./writing-coach-t5",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
)

In [5]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["validation"],  
    eval_dataset=tokenized_dataset["test"],         
    tokenizer=tokenizer
)

  trainer = Trainer(


In [6]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.71137
2,No log,0.691392
3,No log,0.685224


TrainOutput(global_step=285, training_loss=0.9537321391858553, metrics={'train_runtime': 9459.154, 'train_samples_per_second': 0.239, 'train_steps_per_second': 0.03, 'total_flos': 306549180334080.0, 'train_loss': 0.9537321391858553, 'epoch': 3.0})