In [1]:
from datasets import load_dataset

# Load the JFLEG dataset
dataset = load_dataset("jfleg")
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"grammar: {text}" for text in examples["sentence"]]
    targets = [corrections[0] for corrections in examples["corrections"]]  
    return {"input_text": inputs, "target_text": targets}

tokenized_dataset = dataset.map(preprocess_function, batched=True)


model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=512, truncation=True, padding="max_length")
    
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]  

    return model_inputs



tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset["validation"][0])


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


{'sentence': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'corrections': ['So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'So I think we could not live if older people did not develop science and technologies . ', 'So I think we can not live if old people could not find science and technologies and they did not develop . ', 'So I think we can not live if old people can not find the science and technology that has not been developed . '], 'input_text': 'grammar: So I think we can not live if old people could not find siences and tecnologies and they did not developped . ', 'target_text': 'So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'input_ids': [19519, 10, 264, 27, 317, 62, 54, 59, 619, 3, 99, 625, 151, 228, 59, 253, 108, 1433, 7, 11, 3, 5822, 29, 4137, 7, 11, 79, 410, 59, 1344, 3138, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

In [3]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./writing-coach-t5",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
)



In [4]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["validation"],  
    eval_dataset=tokenized_dataset["test"],         
    tokenizer=tokenizer
)

  trainer = Trainer(


In [5]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.832429
2,No log,0.773596
3,No log,0.759046


TrainOutput(global_step=285, training_loss=1.1567524157072369, metrics={'train_runtime': 6412.4547, 'train_samples_per_second': 0.353, 'train_steps_per_second': 0.044, 'total_flos': 306549180334080.0, 'train_loss': 1.1567524157072369, 'epoch': 3.0})

In [8]:
model.save_pretrained("./t5-grammar-correction")
tokenizer.save_pretrained("./t5-grammar-correction")

('./t5-grammar-correction\\tokenizer_config.json',
 './t5-grammar-correction\\special_tokens_map.json',
 './t5-grammar-correction\\spiece.model',
 './t5-grammar-correction\\added_tokens.json')

In [9]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.7590463161468506, 'eval_runtime': 485.4272, 'eval_samples_per_second': 1.541, 'eval_steps_per_second': 0.194, 'epoch': 3.0}


In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


model_path = "./t5-grammar-correction"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)



In [17]:
def correct_grammar(text):
    try:
        model.eval()  
        input_text = "grammar: I has an apple"
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True
        )
        
        print("Decoded Input:", tokenizer.decode(inputs["input_ids"][0]))  # Debug

        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=100,  
            num_beams=5,
            num_return_sequences=1,
            early_stopping=True
        )
        
        if outputs is None or len(outputs) == 0:
            return "No correction generated."

        corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return corrected_text

    except Exception as e:
        print(f"Error: {e}")
        return text  


In [18]:
print(correct_grammar("I has a apple.")) 

Decoded Input: grammar: I has an apple</s>
I have an apple
