In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Hugging Face Hub identifier of the grammatical-error-correction checkpoint.
model_path = "Buntan/gec-t5-v1_1-small"

# Both objects are loaded once at module level and reused by every call below.
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)


def correct_text(input_text: str, max_length: int = 128, num_beams: int = 5) -> str:
    """
    Corrects the input text using the GEC model.

    Args:
        input_text (str): The input text to be corrected.
        max_length (int): Maximum number of tokens kept from the input
            (longer inputs are truncated) and the length limit passed to
            generation. Defaults to 128.
        num_beams (int): Number of beams used for beam search. Defaults to 5.

    Returns:
        corrected_text (str): The corrected text. Empty or whitespace-only
            input is returned unchanged without running the model.
    """
    # Nothing to correct: skip tokenization and generation entirely.
    if not input_text.strip():
        return input_text

    # Call the tokenizer directly (instead of `encode`) so the attention mask
    # is returned alongside the token ids. A single sentence needs no padding,
    # so the encoder only processes the real tokens.
    encoded = tokenizer(
        input_text,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

    # Pass the attention mask explicitly rather than relying on `generate`
    # to infer it from pad tokens.
    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded.
    corrected_text = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    return corrected_text

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Examples from BEA 2019 (not in the training data)

input_texts = [
    "My favourite sport is volleyball because I love plays with my friends.",
    "I bornt to be a football player.",
    "The wall of my bedroom are white and the floor is dark grey.",
    "I like a many food and drink.",
]

# Show each original sentence followed by the model's correction,
# separated by a blank line.
for input_text in input_texts:
    print(f"Input Text: {input_text}")
    correction = correct_text(input_text)
    print(f"Correction: {correction}")
    print()

Input Text: My favourite sport is volleyball because I love plays with my friends.
Correction: My favourite sport is volleyball because I love playing with my friends.

Input Text: I bornt to be a football player.
Correction: I born to be a football player.

Input Text: The wall of my bedroom are white and the floor is dark grey.
Correction: The wall of my bedroom is white and the floor is dark grey.

Input Text: I like a many food and drink.
Correction: I like a lot of food and drink.

