In [None]:
!pip install sacrebleu errant

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
from datasets import Dataset
import numpy as np
import torch

M2_EXTENSION = ".gold.bea19.m2"
MODELNAME = "t5-small"
PREFIX = "grammar: "

In [2]:
tokenizer = T5Tokenizer.from_pretrained(MODELNAME)
model = T5ForConditionalGeneration.from_pretrained(MODELNAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Quick test:

In [3]:
def test():
    text = "he go to hospital yesterday."
    input_text = PREFIX + text
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", max_length=128, truncation=True
    )

    output = model.generate(
        input_ids,
        max_length=128,
        num_beams=5,
        early_stopping=True,
        repetition_penalty=2.5,
    )

    corrected_text = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"Original: {text}")
    print(f"Corrected: {corrected_text}")
test()

Original: he go to hospital yesterday.
Corrected: grammar: he go to hospital yesterday.


Load dataset

In [4]:
def parse_m2_file(file_path):
    sentences = []
    edits = []
    current_sentence = ""
    current_edits = []

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("S "):
                    if current_sentence:
                        sentences.append(current_sentence)
                        edits.append(current_edits)
                    current_sentence = line[2:]
                    current_edits = []
                elif line.startswith("A "):
                    current_edits.append(line[2:])

        if current_sentence:
            sentences.append(current_sentence)
            edits.append(current_edits)

    except Exception as e:
        print(f"Error parsing {file_path}: {e}")

    return sentences, edits


def load_m2_files(keys):
    df = pd.DataFrame(columns=["text", "edits", "level", "type"])
    for key in keys:
        m2_path = f"./{key}{M2_EXTENSION}"
        sentences, edits = parse_m2_file(m2_path)
        temp_df = pd.DataFrame({"text": sentences, "edits": edits})
        temp_df["level"] = key.split(".")[0]
        temp_df["type"] = key.split(".")[1]
        df = pd.concat([df, temp_df], ignore_index=True)

    return df.reset_index().rename(columns={"index": "id"})

keys = ["A.train", "A.dev"]
dataset_df = load_m2_files(keys)

In [5]:
dataset_df

Unnamed: 0,id,text,edits,level,type
0,0,My town is a medium size city with eighty thou...,[5 6|||R:OTHER|||- sized|||REQUIRED|||-NONE-|||0],A,train
1,1,It has a high density population because its s...,"[4 4|||M:PUNCT|||-|||REQUIRED|||-NONE-|||0, 7 ...",A,train
2,2,"Despite of it is an industrial city , there ar...",[0 1|||R:PREP|||Although|||REQUIRED|||-NONE-||...,A,train
3,3,I recommend visiting the artificial lake in th...,[8 9|||R:SPELL|||center|||REQUIRED|||-NONE-|||0],A,train
4,4,Pasteries are very common and most of them off...,[0 1|||UNK|||Pasteries|||REQUIRED|||-NONE-|||0...,A,train
...,...,...,...,...,...
11525,11525,Susan is a little dragon .,[-1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0],A,dev
11526,11526,"Her skin is colored red and green , red dots o...","[7 8|||R:PUNCT|||;|||REQUIRED|||-NONE-|||0, 10...",A,dev
11527,11527,She does that every day after school .,[-1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0],A,dev
11528,11528,"Of course , she has also a little brother .",[4 6|||R:WO|||also has|||REQUIRED|||-NONE-|||0],A,dev


Get corrected texts:

In [6]:
def get_corrected_texts_from_m2(dataset_df: pd.DataFrame):
    corrected_texts = []

    for _, row in dataset_df.iterrows():
        text = row["text"]
        edits = row["edits"]

        if not edits:
            corrected_texts.append(text)
            continue

        text_tokens = text.split()
        offset = 0

        for edit in edits:
            parts = edit.split("|||")
            if len(parts) < 3:
                continue

            span = parts[0].strip()
            correction = parts[2].strip()

            try:
                start, end = map(int, span.split())
                start += offset
                end += offset

                correction_tokens = correction.split() if correction != "-NONE-" else []
                text_tokens[start:end] = correction_tokens

                offset += len(correction_tokens) - (end - start)

            except ValueError:
                continue

        corrected_text = " ".join(text_tokens)
        corrected_texts.append(corrected_text)

    return corrected_texts
corrected_texts = get_corrected_texts_from_m2(dataset_df)

In [7]:
dataset_df["corrected_text"] = corrected_texts

In [8]:
print(dataset_df[["text", "corrected_text"]].iloc[0].to_list())

['My town is a medium size city with eighty thousand inhabitants .', 'My town is a medium - sized city with eighty thousand inhabitants .']


In [9]:
dataset_df

Unnamed: 0,id,text,edits,level,type,corrected_text
0,0,My town is a medium size city with eighty thou...,[5 6|||R:OTHER|||- sized|||REQUIRED|||-NONE-|||0],A,train,My town is a medium - sized city with eighty t...
1,1,It has a high density population because its s...,"[4 4|||M:PUNCT|||-|||REQUIRED|||-NONE-|||0, 7 ...",A,train,It has a high - density population because of ...
2,2,"Despite of it is an industrial city , there ar...",[0 1|||R:PREP|||Although|||REQUIRED|||-NONE-||...,A,train,"Although it is an industrial city , there are ..."
3,3,I recommend visiting the artificial lake in th...,[8 9|||R:SPELL|||center|||REQUIRED|||-NONE-|||0],A,train,I recommend visiting the artificial lake in th...
4,4,Pasteries are very common and most of them off...,[0 1|||UNK|||Pasteries|||REQUIRED|||-NONE-|||0...,A,train,Pasteries are very common and most of them off...
...,...,...,...,...,...,...
11525,11525,Susan is a little dragon .,[-1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0],A,dev,Susan is a little dragon .
11526,11526,"Her skin is colored red and green , red dots o...","[7 8|||R:PUNCT|||;|||REQUIRED|||-NONE-|||0, 10...",A,dev,Her skin is colored red and green ; red dots o...
11527,11527,She does that every day after school .,[-1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0],A,dev,She does that every day after school .
11528,11528,"Of course , she has also a little brother .",[4 6|||R:WO|||also has|||REQUIRED|||-NONE-|||0],A,dev,"Of course , she also has a little brother ."


In [10]:
train_dataset = Dataset.from_pandas(dataset_df[dataset_df["type"] == "train"][["id", "text", "corrected_text"]])
val_dataset = Dataset.from_pandas(dataset_df[dataset_df["type"] == "dev"][["id", "text", "corrected_text"]])

In [11]:
len(train_dataset), len(val_dataset)

(10493, 1037)

In [12]:
inputs_len = [len(tokenizer.encode("grammar: " + x["text"])) for x in train_dataset]
output_len = [len(tokenizer.encode(x["corrected_text"])) for x in train_dataset]

In [13]:
np.mean(inputs_len), np.mean(output_len), max(inputs_len), max(output_len)

(np.float64(24.501858381778327), np.float64(22.96206995139617), 248, 273)

In [14]:
len([len for len in inputs_len if len > 64])

235

In [15]:
max_length = 64

In [16]:
def preprocess(example):
    input_text = PREFIX + example["text"]
    target_text = example["corrected_text"]
    model_inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=max_length)
    labels = tokenizer(target_text, truncation=True, padding="max_length", max_length=max_length)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess, batched=False)
tokenized_val = val_dataset.map(preprocess, batched=False)

Map:   0%|          | 0/10493 [00:00<?, ? examples/s]

Map:   0%|          | 0/1037 [00:00<?, ? examples/s]

In [17]:
first_sample = tokenized_train[0]

decoded_input = tokenizer.decode(first_sample["input_ids"], skip_special_tokens=True)
decoded_target = tokenizer.decode(first_sample["labels"], skip_special_tokens=True)

print("Original Text:", first_sample["text"])
print("Corrected Text:", first_sample["corrected_text"])
print("Decoded Input:", decoded_input)
print("Decoded Target:", decoded_target)

Original Text: My town is a medium size city with eighty thousand inhabitants .
Corrected Text: My town is a medium - sized city with eighty thousand inhabitants .
Decoded Input: grammar: My town is a medium size city with eighty thousand inhabitants.
Decoded Target: My town is a medium - sized city with eighty thousand inhabitants.


In [35]:
import nltk

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Tokenize
    decoded_preds = [p.split() for p in decoded_preds]
    decoded_labels = [l.split() for l in decoded_labels]

    # Sentence-level GLEU
    scores = [nltk.translate.gleu_score.sentence_gleu([ref], hyp)
              for hyp, ref in zip(decoded_preds, decoded_labels)]
    return {"gleu": 100 * sum(scores) / len(scores)}

In [36]:
output_dir="./t5-grammar-corrector-3"
eval_strategy ="epoch"
learning_rate=3e-5
per_device_train_batch_size=32
per_device_eval_batch_size=32
weight_decay=0.0
save_total_limit=1
num_train_epochs=20
predict_with_generate=True
fp16=True if torch.cuda.is_available() else False
report_to="none"
label_smoothing_factor=0.1

In [37]:
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    eval_strategy=eval_strategy,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    weight_decay=weight_decay,
    save_total_limit=save_total_limit,
    num_train_epochs=num_train_epochs,
    predict_with_generate=predict_with_generate,
    fp16=fp16,
    report_to=report_to
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Gleu
1,No log,0.175943,61.330541
2,0.194600,0.165883,62.140772
3,0.194600,0.160072,62.52147
4,0.176400,0.155875,63.195736
5,0.164100,0.1538,63.245537
6,0.164100,0.152291,63.463405
7,0.155800,0.151774,63.562136
8,0.150300,0.14985,63.755806
9,0.150300,0.14907,63.678495
10,0.141800,0.148985,63.879484


('./t5-grammar-corrector-3/tokenizer_config.json',
 './t5-grammar-corrector-3/special_tokens_map.json',
 './t5-grammar-corrector-3/spiece.model',
 './t5-grammar-corrector-3/added_tokens.json')

In [38]:
eval_results = trainer.evaluate()

In [39]:
model = T5ForConditionalGeneration.from_pretrained(output_dir)
tokenizer = T5Tokenizer.from_pretrained(output_dir)

In [43]:
def correct_grammar(text: str):
    input_text = f"grammar: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

    input_ids = input_ids.to(model.device)

    output_ids = model.generate(input_ids, max_length=256, num_beams=4)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

test_sentence = "The patient report severe dyspnea and bilateral lower extremity edema following administration of intravenous furosemide."
corrected = correct_grammar(test_sentence)
corrected

'The patient reported severe dyspnea and bilateral lower extremity edema following administration of intravenous furosemide.'

Evaluation

In [2]:
from evaluator import Evaluator

In [3]:
output_dir="./t5-grammar-corrector-3"

In [4]:
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
model_name = output_dir
evaluator = Evaluator(model_name, "data.csv")
evaluator.run_evaluation()
if "./" in model_name:
  model_name = model_name.split("./")[1]
evaluator.dataset.to_csv(
    f"./evaluation_results_{model_name}_{timestamp}.csv", index=False
)

Using GPU for T5 model
Evaluating row 1/204
Evaluating row 2/204
Evaluating row 3/204
Evaluating row 4/204
Evaluating row 5/204
Evaluating row 6/204
Evaluating row 7/204
Evaluating row 8/204
Evaluating row 9/204
Evaluating row 10/204
Evaluating row 11/204
Evaluating row 12/204
Evaluating row 13/204
Evaluating row 14/204
Evaluating row 15/204
Evaluating row 16/204
Evaluating row 17/204
Evaluating row 18/204
Evaluating row 19/204
Evaluating row 20/204
Evaluating row 21/204
Evaluating row 22/204
Evaluating row 23/204
Evaluating row 24/204
Evaluating row 25/204
Evaluating row 26/204
Evaluating row 27/204
Evaluating row 28/204
Evaluating row 29/204
Evaluating row 30/204
Evaluating row 31/204
Evaluating row 32/204
Evaluating row 33/204
Evaluating row 34/204
Evaluating row 35/204
Evaluating row 36/204
Evaluating row 37/204
Evaluating row 38/204
Evaluating row 39/204
Evaluating row 40/204
Evaluating row 41/204
Evaluating row 42/204
Evaluating row 43/204
Evaluating row 44/204
Evaluating row 45/