In [1]:
#!pip install accelerate -U 
#  Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`
#!pip install transformers -U
#!pip install evaluate
#!pip install sacrebleu

In [1]:
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer
import datasets
from datasets import Dataset
from datasets import DatasetDict
import torch
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import accelerate
import os

## Load data into dictionary

In [4]:
chunk_length = 50
data = datasets.load_from_disk(r'JCLS2025_submission\gutenberg_subset\doc_60_chunk_'+str(chunk_length)+'.hf')
data

DatasetDict({
    train: Dataset({
        features: ['orig', 'shuffled'],
        num_rows: 14258
    })
    test: Dataset({
        features: ['orig', 'shuffled'],
        num_rows: 4753
    })
    valid: Dataset({
        features: ['orig', 'shuffled'],
        num_rows: 4753
    })
})

# GPU

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
#!nvidia-smi

## Tokenizer (T5)

In [4]:
checkpoint = "t5-large"
def load_tokenizer(checkpoint, model_max_length=512):
    # set scope to global to access from anywhere
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, fn_kwargs = {"model_max_length": model_max_length})
    return tokenizer

tokenizer = load_tokenizer(checkpoint, model_max_length=512)



### Create Preprocessing function to tokenize original and transformed data



In [5]:
def preprocess_function(examples, tokenizer, source_split="shuffled", target_split="orig", prefix="", max_length=512):
    sources = [prefix + example for example in examples[source_split]]
    targets = [example for example in examples[target_split]]
    model_inputs = tokenizer(sources, text_target=targets, max_length=max_length, truncation=True)
    return model_inputs

Dorp columns that are not the current randomization rate

In [6]:
def tokenize_data(data, tokenizer, source_split, target_split, prefix, max_length):
    tokenized_train_data = data["train"].map(preprocess_function, batched=True,
                                             fn_kwargs={"tokenizer": tokenizer, "source_split": source_split,
                                                        "target_split": target_split, "prefix": prefix, "max_length": max_length})
    tokenized_test_data = data["test"].map(preprocess_function, batched=True,
                                           fn_kwargs={"tokenizer": tokenizer, "source_split": source_split,
                                                        "target_split": target_split, "prefix": prefix, "max_length": max_length})
    tokenized_val_data = data["valid"].map(preprocess_function, batched=True,
                                                fn_kwargs={"tokenizer": tokenizer, "source_split": source_split,
                                                        "target_split": target_split, "prefix": prefix, "max_length": max_length})

    return tokenized_train_data, tokenized_test_data, tokenized_val_data
    
source_split = 'shuffled'
target_split = 'orig'
prefix = ''
max_length = 512
tokenized_train_data, tokenized_test_data, tokenized_val_data = tokenize_data(data, tokenizer, source_split, target_split, prefix, max_length)

Map:   0%|          | 0/9013 [00:00<?, ? examples/s]

Map:   0%|          | 0/1127 [00:00<?, ? examples/s]

Map:   0%|          | 0/1127 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [8]:
def load_metrics():
    metric1 = evaluate.load("sacrebleu")
    metric2 = evaluate.load("wer")
    metric3 = evaluate.load("rouge")
    return metric1, metric2, metric3

In [9]:
def postprocess_text_bleu(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def postprocess_text_wer(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(eval_preds):

    preds, labels = eval_preds
    #print("predictions before: ", len(preds))
    #print("predictions before [0]: ", len(preds[0]))
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    #print("predictions decoded: ", len(decoded_preds))
    #print("predictions decoded [0]: ", len(decoded_preds[0]))

    #print("labels_before: ", labels[0])
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    #print("pad_token_id: ", tokenizer.pad_token_id)
    #print("labels: ", labels[0])
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #print("decoded_labels: ", decoded_labels[0])

    decoded_preds_bleu, decoded_labels_bleu = postprocess_text_bleu(decoded_preds, decoded_labels)
    decoded_preds_wer, decoded_labels_wer = postprocess_text_wer(decoded_preds, decoded_labels)

    #print("Len decoded preds: ", len(decoded_preds_wer))
    #print("Len decoded preds [0]: ", len(decoded_preds_wer[0]))
    #print("Decoded preds: ", decoded_preds_wer[0])
    #print("Decoded preds_next: ", decoded_preds_wer[1])
    #print("Decoded labels: ", decoded_labels_wer[0])

    # load metrics
    metric1, metric2, metric3 = load_metrics()

    result1 = metric1.compute(predictions=decoded_preds_bleu, references=decoded_labels_bleu)
    result2 = metric2.compute(predictions=decoded_preds_wer, references=decoded_labels_wer)
    result3 = metric3.compute(predictions=decoded_preds_wer, references=decoded_labels_wer)
    result = {"bleu": result1["score"], "wer": result2, "rougeL": result3["rougeL"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

# Train

Load the model

In [10]:
#import torch.nn as nn
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
#model= nn.DataParallel(model)
#model.to(device)

In [None]:
abspath = os.getcwd()
relpath = "data"
path = os.path.normpath(os.path.join(abspath, relpath))
relpath = "log_new"
logging_path = os.path.normpath(os.path.join(abspath, relpath))
print(logging_path)
save_path = os.path.join(abspath, os.path.join("models", "T5Large_doc_60_"+str(chunk_length)))
print(save_path)

In [12]:
with torch.no_grad():

    training_args = Seq2SeqTrainingArguments(
        output_dir=save_path,
        logging_steps=500,
        logging_dir=logging_path,
        evaluation_strategy="steps",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        auto_find_batch_size = True,  # autoset batch size to avoid memory issues with t5_large
        weight_decay=0.01,
        save_total_limit=3,  # decides, how many checkpoints will be kept at the end
        #save_strategy="steps",  # needs to be same as eval strategy in order to load best model
        #save_steps=500,  # needs to be a round multiple of eval_steps (logging_steps)
        load_best_model_at_end=True,  # keeps best model in the trainer
        num_train_epochs=3,
        predict_with_generate=True,
        fp16=True,
        push_to_hub=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_data,
        eval_dataset=tokenized_test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,  # call prepare compute metrics function to pass custom args
    )

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
trainer.train()



Step,Training Loss,Validation Loss,Bleu,Wer,Rougel,Gen Len
500,3.3945,2.794955,0.0,0.9837,0.0299,19.0
1000,2.9289,2.661429,0.0,0.9835,0.0316,19.0
1500,2.8332,2.590734,0.0,0.9832,0.032,19.0
2000,2.7844,2.54535,0.0,0.983,0.0319,19.0
2500,2.7165,2.515758,0.0,0.9828,0.0332,19.0
3000,2.6921,2.493007,0.0,0.9825,0.0342,19.0
3500,2.6652,2.476178,0.0,0.9822,0.0347,19.0
4000,2.6461,2.460471,0.0,0.9821,0.0351,19.0
4500,2.6351,2.449763,0.0,0.9823,0.0343,19.0
5000,2.6127,2.441545,0.0,0.9825,0.034,19.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=6762, training_loss=2.739321438175304, metrics={'train_runtime': 8395.4356, 'train_samples_per_second': 3.221, 'train_steps_per_second': 0.805, 'total_flos': 5.8540713836544e+16, 'train_loss': 2.739321438175304, 'epoch': 3.0})

# Inference

In [None]:
from transformers import pipeline
from random import shuffle
model_path = r'models\T5Large_doc_60_50'
print(model_path)
max_length = 512

In [14]:
#text = data["valid"]['shuffled'][11]
text = "water from the well they had brought up on the ground to keep down the steaming dust, the rays of the setting sun coming down over the treetops, taking a fan they handed me, tasting their pickles that were as salty as salt, and looking at a few of the young women, talking to the men.".split()
shuffle(text)
text = ' '.join(text)
text

'the a the of taking the a dust, to of talking on down had brought setting salt, salty at fan pickles rays few that the me, the tasting steaming from ground the well were young as to up their handed coming keep the treetops, looking over down women, as sun the water they they men. and'

In [8]:
translator = pipeline("translation", model=model_path, max_length=512)

In [15]:
translator(text)

[{'translation_text': 'a few of the young men were coming down from the treetops to the ground, looking over the steaming water and taking a few salty pickles from the fan, that they had brought to me to keep the dust down, as they were talking, the sun setting on the salty'}]

In [19]:
data["valid"]["orig"][11]

"them by the extreme tip of their steel - bound scabbards , he held them out towards the Frenchman . Chauvelin 's eyes were fixed upon him , and he from his towering height was looking down at the little sable - clad figure before him . The Terrorist seemed uncertain what to do . Though he was one of those men whom by the force of their intellect , the strength of their enthusiasm , the power of their cruelty , had built a new anarchical France , had overturned a throne and murdered a king , yet now"

# Inference for the whole validation Dataset

We select a subset of 5000 datapoints for inference, to cut down on computation time

In [18]:
translator = pipeline("translation", model=model_path, max_length=max_length)

In [19]:
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

In [None]:
predictions = []
x = 0
while x < len(val_data):
    out = translator(val_data[x])[0]['translation_text']
    predictions.append((source_data[x], out))
    output_df = pd.DataFrame(predictions, columns = ['orig', 'translated'])
    output_df.to_csv(r'JCLS2025_submission\gutenberg_subset\inference_results\translated_T5Large_doc_60_100.csv', sep='\t', index=False)
    print(x)
    x+=1