In [1]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

from datasets import load_metric, Dataset
metric = load_metric("rouge")
from bert_score import score

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [2]:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

MIN_ALLOWED_SEQUENCE = 30
MAX_ALLOWED_SEQUENCE = 1024
BATCH_SIZE = 1
ACCUMULATION_STEPS = 4
LEARNING_RATE = 1e-5
EPOCHS = 3
NUM_BEAMS = 4

data_collator = DataCollatorForSeq2Seq(tokenizer)

In [3]:
model.config.max_length = MAX_ALLOWED_SEQUENCE

In [4]:
def preprocess_text(text):
    text = text.lower()
    
    tokens = word_tokenize(text, language='english')
    
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

def preprocess_function(examples):
    references = ["summarize: " + ref for ref in examples["reference"]]
    
    inputs = tokenizer(references, truncation=True, max_length=MAX_ALLOWED_SEQUENCE)
    targets = tokenizer(examples["summary"], truncation=True, max_length=MAX_ALLOWED_SEQUENCE)

    # Update examples with tokenized inputs and targets
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}

In [5]:
train_df = pd.read_csv("train_processed.csv")
valid_df = pd.read_csv("validation_processed.csv")

train_df = train_df[train_df['reference_tokens'] < MAX_ALLOWED_SEQUENCE].reset_index(drop=True)
valid_df = valid_df[valid_df['reference_tokens'] < MAX_ALLOWED_SEQUENCE].reset_index(drop=True)

train_df["summary"] = train_df["summary"].apply(preprocess_text)
valid_df["summary"] = valid_df["summary"].apply(preprocess_text)

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

In [6]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [7]:
args = Seq2SeqTrainingArguments(
    output_dir="./my_fine_tuned_t5_base_model",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    gradient_accumulation_steps=ACCUMULATION_STEPS,
    eval_accumulation_steps=ACCUMULATION_STEPS,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,No log,4.143132,14.3659,6.7076,10.1439,13.7787,64.5714
1,No log,3.494305,18.5535,7.2672,12.8819,17.4425,225.1429
2,No log,3.405851,20.3518,8.0149,13.7689,19.072,236.5714


Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}


TrainOutput(global_step=162, training_loss=5.149669505931713, metrics={'train_runtime': 5789.698, 'train_samples_per_second': 0.113, 'train_steps_per_second': 0.028, 'total_flos': 502460432593920.0, 'train_loss': 5.149669505931713, 'epoch': 2.958904109589041})

In [9]:
test_df = pd.read_csv("test_processed.csv")
test_df = test_df[test_df['reference_tokens'] < MAX_ALLOWED_SEQUENCE].reset_index(drop=True)

In [10]:
device = torch.device("cuda")
model_trained = AutoModelForSeq2SeqLM.from_pretrained("./my_fine_tuned_t5_base_model/checkpoint-162").to(device)

for index, row in test_df.iterrows():
    inputs = tokenizer("summarize: " + row["reference"], max_length=MAX_ALLOWED_SEQUENCE, truncation=True, return_tensors="pt").to(device)
    outputs = model_trained.generate(**inputs, min_length=MIN_ALLOWED_SEQUENCE, max_length=MAX_ALLOWED_SEQUENCE,\
                                     num_beams=NUM_BEAMS, early_stopping=True)
    result_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(result_summary)

    P, R, F1 = score([result_summary], [row["summary"]], lang='en', verbose=False)
    print(f"T5 BertScore F1: {F1.item():.2f}")
    torch.cuda.empty_cache()

the strategic innovation agenda of the european institute of innovation and technology for the period from 2021 to 2027 ( sia 2021-2027) shall be implemented in accordance with regulation ( eu ) 2021/819. article 3 decision no 1312/2013/eu is repealed with effect from 1 january 2021.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5 BertScore F1: 0.82
the commission shall conduct an impact assessment on the continued issuance of 1- and 2-cent coin. the ceiling may be raised to 2,0 % of the cumulated total net number of 2-euro coin put into circulation by all member state whose currency is the euro up to the beginning of the year preceding the year of issuance of the commemorative coin. the identity of the issuing member state shall be clearly and easily recognisable on the coin.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5 BertScore F1: 0.82


In [11]:
del model_trained
device = torch.device("cuda")
model_untrained = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-large").to(device)

for index, row in test_df.iterrows():
    inputs = tokenizer("summarize: " + row["reference"], max_length=MAX_ALLOWED_SEQUENCE, truncation=True, return_tensors="pt").to(device)
    outputs = model_untrained.generate(**inputs, min_length=MIN_ALLOWED_SEQUENCE, max_length=MAX_ALLOWED_SEQUENCE,\
                                       num_beams=NUM_BEAMS, early_stopping=True)
    result_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(result_summary)

    P, R, F1 = score([result_summary], [row["summary"]], lang='en', verbose=False)
    print(f"T5 BertScore F1: {F1.item():.2f}")
    torch.cuda.empty_cache()

sia 2021-2027 shall be implemented in accordance with regulation ( eu ) 2021/819. decision no 1312/2013/eu is repealed with effect from 1 january 2021.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5 BertScore F1: 0.79
circulation coin means euro coin intended for circulation. commemorative coin means circulation coin intended to commemorate a specific subject. collector coin means euro coin intended for collection that are not issued with a view to their entry into circulation. member state may issue two commemorative coin per year.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5 BertScore F1: 0.81
