## **Pegasus Encoder Decoder**

Imports

In [None]:
from src.training_utils import *
import json
import torch
from torch.utils.data import Dataset

In [None]:
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

print(transformers.__version__)

Load the training and validation corpora from the .json files

In [None]:
# Load the training corpus. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's locale default encoding.
with open("src/data/PubMed/Train_ExtAbs_PUBMED.json", encoding="utf-8") as f:
    training_corpus = json.load(f)

In [None]:
# Load the validation corpus. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's locale default encoding.
with open("src/data/PubMed/Val_ExtAbs_PUBMED.json", encoding="utf-8") as f:
    validation_corpus = json.load(f)

Load tokenizer and model

In [None]:
# Checkpoint name shared by the tokenizer and the model loaded below.
# NOTE(review): the inline note mentions pegasus-x-base-finetuned-xsum; it is
# unclear whether that is a pending TODO — confirm.
model_checkpoint = "google/pegasus-x-base" # Use pegasus-x-base-finetuned-xsum
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(tokenizer)
# Sanity check: encode two sample sentences as targets and show the result.
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

# Load the sequence-to-sequence model from the same checkpoint and print it.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
print(model)

Create pre-processing function

In [None]:
class TokenizedDataset(Dataset):
    """Map-style dataset that tokenizes article/abstract pairs on access.

    Every corpus entry is indexed as ``corpus[index][0]`` and that element is
    read through its ``"article"`` and ``"abstract"`` keys.

    Args:
        corpus: Indexable, sized collection of entries as described above.
        tokenize_fn: Optional callable with the Hugging Face tokenizer call
            signature (``fn(text, max_length=..., truncation=...)`` and
            ``fn(text_target=..., ...)``). When omitted, the notebook-level
            ``tokenizer`` is used.
    """

    def __init__(self, corpus, tokenize_fn=None):
        self.corpus = corpus
        self.num_rows = len(corpus)
        self.tokenize_fn = tokenize_fn

    def __len__(self):
        """Return the number of entries in the corpus."""
        return self.num_rows

    def __getitem__(self, index):
        """Tokenize and return the example stored at ``index``."""
        example = self.corpus[index][0]
        return self.preprocess_function(example, tokenize_fn=self.tokenize_fn)

    @staticmethod
    def preprocess_function(example, max_input_length=512,
                            max_target_length=256, tokenize_fn=None):
        """Tokenize one example into model inputs and labels.

        Args:
            example: Mapping with ``"article"`` (source text) and
                ``"abstract"`` (target text).
            max_input_length: Truncation length for the article tokens.
            max_target_length: Truncation length for the abstract tokens.
            tokenize_fn: Tokenizer callable; defaults to the notebook-level
                ``tokenizer``.

        Returns:
            dict with the raw ``"article"`` and ``"abstract"`` plus
            ``"input_ids"`` and ``"attention_mask"`` taken from the article
            encoding and ``"labels"`` taken from the abstract encoding.
        """
        # Resolved lazily so the global is only required when nothing is injected.
        tok = tokenizer if tokenize_fn is None else tokenize_fn

        article = example["article"]
        # A plain string must be tokenized as a whole: iterating over it would
        # yield single characters and produce one encoding per character.
        # Any other iterable is forwarded as a list of texts.
        inputs = article if isinstance(article, str) else list(article)
        model_inputs = tok(inputs, max_length=max_input_length, truncation=True)

        # Targets are encoded separately through ``text_target``.
        labels = tok(text_target=example["abstract"],
                     max_length=max_target_length, truncation=True)

        # Source encoding feeds the encoder; target token ids are the labels.
        return {
            "article": example["article"],
            "abstract": example["abstract"],
            "input_ids": model_inputs["input_ids"],
            "attention_mask": model_inputs["attention_mask"],
            "labels": labels["input_ids"],
        }
    
def map_function(corpus):
    """Wrap ``corpus`` in a ``TokenizedDataset`` and return the wrapper."""
    return TokenizedDataset(corpus)

In [None]:
# max_input_length = 512
# max_target_length = 256

# class TokenizedDataset(Dataset):
#     def __init__(self,corpus):
#         self.corpus = corpus
#         self.features = features
        
#     def __len__(self):
#         return len(self.features)
    
#     def __getitem__(self, index):
#         item = preprocess_function(self.corpus[index][0])
#         texts = self.corpus[index][0]
        
#         features = {texts['article'],
#                     texts['abstract'],
#                     item['input_ids'], 
#                     item['attention_mask'],
#                     item['labels']}
#         return features
    

# def preprocess_function(examples):
#     inputs = [doc for doc in examples["article"]]
#     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

#     # Setup the tokenizer for targets
#     labels = tokenizer(text_target=examples["abstract"], max_length=max_target_length, truncation=True)
#     model_inputs["labels"] = labels["input_ids"]

#     return model_inputs

# def map_function(corpus):
    
#     tokenized_dataset = TokenizedDataset(corpus)

#     return tokenized_dataset

In [None]:
# Wrap both corpora with map_function and group the results by split name.
tokenized_dataset_train = map_function(training_corpus)
tokenized_dataset_val = map_function(validation_corpus)
tokenized_datasets = {"train":tokenized_dataset_train, "validation":tokenized_dataset_val}

In [None]:
# Smoke test: access the first item (the value itself is not used), then
# print the number of training examples.
tokenized_dataset_train[0]
print(len(tokenized_dataset_train))

In [None]:
from datasets import load_dataset

# Load the 'ccdv/pubmed-summarization' dataset and display its train split.
dataset = load_dataset('ccdv/pubmed-summarization')
dataset["train"]

In [None]:
# Apply preprocessing to the validation split in batches.
# NOTE(review): `preprocess_function` is not defined at module level in the
# visible cells (only as TokenizedDataset.preprocess_function and in
# commented-out code); presumably it comes from
# `from src.training_utils import *` — confirm, otherwise this raises NameError.
token = dataset["validation"].map(preprocess_function, batched=True)
token


Load the metric

In [None]:
from evaluate import load
# ROUGE metric object; compute_metrics calls metric.compute on decoded text.
metric = load("rouge")
print(metric)

## **TRAINING**

In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 8
# Last path component of the checkpoint name.
# NOTE(review): not referenced anywhere else in the visible cells.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints and logs.
    "Pegasus-finetuned",
    evaluation_strategy = "epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate = True,
    fp16=True,
    # NOTE(review): 512 accumulation steps on top of batch_size 8 gives a very
    # large effective batch per optimizer step — confirm this is intended.
    gradient_accumulation_steps=512,
    logging_steps=1,
    label_smoothing_factor = 0.1, 
    auto_find_batch_size = True,
)

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Decode predictions and references, then score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated ids.

    Returns:
        dict mapping each ROUGE key (scaled by 100) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction) to a value rounded to four
        decimals.

    Side effects:
        Logs the unscaled rouge1/rouge2/rougeL/rougeLsum values via
        ``wandb.log``.
        NOTE(review): ``wandb`` is not imported in the visible cells;
        presumably it comes from ``from src.training_utils import *`` — confirm.
    """
    predictions, labels = eval_pred
    # Keep only the generated ids when extra outputs are bundled in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded, so replace it
    # with the pad token id in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True, use_aggregator=True)

    # Log the raw (unscaled) scores before converting them to percentages.
    wandb.log({'rouge1': result['rouge1'], 'rouge2': result['rouge2'], 'rougeL': result['rougeL'], 'rougeLsum': result['rougeLsum']})

    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (pad positions, including former -100, excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Build the trainer from the model, training arguments, both dataset splits,
# the collator, the tokenizer and the ROUGE metric callback.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    
)
# Freeze the encoder: its parameters no longer require gradients.
for param in trainer.model.model.encoder.parameters():
    param.requires_grad = False

In [None]:
# Run fine-tuning, then finish the Weights & Biases run.
# NOTE(review): `wandb` is not imported in the visible cells; presumably it
# comes from `from src.training_utils import *` — confirm.
trainer.train()
wandb.finish()

In [None]:
# Path-like checkpoint name; presumably a local directory from an earlier
# fine-tuning run — confirm it exists before loading.
model_checkpoint = "checkpoint-14500-finetuned_alot/checkpoint-29500"

Checkpoints already fine-tuned on PubMed

In [None]:
# Overrides any previously assigned model_checkpoint value.
model_checkpoint = "google/pegasus-pubmed"

In [None]:
# Overrides any previously assigned model_checkpoint value.
model_checkpoint = "Kevincp560/pegasus-arxiv-finetuned-pubmed"

In [None]:
# Reload the model from whichever model_checkpoint value is currently set.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Inference demo: summarize one hard-coded paragraph with the loaded model.
# import pegasus
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# NOTE(review): the tokenizer is always loaded from "google/pegasus-pubmed",
# independent of the model_checkpoint used for the model — confirm they match.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-pubmed")


ARTICLE_TO_SUMMARIZE = "Researchers at a leading university have developed a groundbreaking technology that could revolutionize renewable energy generation. The new system, known as 'SolarWave,' harnesses the power of ocean waves to generate electricity. By utilizing a network of specialized buoys equipped with advanced turbines, the technology can convert the kinetic energy from the waves into clean, sustainable power. This innovation has the potential to significantly contribute to the global efforts in combating climate change and reducing our reliance on fossil fuels. It's an exciting development that could reshape the future of renewable energy."
# Encode the article as PyTorch tensors, truncated to at most 1024 tokens.
inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt", truncation=True)

# Generate Summary
summary_ids = model.generate(inputs["input_ids"].to(device))
# Decode the first (only) generated sequence back to text.
output = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False)[0]
print(output)

# Length of the decoded summary in characters (not tokens).
print(len(output))