## **Pegasus Encoder-Decoder**

Imports

In [7]:
from src.training_utils import *
import json
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

In [8]:
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Print the installed transformers version so the run can be reproduced.
print(transformers.__version__)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

4.30.2


Load the dataset from the .json files

In [9]:
def read_custom_json(file_path):
    """Load the ``article`` and ``abstract`` fields from a JSON file.

    Args:
        file_path: Path to a JSON file whose top-level object contains the
            keys ``"article"`` and ``"abstract"``.

    Returns:
        A dict with exactly the keys ``'article'`` and ``'abstract'``, holding
        the corresponding values from the file unchanged. Any other keys in
        the file are dropped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the top-level object lacks either key.
    """
    import json

    # JSON text is UTF-8 by specification; do not depend on the platform's
    # locale-derived default encoding.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    return {
        'article': data["article"],
        'abstract': data["abstract"],
    }

In [10]:
# Parse the whole training split into memory.
# The encoding is explicit because JSON is UTF-8 by specification and the
# platform default encoding may differ.
with open("src/data/PubMed/Train_ExtAbs_PUBMED.json", encoding="utf-8") as f:
    training_corpus = json.load(f)

In [11]:
# Parse the whole validation split into memory.
# The encoding is explicit because JSON is UTF-8 by specification and the
# platform default encoding may differ.
with open("src/data/PubMed/Val_ExtAbs_PUBMED.json", encoding="utf-8") as f:
    validation_corpus = json.load(f)

In [13]:
from datasets import Dataset

def read_custom_json(file_path):
    """Read a JSON file and keep only its ``article`` and ``abstract`` entries.

    Args:
        file_path: Path to a JSON file whose top-level object contains the
            keys ``"article"`` and ``"abstract"``.

    Returns:
        A two-key dict ``{'article': ..., 'abstract': ...}`` with the values
        taken verbatim from the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the top-level object lacks either key.
    """
    import json

    # Decode as UTF-8 explicitly rather than relying on the locale default.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    return {
        'article': data["article"],
        'abstract': data["abstract"],
    }

# Read the training file again through the helper, keeping only the
# 'article' and 'abstract' entries.
file_path = "src/data/PubMed/Train_ExtAbs_PUBMED.json"
data = read_custom_json(file_path)

# Build a `datasets.Dataset` from the two-key dict (`Dataset` here is the one
# imported from `datasets` in this cell).
dataset = Dataset.from_dict(data)

Load tokenizer and model

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
model_checkpoint = "google/pegasus-x-base" # Use pegasus-x-base-finetuned-xsum
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(tokenizer)
# Smoke test: encode two sentences as targets (`text_target`) and print the result.
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

# Load the sequence-to-sequence model for the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
#print(model)

In [None]:
def tokenize_sentences(list_of_sentences, tokenizer, max_len, tensor_type = 'np'):
    """Tokenize a batch of sentences to a fixed length.

    Every sentence is truncated to, and padded up to, exactly ``max_len``
    tokens by passing ``truncation=True`` and ``padding='max_length'`` to the
    tokenizer.

    Args:
        list_of_sentences: The sentences to encode as one batch, e.g.
            ``["This is the first sentence.", "Here is the second sentence."]``.
        tokenizer: Object exposing ``batch_encode_plus`` (such as the tokenizer
            loaded with ``AutoTokenizer.from_pretrained``).
        max_len: Fixed length used for both truncation and padding.
        tensor_type: Forwarded as ``return_tensors`` (``'np'`` by default;
            callers in this file pass ``'pt'``).

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the batch.
    """
    return tokenizer.batch_encode_plus(
        list_of_sentences,
        truncation=True,
        max_length=max_len,
        padding='max_length',
        return_tensors=tensor_type,
    )


Create pre-processing function

In [None]:
# Fixed token lengths used by `preprocess_function`: the article is padded/truncated
# to `max_input_length`, the abstract to `max_target_length`.
max_input_length = 128
max_target_length = 128

class TokenizedDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenizes corpus entries on access.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound in this file by ``from datasets import
    Dataset``; this class assigns ``num_rows`` and ``features`` as plain
    instance attributes.

    Each entry ``corpus[index][0]`` must be a mapping with the keys
    ``'article'`` and ``'abstract'``.

    Attributes:
        corpus: The raw corpus passed to the constructor.
        tokenizer: Tokenizer handed to ``preprocess_function`` for every item.
        device: Stored as given; not used by this class.
        num_rows: ``len(corpus)``, also returned by ``len(dataset)``.
        features: Before any access, a template dict listing the item keys;
            afterwards, the most recently returned item.
    """

    def __init__(self, corpus, tokenizer, device):
        self.corpus = corpus
        # Keep the tokenizer that was passed in, so items do not depend on a
        # module-level `tokenizer` being defined.
        self.tokenizer = tokenizer
        self.device = device
        self.num_rows = len(corpus)
        self.features = {
            'article': '',
            'abstract': '',
            'input_ids': [],
            'attention_mask': [],
            'labels': [],
        }

    def __len__(self):
        """Return the number of entries in the corpus."""
        return self.num_rows

    def __getitem__(self, index):
        """Tokenize and return the entry at *index* as a new dict.

        Returns:
            A dict with the raw ``'article'`` and ``'abstract'`` values plus
            the ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` produced
            by ``preprocess_function``.
        """
        texts = self.corpus[index][0]
        item = preprocess_function(texts, self.tokenizer)

        # Build a new dict per call: returning one shared, mutated dict would
        # make every previously returned item alias the latest one.
        features = {
            'article': texts['article'],
            'abstract': texts['abstract'],
            'input_ids': item['input_ids'],
            'attention_mask': item['attention_mask'],
            'labels': item['labels'],
        }
        # Still expose the last item through `self.features`, as before.
        self.features = features
        return features

def preprocess_function(examples,tokenizer):
    """Tokenize one example's article and abstract into model inputs.

    Both fields are encoded with ``tokenize_sentences`` as PyTorch tensors,
    padded/truncated to ``max_input_length`` (article) and
    ``max_target_length`` (abstract).

    NOTE(review): ``examples["article"]`` is iterated element by element, so it
    presumably is a list of sentences; a plain string would be split into
    characters -- confirm against the corpus format.

    Args:
        examples: Mapping with the keys ``"article"`` and ``"abstract"``.
        tokenizer: Tokenizer forwarded to ``tokenize_sentences``.

    Returns:
        The article encoding, with an added ``"labels"`` entry holding the
        abstract's ``input_ids`` wrapped in a one-element list.
    """
    articles = list(examples["article"])
    encoded_inputs = tokenize_sentences(articles, tokenizer=tokenizer, max_len=max_input_length, tensor_type='pt')

    # Encode the targets with the same tokenizer.
    encoded_targets = tokenize_sentences(examples["abstract"], tokenizer=tokenizer, max_len=max_target_length, tensor_type='pt')

    target_ids = encoded_targets["input_ids"]
    if isinstance(target_ids, list):
        # Only reached for list-valued ids: keep the first element of each entry.
        target_ids = [ids[0] for ids in target_ids]
    # Wrap the ids in one extra list level.
    target_ids = [target_ids]
    encoded_targets["input_ids"] = target_ids

    encoded_inputs["labels"] = target_ids
    return encoded_inputs

def map_function(corpus):
    """Wrap *corpus* in a ``TokenizedDataset`` using the module-level tokenizer and device."""
    return TokenizedDataset(corpus, tokenizer, device)

In [None]:
# Wrap each raw corpus in a TokenizedDataset; tokenization happens per item on access.
tokenized_dataset_train = map_function(training_corpus)
tokenized_dataset_val = map_function(validation_corpus)
# Split name -> dataset, looked up when the trainer is constructed.
tokenized_datasets = {"train":tokenized_dataset_train, "validation":tokenized_dataset_val}

In [None]:
# Sanity checks on the training dataset: first item, number of items, item keys.
print(tokenized_dataset_train[0])
print(len(tokenized_dataset_train))
print(tokenized_dataset_train[0].keys())

Load the metric

In [None]:
from evaluate import load
# ROUGE metric object; `compute_metrics` calls `metric.compute(...)` on decoded text.
metric = load("rouge")
#print(metric)

## **TRAINING**

In [None]:
# Per-device batch size for both training and evaluation.
batch_size = 1
# Last path component of the checkpoint id, e.g. "pegasus-x-base".
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    "Pegasus-finetuned",  # output directory
    evaluation_strategy = "epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Evaluate with generated sequences so `compute_metrics` receives token ids.
    predict_with_generate = True,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # available so the CPU fallback chosen for `device` still works.
    fp16=torch.cuda.is_available(),
    # One optimizer step per 512 micro-batches of `batch_size` examples.
    gradient_accumulation_steps=512,
    logging_steps=1,
    label_smoothing_factor = 0.1,
    #auto_find_batch_size = True,
)

Custom data collator

In [None]:
# import torch
# from torch.nn.utils.rnn import pad_sequence

# class CustomDataCollatorForSeq2Seq:
#     def __init__(self, tokenizer, model):
#         self.tokenizer = tokenizer
#         self.model = model

#     def __call__(self, examples):
#         input_ids = [example['input_ids'] for example in examples]
#         attention_mask = [example['attention_mask'] for example in examples]
#         labels = [example['labels'] for example in examples]

#         # Pad input and attention masks
#         padded_input_ids = pad_sequence(input_ids, batch_first=True)
#         padded_attention_mask = pad_sequence(attention_mask, batch_first=True)
        
#         # Handle dimension mismatch in labels
#         max_label_len = max(len(label) for label in labels)
#         padded_labels = [
#             torch.cat((label, torch.tensor([self.tokenizer.pad_token_id] * (max_label_len - len(label)))))
#             for label in labels
#         ]
        
#         return {
#             'input_ids': padded_input_ids,
#             'attention_mask': padded_attention_mask,
#             'labels': torch.stack(padded_labels),
#         }

In [None]:
# Batch collator built from the tokenizer and model; passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
#data_collator = CustomDataCollatorForSeq2Seq(tokenizer, model=model)

import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Uses the module-level ``tokenizer`` to decode and ``metric`` to score.

    NOTE(review): ``wandb`` is not imported in the visible cells; presumably it
    comes from ``from src.training_utils import *`` -- confirm.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the ids.

    Returns:
        A dict of the ROUGE scores scaled to 0-100, plus ``"gen_len"`` (mean
        number of non-padding tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so it cannot be
    # decoded. Replace it with the padding id in both arrays.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Log the raw (unscaled) scores to Weights & Biases.
    wandb.log({'rouge1': result['rouge1'], 'rouge2': result['rouge2'], 'rougeL': result['rougeL'], 'rougeLsum': result['rougeLsum']})

    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer from the model, arguments, datasets, collator and metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Freeze the encoder: none of its parameters will receive gradients.
for param in trainer.model.model.encoder.parameters():
    param.requires_grad = False

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()
# Close the Weights & Biases run once training is done.
# NOTE(review): `wandb` is not imported in the visible cells -- confirm it is in scope.
wandb.finish()

In [None]:
#model_checkpoint = "checkpoint-14500-finetuned_alot/checkpoint-29500"

Checkpoints already fine-tuned on PubMed

In [None]:
#model_checkpoint = "google/pegasus-pubmed"

In [None]:
#model_checkpoint = "Kevincp560/pegasus-arxiv-finetuned-pubmed"

In [None]:
#model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# # import pegasus
# import torch
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# tokenizer = AutoTokenizer.from_pretrained("google/pegasus-pubmed")

# ARTICLE_TO_SUMMARIZE = ["Introduction: Cognitive decline is a common age-related phenomenon, and interventions to mitigate its effects are of great interest. \
#                         Exercise has been suggested as a potential strategy to improve cognitive function in older adults. This study aimed to investigate the effects\
#                          of a structured exercise program on cognitive function in elderly individuals.\
# Methods: A randomized controlled trial was conducted with 60 participants aged 65 and above.\
#                         The participants were randomly assigned to either an exercise group or a control group.\
#                         The exercise group underwent a 12-week exercise program consisting of aerobic exercises, strength training, \
#                         and flexibility exercises. The control group maintained their usual daily activities without \
#     any structured exercise intervention. Cognitive function was assessed using standardized neuropsychological tests at baseline and after the intervention.\
# Results: The results revealed significant improvements in cognitive function in the exercise group compared to the control group. \
#                         The exercise group demonstrated enhanced performance in various cognitive domains, including attention, memory, and executive function. \
#                         These improvements were statistically significant and clinically meaningful. Furthermore, the exercise group showed a significant \
#                         reduction in the risk of cognitive decline compared to the control group.\
# Conclusion: This randomized controlled trial provides evidence that a structured exercise program can have positive effects on cognitive function in elderly adults.\
#       Regular physical exercise, including aerobic exercises, strength training, and flexibility exercises, may serve as a valuable intervention to\
#                         promote cognitive health and reduce the risk of cognitive decline in the aging population.",
#                         "Hi my name is jackie jack jack joock"]

# inputs = tokenize_sentences(ARTICLE_TO_SUMMARIZE, tokenizer=tokenizer, max_len=256, tensor_type = 'pt')

# # Generate Summary
# summary_ids = model.generate(inputs["input_ids"].to(device)).detach()
# output = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
#                                 clean_up_tokenization_spaces=False)
# print(output)

# print(len(output))
