In [1]:
import sys
from torch import squeeze
sys.path.append('..')

from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments

from datasets import load_dataset

# Module-level tokenizer cache: populated by getTokenizer() and read by
# preprocess_function(), which loads it on demand while this is still None.
tokenizer = None

2023-11-17 16:41:48.953592: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-17 16:41:48.953648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-17 16:41:48.977681: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def preprocess_function(examples):
    """Tokenize papers and their abstracts for sequence-to-sequence fine-tuning.

    Args:
        examples: Mapping with an "article" entry (model input) and an
            "abstract" entry (target summary). Presumably a batch of strings
            per column, since the dataset is mapped with ``batched=True``;
            a single pair of strings is handled as well.

    Returns:
        dict with "input_ids" and "attention_mask" taken from the tokenized
        articles and "labels" taken from the tokenized abstracts. Padding
        positions inside "labels" are replaced by -100.
    """
    global tokenizer

    # Load on first use; getTokenizer stores the instance in the module global.
    if tokenizer is None:
        getTokenizer("google/pegasus-large")

    summary = tokenizer(examples["abstract"], max_length=512, truncation=True, padding='max_length')
    paper = tokenizer(examples["article"], max_length=512, truncation=True, padding='max_length')

    # Targets are padded to max_length, so without masking the loss would be
    # computed on the padding as well. -100 is the label value the loss skips.
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = summary["input_ids"]
    if target_ids and isinstance(target_ids[0], int):
        # Un-batched call: a single flat sequence of token ids.
        labels = mask_padding(target_ids)
    else:
        labels = [mask_padding(sequence) for sequence in target_ids]

    return {
        "input_ids": paper["input_ids"],
        "attention_mask": paper["attention_mask"],
        "labels": labels,
    }

In [3]:
def getTokenizer(modelName):
    """Load the Pegasus tokenizer for ``modelName`` and cache it globally.

    Args:
        modelName: Identifier or path passed to ``PegasusTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer, which is also stored in the module-level
        ``tokenizer`` variable.
    """
    global tokenizer

    loaded = PegasusTokenizer.from_pretrained(modelName)
    # Publish the instance so code reading the global sees the same tokenizer.
    tokenizer = loaded
    print("Tokenizer loaded")
    return loaded

In [4]:
def loadModel(modelName):
    """Load a pre-trained Pegasus conditional-generation model.

    Args:
        modelName: Identifier or path passed to
            ``PegasusForConditionalGeneration.from_pretrained``
            (e.g. "google/pegasus-large").

    Returns:
        The loaded ``PegasusForConditionalGeneration`` instance.

    Raises:
        RuntimeError: If no model object was obtained.
    """
    model = PegasusForConditionalGeneration.from_pretrained(modelName)

    # Validate before reporting success, and raise rather than calling
    # exit(0): a zero status would signal success, and terminating the
    # interpreter from a helper gives the caller no chance to react.
    if model is None:
        raise RuntimeError(f"Model not found: {modelName}")

    print("Model loaded")
    return model

In [5]:
def prepareDataset(datasetName, numProc=10):
    """Load a dataset and tokenize its train and validation splits.

    Args:
        datasetName: Dataset identifier passed to ``load_dataset``. The
            dataset is expected to provide "train" and "validation" splits
            with "article" and "abstract" columns.
        numProc: Number of worker processes used by ``Dataset.map``.
            Defaults to 10.

    Returns:
        dict mapping "train" and "validation" to the tokenized splits, with
        the raw "abstract" and "article" columns removed.
    """
    # Honour the caller's dataset name; it used to be overwritten by a
    # hard-coded value, which silently ignored the argument.
    dataset = load_dataset(datasetName)

    print("Dataset loaded")

    print(dataset)

    # Tokenize only the splits used for training and evaluation.
    processedDataset = {
        split: dataset[split].map(
            preprocess_function,
            batched=True,
            num_proc=numProc,
            remove_columns=["abstract", "article"],
        )
        for split in ("train", "validation")
    }

    print("Dataset tokenized")
    return processedDataset


In [6]:
def setTrainingArguments():
    """Build the ``TrainingArguments`` used for the fine-tuning run.

    Returns:
        A ``TrainingArguments`` instance configured with the fixed
        hyper-parameters below.
    """
    # Where checkpoints and logs go, and how many checkpoints are kept.
    storage = dict(
        output_dir="./output",
        overwrite_output_dir=True,
        save_steps=10_000,
        save_total_limit=3,
        logging_dir="./logs",
    )

    # How often evaluation and logging happen (in steps).
    cadence = dict(
        evaluation_strategy="steps",
        eval_steps=500,
        logging_steps=500,
    )

    # Optimisation hyper-parameters.
    optimisation = dict(
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
    )

    return TrainingArguments(**storage, **cadence, **optimisation)

In [7]:
def trainModel(model, tokenized_datasets, training_args):
    """Fine-tune ``model`` with a ``Trainer``.

    Args:
        model: Model instance handed to the ``Trainer``.
        tokenized_datasets: Mapping providing a "train" and a "validation"
            dataset.
        training_args: Training configuration passed as ``args``.

    Returns:
        None.
    """
    train_split = tokenized_datasets["train"]
    eval_split = tokenized_datasets["validation"]

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
    )

    print("Starting training")
    trainer.train()


In [8]:
# Name the checkpoint and dataset once so the model and its tokenizer cannot
# drift apart through separately edited string literals.
MODEL_NAME = "google/pegasus-large"
DATASET_NAME = "ccdv/arxiv-summarization"

pegasusBase = loadModel(MODEL_NAME)
# getTokenizer also stores the tokenizer in the module-level `tokenizer`
# that preprocess_function reads.
pegasusTokenizer = getTokenizer(MODEL_NAME)
tokenizedDataset = prepareDataset(DATASET_NAME)
trainingArgs = setTrainingArguments()

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded
Tokenizer loaded
Dataset loaded
DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})
Dataset tokenized


In [None]:
# Start fine-tuning with the objects prepared in the previous cell.
trainModel(
    model=pegasusBase,
    tokenized_datasets=tokenizedDataset,
    training_args=trainingArgs,
)

Starting training


Step,Training Loss,Validation Loss
