In [None]:
# Shell out to `nvidia-smi` so the cell output shows the GPU/driver status
# of the machine this notebook is running on.
!nvidia-smi

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Fetch the NLTK "punkt" resource (no-op if it is already present locally).
# Presumably required by `sent_tokenize`, imported above -- confirm.
nltk.download("punkt")

In [None]:
import os

# CUDA reads CUDA_VISIBLE_DEVICES when it is first initialised in a process;
# changing it afterwards is not guaranteed to have any effect. Pin the GPU
# *before* the availability probe below, which is the first CUDA call made
# by this notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Target device for the model: the GPU when one is usable, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
import os

# Expose only GPU index 0 to this process.
# NOTE(review): an earlier cell already calls torch.cuda.is_available();
# this variable is normally honoured only when set before CUDA is first
# initialised -- confirm the cell ordering is what you intend.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

Downloading Model from HuggingFace Hub

In [None]:
# Hugging Face Hub identifier of the pretrained checkpoint to start from.
model_ckpt = "google/pegasus-cnn_dailymail"

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the seq2seq model, then move it onto the selected device.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

Saving the Model

In [None]:
import os

# Root folder for everything this notebook reads or writes (models, dataset,
# training output). Set the TEXT_SUMMARIZER_DATABOX environment variable to
# relocate it; when unset, the original author's path is used as the default.
databox_path = os.environ.get(
    "TEXT_SUMMARIZER_DATABOX",
    "/home/ziro/Desktop/Isham/Personal/text-summarizer-project/text-summarizer-project/databox",
)

In [None]:
# Local directory that holds the saved copy of the checkpoint.
model_path = os.path.join(databox_path, "models/google/pegasus-cnn_dailymail")

# Save the downloaded tokenizer and model once. The following cell loads from
# `model_path`, so on a fresh machine it would fail if nothing had been saved.
# `config.json` is written by `model.save_pretrained`, so its presence is used
# as the "already saved" marker and re-runs skip the multi-GB write.
if not os.path.isfile(os.path.join(model_path, "config.json")):
    tokenizer.save_pretrained(model_path)
    model_pegasus.save_pretrained(model_path)

Loading the Model from Local Disk

In [None]:
# Re-create the tokenizer from the files stored under `model_path`.
tokenizer_loaded = AutoTokenizer.from_pretrained(model_path)

# Re-create the model from the same directory and place it on `device`.
model_pegasus_loaded = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model_pegasus_loaded = model_pegasus_loaded.to(device)

Downloading Dataset

In [None]:
import urllib.request
import zipfile

# Where the dataset lives locally, where the archive comes from, and where the
# downloaded archive is stored.
dataset_path = os.path.join(databox_path, "dataset")
file_link = "https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip"
dataset_path_zip = os.path.join(dataset_path, "summarizer-data.zip")

# Download and unpack the archive only when the extracted data is missing, so
# the notebook works on a fresh machine and re-runs stay cheap.
# NOTE(review): assumes the archive unpacks to a top-level "samsum_dataset"
# folder (the name the loading cell reads) -- confirm. If it does not, the
# archive is simply re-extracted on each run; it is never re-downloaded.
if not os.path.isdir(os.path.join(dataset_path, "samsum_dataset")):
    os.makedirs(dataset_path, exist_ok=True)

    if not os.path.isfile(dataset_path_zip):
        # Download to a temporary name first so an interrupted transfer does
        # not leave a truncated file that looks like a complete archive.
        partial_zip = dataset_path_zip + ".part"
        urllib.request.urlretrieve(file_link, partial_zip)
        os.replace(partial_zip, dataset_path_zip)

    with zipfile.ZipFile(dataset_path_zip) as archive:
        archive.extractall(dataset_path)

Load the Dataset

In [None]:
# Folder (inside the dataset directory) that holds the saved SAMSum dataset.
dataset_samsum_path = os.path.join(dataset_path, "samsum_dataset")

# Read the dataset back from disk; the bare name below renders its summary.
dataset_samsum = load_from_disk(dataset_samsum_path)

dataset_samsum

In [None]:
# Peek at a single record of the "test" split: the raw dialogue followed by
# its reference summary.
test_example = dataset_samsum["test"][1]

print("Dialogue:")
print(test_example["dialogue"])

print("\nSummary:")
print(test_example["summary"])

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into model features.

    Args:
        example_batch: Mapping with a 'dialogue' entry and a 'summary' entry;
            each entry is handed unchanged to ``tokenizer_loaded``.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the tokenized
        dialogues (truncated to 1024 tokens) and 'labels' taken from the
        'input_ids' of the tokenized summaries (truncated to 128 tokens).
    """
    dialogue_enc = tokenizer_loaded(example_batch['dialogue'], max_length=1024, truncation=True)

    # Summaries are encoded while the tokenizer is in its target-side context.
    with tokenizer_loaded.as_target_tokenizer():
        summary_enc = tokenizer_loaded(example_batch['summary'], max_length=128, truncation=True)

    features = {field: dialogue_enc[field] for field in ('input_ids', 'attention_mask')}
    features['labels'] = summary_enc['input_ids']
    return features
    

In [None]:
# Run the feature conversion over the loaded dataset. `batched=True` hands
# the function a batch of examples per call instead of one example at a time.
dataset_samsum_pt = dataset_samsum.map(
    convert_examples_to_features,
    batched=True,
)

In [None]:
# Display the tokenized dataset (the cell's last expression is rendered).
dataset_samsum_pt

In [None]:
# Display only the "train" split of the tokenized dataset.
dataset_samsum_pt["train"]

Training the Model

In [None]:
# Before training the model, clear the GPU cache memory: this asks PyTorch to
# release cached GPU memory that it is holding but not currently using.
# `torch` is re-imported here (it is also imported in the import cell above),
# which lets this cell run on its own.
import torch
torch.cuda.empty_cache()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles individual tokenized examples into training batches
# for the locally loaded tokenizer/model pair.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer_loaded,
    model=model_pegasus_loaded,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Checkpoints and logs produced by the Trainer are written here.
output_dir = os.path.join(databox_path, "output_pegasus_samsum")

# fp16 mixed precision needs a GPU: TrainingArguments raises when fp16=True is
# requested on a CPU-only machine. The notebook explicitly falls back to
# "cpu" when CUDA is unavailable, so only enable fp16 when a GPU is present.
# NOTE(review): the Transformers Pegasus docs list fp16 as unsupported for this
# model family -- watch for NaN/inf losses and consider bf16 or fp32 if seen.
use_fp16 = torch.cuda.is_available()

trainer_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    warmup_steps=500,
    # Batch size 1 with 16 accumulation steps: 16 examples per device
    # contribute to each optimizer step.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    # Evaluate and checkpoint on the same 500-step cadence.
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
    # Memory savers: recompute activations in the backward pass, use mixed
    # precision when available, and use the Adafactor optimizer.
    gradient_checkpointing=True,
    fp16=use_fp16,
    optim="adafactor",
    learning_rate=3e-4,
)

In [None]:
# Wire the model, its tokenizer, the batching collator, the training
# configuration and the train/validation splits into a single Trainer.
trainer = Trainer(
    model=model_pegasus_loaded,
    tokenizer=tokenizer_loaded,
    args=trainer_args,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"],
)

In [None]:
# Start fine-tuning with the configuration assembled above; the call's return
# value is rendered as the cell output.
trainer.train()