In [1]:
# Show the GPUs the driver can see, with their current memory use and utilisation.
!nvidia-smi

Tue May 30 23:08:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.105.01   Driver Version: 515.105.01   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   55C    P8    17W / 170W |      5MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
|  0%   51C    P8    11W / 170W |      5MiB / 12288MiB |      0%      Default |
|       

In [2]:
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import torch
from datasets import load_dataset, load_from_disk, load_metric
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline, set_seed

# Fetch NLTK's "punkt" tokenizer data; reports "already up-to-date" when it is present.
nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ziro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Pin this process to GPU 0 *before* the first CUDA query. The CUDA runtime reads
# CUDA_VISIBLE_DEVICES when it initialises, and torch.cuda.is_available() can
# trigger that initialisation, so assigning the variable afterwards may be ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Run on the GPU when one is usable, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
import os

# Restrict this process to GPU 0. CUDA reads CUDA_VISIBLE_DEVICES when it
# initialises, so this only takes effect if no CUDA call has been made yet.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

Downloading the Model from the Hugging Face Hub

In [None]:
# Hub identifier of the pretrained summarization checkpoint.
model_ckpt = "google/pegasus-cnn_dailymail"

# Fetch the tokenizer and the seq2seq model, then move the model onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

Saving the Model

In [5]:
# Root directory that holds the local models, datasets and training output.
# Set the DATABOX_PATH environment variable to relocate it on another machine;
# without it the original location below is used.
databox_path = os.environ.get(
    "DATABOX_PATH",
    "/home/ziro/Desktop/Isham/Personal/text-summarizer-project/text-summarizer-project/databox",
)

In [6]:
# Local directory for the Pegasus checkpoint, under the databox root.
model_path = os.path.join(databox_path, "models/google/pegasus-cnn_dailymail")
# One-off save of the downloaded `tokenizer` / `model_pegasus` into model_path;
# left disabled so the notebook does not re-save on every run.
# tokenizer.save_pretrained(model_path)
# model_pegasus.save_pretrained(model_path)

Loading the Model from Local Disk

In [7]:
# Load the tokenizer and the model from the local checkpoint directory,
# then move the model onto `device`.
tokenizer_loaded = AutoTokenizer.from_pretrained(model_path)
model_pegasus_loaded = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model_pegasus_loaded = model_pegasus_loaded.to(device)

Downloading the Dataset

In [8]:
# Remote archive containing the summarization data.
file_link = "https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip"

# Local dataset directory and the location of the downloaded archive inside it.
dataset_path = os.path.join(databox_path, "dataset")
dataset_path_zip = os.path.join(dataset_path, "summarizer-data.zip")

# One-off download and extraction, kept disabled once the data is on disk.
# os.system(f"wget -P {dataset_path} {file_link}")
# os.system(f"unzip {dataset_path_zip} -d {dataset_path}")

Load the Dataset

In [9]:
# Read the SAMSum dataset saved under the dataset directory and display its splits.
dataset_samsum_path = os.path.join(dataset_path, "samsum_dataset")
dataset_samsum = load_from_disk(dataset_samsum_path)
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [10]:
# Print one test example: the dialogue followed by its reference summary.
print(
    "Dialogue:",
    dataset_samsum["test"][1]["dialogue"],
    "\nSummary:",
    dataset_samsum["test"][1]["summary"],
    sep="\n",
)

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [11]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of SAMSum examples into seq2seq training features.

    Args:
        example_batch: Mapping with a 'dialogue' entry and a 'summary' entry
            (presumably lists of strings when called through
            ``Dataset.map(..., batched=True)``).

    Returns:
        dict with three keys:
            'input_ids' and 'attention_mask' from tokenizing the dialogues,
            truncated to at most 1024 tokens, and
            'labels', the token ids of the summaries, truncated to at most
            128 tokens.
    """
    input_encodings = tokenizer_loaded(
        example_batch['dialogue'], max_length=1024, truncation=True
    )

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer_loaded(
        text_target=example_batch['summary'], max_length=128, truncation=True
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }
    

In [12]:
# Tokenize every split; batched=True passes a batch of examples to the function per call.
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

Loading cached processed dataset at /home/ziro/Desktop/Isham/Personal/text-summarizer-project/text-summarizer-project/databox/dataset/samsum_dataset/train/cache-d2f9263d3d16af64.arrow
Loading cached processed dataset at /home/ziro/Desktop/Isham/Personal/text-summarizer-project/text-summarizer-project/databox/dataset/samsum_dataset/validation/cache-7ffe60f16789fd1c.arrow


In [13]:
# Check that each split now carries 'input_ids', 'attention_mask' and 'labels'.
dataset_samsum_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [14]:
# Inspect the tokenized training split on its own.
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

Training the Model

In [15]:
# Release cached GPU memory before training starts.
# (torch is already imported in the imports cell at the top of the notebook.)
torch.cuda.empty_cache()

In [16]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into batches for the seq2seq model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer_loaded,
    model=model_pegasus_loaded,
)

In [17]:
from transformers import Trainer, TrainingArguments

# Checkpoints and logs are written below the databox root.
output_dir = os.path.join(databox_path, "output_pegasus_samsum")

trainer_args = TrainingArguments(
    output_dir=output_dir,
    # Schedule
    num_train_epochs=5,
    warmup_steps=500,
    # Optimizer
    optim="adafactor",
    learning_rate=3e-4,
    weight_decay=0.01,
    # Batching: one example per device, gradients accumulated over 16 steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    # Memory / speed
    gradient_checkpointing=True,
    fp16=True,
    # Logging, evaluation and checkpointing cadence (in optimizer steps)
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
)

In [20]:
# Wire the model, tokenizer, collator and the train/validation splits into a Trainer.
trainer = Trainer(
    args=trainer_args,
    model=model_pegasus_loaded,
    tokenizer=tokenizer_loaded,
    data_collator=seq2seq_data_collator,
    eval_dataset=dataset_samsum_pt["validation"],
    train_dataset=dataset_samsum_pt["train"],
)

In [21]:
# Start fine-tuning with the arguments configured above (loss is logged every 10 steps).
trainer.train()

  1%|          | 10/920 [00:24<36:18,  2.39s/it]

{'loss': 2.4055, 'learning_rate': 4.8e-06, 'epoch': 0.01}


  2%|▏         | 20/920 [00:48<36:43,  2.45s/it]

{'loss': 2.2323, 'learning_rate': 1.0799999999999998e-05, 'epoch': 0.02}


  3%|▎         | 30/920 [01:12<34:53,  2.35s/it]

{'loss': 2.133, 'learning_rate': 1.6199999999999997e-05, 'epoch': 0.03}


  4%|▎         | 33/920 [01:19<35:15,  2.38s/it]

KeyboardInterrupt: 