In [3]:
!pip install transformers datasets wandb rouge_score accelerate pytorch-lightning peft tensorboardX

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-2.2.5-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.3/802.3 kB[0m [31m43.5 MB/s

In [4]:
import os
from tqdm.auto import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorWithPadding, AdamW
from datasets import load_dataset

import random
import pandas as pd
from rouge_score import rouge_scorer
from tensorboardX import SummaryWriter

# Initialize TensorBoard
writer = SummaryWriter()

In [7]:
# 데이터셋 로드
train_dataset = load_dataset('cnn_dailymail', "3.0.0", split='train')
valid_dataset = load_dataset('cnn_dailymail', "3.0.0", split="validation")

# 모델과 토크나이저 로드
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# # 모델에 EOS 토큰 추가
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [8]:
## -- 데이터셋 불러오기 및 토큰화 -- ##
class CNNDataset(Dataset):
    def __init__(self, item, tokenizer, max_length):
        self.item = item
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.item)

    def __getitem__(self, idx):
        article = self.item['article'][idx]
        inputs = self.tokenizer(article, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        summary = self.item['highlights'][idx]  # Use highlights as summary
        targets = self.tokenizer(summary, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

In [9]:
class Dataloader(pl.LightningDataModule):
    def __init__(self, tokenizer, batch_size, max_length):
        super().__init__()
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = tokenizer

    def setup(self, stage='fit'):
        ## -- Dataloader 설정 -- ##
        train_dataset = pd.DataFrame(load_dataset('cnn_dailymail', '3.0.0', split='train[:1%]')).reset_index(drop=True)
        valid_dataset = pd.DataFrame(load_dataset('cnn_dailymail', '3.0.0', split='validation[:1%]'))
        test_dataset = pd.DataFrame(load_dataset('cnn_dailymail', '3.0.0', split='test[:1%]'))

        # prepare dataset
        self.train_dataset = CNNDataset(train_dataset, self.tokenizer, self.max_length)
        self.valid_dataset = CNNDataset(valid_dataset, self.tokenizer, self.max_length)
        self.test_dataset = CNNDataset(test_dataset, self.tokenizer, self.max_length)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [10]:

class Model(pl.LightningModule):
    def __init__(
                self,
                model,
                lr: float=1e-5,
                ):

        super().__init__()
        self.save_hyperparameters()
        self.model = model
        self.lr = lr


        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        inputs, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        outputs = self(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        outputs = self(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Decode predictions and labels
        preds = self.trainer.datamodule.tokenizer.decode(outputs.logits.argmax(dim=-1)[0], skip_special_tokens=True)
        target = self.trainer.datamodule.tokenizer.decode(labels[0], skip_special_tokens=True)
        # Calculate ROUGE scores
        rouge_result = self.scorer.score(target, preds)
        rouge1 = rouge_result['rouge1'].fmeasure
        rouge2 = rouge_result['rouge2'].fmeasure
        rougeL = rouge_result['rougeL'].fmeasure
        self.log_dict({"val_loss": loss, "rouge1": rouge1, "rouge2": rouge2, "rougeL": rougeL},
                      on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        inputs, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        outputs = self(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Decode predictions and labels
        preds = self.trainer.datamodule.tokenizer.decode(outputs.logits.argmax(dim=-1)[0], skip_special_tokens=True)
        target = self.trainer.datamodule.tokenizer.decode(labels[0], skip_special_tokens=True)
        # Calculate ROUGE scores
        rouge_result = self.scorer.score(target, preds)
        rouge1 = rouge_result['rouge1'].fmeasure
        rouge2 = rouge_result['rouge2'].fmeasure
        rougeL = rouge_result['rougeL'].fmeasure
        self.log_dict({"test_loss": loss, "rouge1": rouge1, "rouge2": rouge2, "rougeL": rougeL},
                      on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.model.parameters(), lr=self.lr)

In [None]:
## -- 하이퍼파라미터 설정 -- ##
max_length = 512
learning_rate = 5e-5
max_epoch = 3
batch_size = 16
model_name = 'gpt2'
seed = 42
gpu_id = '0,'

# Set random seed for reproducibility
pl.seed_everything(seed)

# 모델에 EOS 토큰 추가
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
pt_model = GPT2LMHeadModel.from_pretrained(model_name)

# # 모델에 EOS 토큰 추가
tokenizer.pad_token = tokenizer.eos_token
pt_model.config.pad_token_id = tokenizer.eos_token_id

# Initialize the dataloader
dataloader = Dataloader(tokenizer, batch_size, max_length)
dataloader.setup()

# Initialize the model
model = Model(pt_model, lr=learning_rate)

# Configure the PyTorch Lightning Trainer
checkpoint_callback = ModelCheckpoint(
    dirpath='./checkpoints/Summerization/',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

trainer = pl.Trainer(
                    accelerator='gpu',
                    devices=gpu_id,
                    max_epochs=max_epoch,
                    num_sanity_val_steps=0,
                    callbacks=[checkpoint_callback]
                    )

# Train the model
trainer.fit(model=model, datamodule=dataloader)

model.eval()
trainer.test(model=model, datamodule=dataloader)

INFO:lightning_fabric.utilities.seed:Seed set to 42


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/parsing.py:199: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size