In [1]:
!pip install transformers datasets wandb rouge_score accelerate pytorch-lightning peft tensorboardX

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-2.2.5-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.3/802.3 kB[0m [31m32.4 MB/s

In [2]:
import gc
import torch
from torch.utils.data import DataLoader, Dataset

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from tensorboardX import SummaryWriter

# Create a tensorboardX SummaryWriter using its default arguments.
# NOTE(review): `writer` is not referenced by any other cell visible in this
# notebook and is never closed -- confirm it is still needed.
writer = SummaryWriter()

In [3]:
## -- Load the dataset and tokenize it -- ##
class CNNDataset(Dataset):
    """Map-style dataset that tokenizes article/summary records on demand.

    Args:
        item: Collection of records exposing 'article' and 'highlights'
            fields. ``len(item)`` must be the number of records.
        tokenizer: Callable accepting a text plus the ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_length: Length each article and each summary is padded or
            truncated to.
    """

    def __init__(self, item, tokenizer, max_length):
        self.item = item
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.item)

    def _fetch_record(self, idx):
        """Return the ``(article, highlights)`` text pair of record ``idx``."""
        try:
            # Row-first access. Per the Hugging Face datasets documentation,
            # dataset[idx]['article'] is faster than dataset['article'][idx]
            # because the latter reads the whole column before indexing it.
            record = self.item[idx]
            return record['article'], record['highlights']
        except (KeyError, TypeError):
            # Column-oriented containers (e.g. a dict of lists) cannot be
            # indexed by row position; fall back to column-first access.
            return self.item['article'][idx], self.item['highlights'][idx]

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' from the article and
            'labels' holding the token ids of the summary, each flattened to
            one dimension.
        """
        article, summary = self._fetch_record(idx)
        inputs = self.tokenizer(article, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        # NOTE(review): the summary is tokenized independently of the article,
        # so 'labels' positions do not line up with 'input_ids' positions --
        # confirm this is what the downstream collator/model expects.
        targets = self.tokenizer(summary, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

In [4]:
# Load the first 1% of each CNN/DailyMail 3.0.0 split, in train/validation/test order.
train_dataset, valid_dataset, test_dataset = (
    load_dataset("cnn_dailymail", "3.0.0", split=split_spec)
    for split_spec in ("train[:1%]", "validation[:1%]", "test[:1%]")
)

# GPT-2 tokenizer; the end-of-sequence token is reused as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Pretrained GPT-2 with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Every article and summary is padded or truncated to this many tokens.
MAX_SEQ_LENGTH = 512


def _as_cnn_dataset(split):
    """Wrap a raw split in CNNDataset; return already-wrapped splits unchanged."""
    if isinstance(split, CNNDataset):
        return split
    return CNNDataset(split, tokenizer, MAX_SEQ_LENGTH)


# These names are rebound in place, so the guard in _as_cnn_dataset keeps the
# cell safe to re-run: wrapping a CNNDataset a second time would break its
# record lookup.
train_dataset = _as_cnn_dataset(train_dataset)
valid_dataset = _as_cnn_dataset(valid_dataset)
test_dataset = _as_cnn_dataset(test_dataset)


# Causal language-modeling collator (mlm=False). The samples arrive already
# padded to MAX_SEQ_LENGTH, so no per-batch padding is left to do here.
# NOTE(review): per the transformers documentation, with mlm=False the labels
# are the inputs with padding ignored (-100); that appears to replace the
# 'labels' produced by CNNDataset -- confirm this is intended.
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [8]:
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # Run label and output location
    run_name="gpt2-finetune-cnn",
    output_dir="./results",
    overwrite_output_dir=True,
    # Schedule and regularisation
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching and data loading
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    dataloader_num_workers=4,
    # Evaluation runs at the end of each epoch and keeps only the loss
    evaluation_strategy="epoch",
    prediction_loss_only=True,
    # Logging and checkpointing; report_to="none" means no external tracker
    # (such as Weights & Biases) is attached, so run_name is only a label here
    report_to="none",
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)



In [9]:
# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model.cuda()

# Trainer wired to the training/validation datasets and the shared collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train the model
trainer.train()

# Release the trainer only. `model` must stay defined because the following
# cell calls model.save_pretrained(...); deleting it here made that cell fail
# with a NameError on a straight top-to-bottom run.
del trainer
# Collect unreachable objects first so the memory they held can actually be
# returned when the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

  self.pid = os.fork()


Epoch,Training Loss,Validation Loss


  self.pid = os.fork()


KeyboardInterrupt: 

In [None]:
# Write the model and the tokenizer into the same checkpoint directory.
checkpoint_dir = "./results/latest_checkpoint"
model.save_pretrained(checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)

In [None]:
# Restore the tokenizer and model from the saved checkpoint directory.
tokenizer = GPT2Tokenizer.from_pretrained("./results/latest_checkpoint")
model = GPT2LMHeadModel.from_pretrained("./results/latest_checkpoint")

# Run on the GPU when CUDA is present; otherwise the model stays where it was loaded.
if torch.cuda.is_available():
    model = model.cuda()

# ROUGE metric object used by compute_metrics during test evaluation.
rouge = load_metric("rouge")

def compute_metrics(eval_pred):
    """Score greedy next-token predictions against the labels with ROUGE.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is array-like with
            shape (batch, seq_len, vocab); ``labels`` is array-like with shape
            (batch, seq_len) where ignored positions hold -100.

    Returns:
        dict mapping every key returned by ``rouge.compute`` to its mid
        F-measure scaled to the 0-100 range.

    Relies on the module-level ``tokenizer`` and ``rouge`` objects.
    """
    logits, labels = eval_pred
    # as_tensor avoids duplicating the (potentially very large) logits array.
    logits, labels = torch.as_tensor(logits), torch.as_tensor(labels)

    # For the causal LM used in this notebook, the logits at position t score
    # the token at position t + 1. Drop the final step and keep a prediction
    # only where its target label is real; otherwise the guesses made on
    # padded (-100) positions end up in the decoded text and distort ROUGE.
    predictions = torch.argmax(logits, dim=-1)[:, :-1]
    has_target = labels[:, 1:] != -100
    predictions = [row[keep].tolist() for row, keep in zip(predictions, has_target)]

    # Drop ignored (-100) positions from the references before decoding.
    references = [[token for token in row if token != -100] for row in labels.tolist()]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(references, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(sentence.strip() for sentence in text.split(". ")) for text in decoded_preds]
    decoded_labels = ["\n".join(sentence.strip() for sentence in text.split(". ")) for text in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {key: value.mid.fmeasure * 100 for key, value in result.items()}

# Evaluation-only Trainer. Explicit arguments are passed so that
# report_to="none" applies here as it does for training; with no args the
# Trainer builds default TrainingArguments instead.
# NOTE(review): "tmp_trainer" is the output directory the Trainer picks when
# given no args in the transformers versions checked -- confirm for yours.
test_args = TrainingArguments(output_dir="tmp_trainer", report_to="none")
test_trainer = Trainer(
    model=model,
    args=test_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Evaluate the model on the test dataset.
# NOTE(review): predict() hands the logits of every test example to
# compute_metrics; if memory is tight, consider the Trainer's
# preprocess_logits_for_metrics hook (compute_metrics would then have to
# accept token ids instead of logits).
test_results = test_trainer.predict(test_dataset)

# Print the results
print("Test ROUGE Scores:", test_results.metrics)

# Release GPU memory: drop the references, collect garbage, then empty the
# CUDA cache so the freed blocks are actually returned.
del model
del test_trainer
gc.collect()
torch.cuda.empty_cache()