In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
import warnings

# Install a global "ignore" filter with no category/module restriction, so
# every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries used
# below; consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

In [3]:
import torch

# Pick the compute device: CUDA when a GPU is visible to torch, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load the raw dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="dataset.csv")

In [5]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# shuffle deterministic, so a fresh Restart & Run All reproduces the same
# train/validation partition and validation losses stay comparable across runs.
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
class SummarizationDataset(Dataset):
    """Map-style dataset that tokenizes (text, summary) pairs on access.

    Args:
        df: DataFrame with a 'text' column (model input) and a 'summary'
            column (training target).
        tokenizer: Callable tokenizer. It is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and its result
            must expose ``input_ids`` and ``attention_mask`` tensors with a
            leading batch axis of size 1.
        max_input_len: Length every source sequence is padded/truncated to.
        max_output_len: Length every target sequence is padded/truncated to.
    """

    # Label value for padded target positions. -100 is the default
    # ``ignore_index`` of torch.nn.CrossEntropyLoss, so these positions do not
    # contribute to the training loss.
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_input_len=1024, max_output_len=128):
        self.tokenizer = tokenizer
        self.inputs = df['text'].tolist()
        self.targets = df['summary'].tolist()
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        """Return the number of (text, summary) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Tokenize the idx-th pair.

        Returns:
            dict with 1-D tensors:
              'input_ids'      -- source token ids, length ``max_input_len``
              'attention_mask' -- 1 for real source tokens, 0 for padding
              'labels'         -- target token ids, length ``max_output_len``,
                                  with padded positions set to IGNORE_INDEX
        """
        source = self.tokenizer(
            self.inputs[idx],
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target = self.tokenizer(
            self.targets[idx],
            max_length=self.max_output_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when the configured length is 1.
        labels = target.input_ids.squeeze(0)
        # Mask padding via the target attention mask so the loss is computed
        # only on real summary tokens (and not on the pad token id).
        labels = labels.masked_fill(
            target.attention_mask.squeeze(0) == 0, self.IGNORE_INDEX
        )

        return {
            'input_ids': source.input_ids.squeeze(0),
            'attention_mask': source.attention_mask.squeeze(0),
            'labels': labels,
        }

In [8]:
# The tokenizer and the model are loaded from the same pretrained checkpoint.
checkpoint = "gogamza/kobart-summarization"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

# Build the training and validation datasets with the default length limits.
train_dataset, val_dataset = (
    SummarizationDataset(split, tokenizer) for split in (train_df, val_df)
)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [10]:
# Training configuration.
# Checkpoints and logs go under ./summary, relative to the working directory:
# the previous '/summary' pointed at the filesystem root, which is frequently
# not writable and is not portable across machines.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./summary',                # where checkpoints and results are written
    num_train_epochs=3,                    # number of training epochs
    per_device_train_batch_size=4,         # batch size per device during training
    per_device_eval_batch_size=4,          # batch size per device during evaluation
    warmup_steps=500,                      # steps over which the learning rate ramps up
    weight_decay=0.01,                     # weight-decay coefficient
    logging_dir='./summary/logs',          # where logs are written
    logging_steps=10,                      # log every N steps
    evaluation_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="epoch",                 # save a checkpoint at the end of every epoch
    save_total_limit=2,                    # keep at most this many checkpoints
)

In [11]:
# Set up the Trainer: configuration first, then the model/tokenizer pair,
# then the training and evaluation data.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Start fine-tuning with the `trainer` object built in the previous cell.
# Left as a bare last expression so the notebook displays the call's return value.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Persist the model and the tokenizer to the same directory.
save_dir = "./kobart-summarization-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
input_text = """
40억 달러 ‘딜’ 주인공 김봉진 우아한형제들 대표태풍 뒤의 고요함이랄까. ...
"""

# Tokenize the input text. A single sequence needs no padding, so only
# truncation to the 1024-token limit is applied.
inputs = tokenizer(input_text, max_length=1024, truncation=True, return_tensors="pt")

# The tokenizer returns CPU tensors; move them to wherever the model lives
# (it was placed on `device` earlier) to avoid a device-mismatch error.
input_ids = inputs.input_ids.to(model.device)
attention_mask = inputs.attention_mask.to(model.device)

# Switch to inference mode so dropout is disabled after training.
model.eval()

# Generate the summary with beam search, passing the attention mask explicitly.
summary_ids = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=128,
    num_beams=4,
    early_stopping=True,
)

# Decode the best beam back to text, dropping special tokens.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:", summary)