In [1]:
import pandas as pd
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
df  = pd.read_csv("dataset.csv")

In [3]:
df = df.head(100000)

In [4]:
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [5]:
def preprocess_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    
    # 생성할 레이블을 인코딩하고, max_length로 패딩
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
dataset = Dataset.from_pandas(df)

In [7]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 100000/100000 [03:54<00:00, 426.80 examples/s]


In [8]:
import torch
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = torch.nn.DataParallel(model)

# 모델을 GPU로 이동
model = model.to("cuda")

Using 2 GPUs


In [9]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.3)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# 데이터셋을 PyTorch 형식으로 변환
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [10]:
import os

# 로그 디렉토리 생성 및 확인
log_dir = "./results"
os.makedirs(log_dir, exist_ok=True)

if not os.path.exists(log_dir):
    raise Exception(f"Log directory {log_dir} was not created successfully")

print(f"Log directory {log_dir} is ready")

Log directory ./results is ready


In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir='/summary/results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    remove_unused_columns=False,
    logging_dir='/summary/logs',
    logging_steps=10
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)


In [None]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,0.753,No log


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

In [None]:
trainer.evaluate()

In [None]:
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# 예시 문서 요약
example_text = "배달의민족이 독일 자본에 매각된 것을 놓고 말들이 많다.\n 민족 정서를 배반했다..."
print(summarize(example_text))

In [None]:
trainer.save_model('./saved_model')

# 토크나이저 저장
tokenizer.save_pretrained('./saved_model')