In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np

def prepare_dataset(file_path):
    """
    file_path: 데이터를 포함한 CSV 파일 경로
    파일은 'input'과 'output' 열을 포함해야 합니다.
    """
    # utf-8-sig 또는 다른 인코딩 방식으로 데이터 읽기
    try:
        data = pd.read_csv(file_path, encoding='utf-8-sig')  # utf-8-sig로 시도
    except UnicodeDecodeError:
        data = pd.read_csv(file_path, encoding='ISO-8859-1')  # ISO-8859-1로 다시 시도

    dataset = Dataset.from_pandas(data)
    return dataset

# 사용자 정의 데이터 경로
data_file = "./arxiv_abstracts_translated.csv"  # 전체 데이터 경로

data = prepare_dataset(data_file)

# 데이터셋 분할 (7:1:2 비율로 train, test, validation)
def split_dataset(dataset, train_ratio=0.7, test_ratio=0.1, seed=123):
    """
    dataset: 원본 데이터셋
    train_ratio: 학습 데이터 비율
    test_ratio: 테스트 데이터 비율
    나머지는 검증 데이터 비율로 계산
    """
    shuffled = dataset.shuffle(seed=seed)
    total_size = len(shuffled)
    train_size = int(total_size * train_ratio)
    test_size = int(total_size * test_ratio)

    train_dataset = shuffled.select(range(train_size))
    test_dataset = shuffled.select(range(train_size, train_size + test_size))
    valid_dataset = shuffled.select(range(train_size + test_size, total_size))

    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
        "validation": valid_dataset
    })

dataset = split_dataset(data)

# 2. 토크나이저 및 모델 로드
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# 3. 데이터셋 전처리
def preprocess_function(examples):
    """
    데이터를 T5 입력 형식으로 변환
    """
    inputs = examples["input"]  # 'input' 열 사용
    targets = examples["output"]  # 'output' 열 사용
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=1024, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels

    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 데이터셋 셔플링
tokenized_datasets["train"] = tokenized_datasets["train"].shuffle(seed=42)

# 4. 학습 설정
training_args = TrainingArguments(
    output_dir="./results",  # 출력 디렉토리
    evaluation_strategy="epoch",  # 평가 주기
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # 체크포인트 제한
    save_strategy="epoch",
    logging_dir="./logs",  # 로깅 디렉토리
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",  # 검증 기준
)

# 5. 트레이너 초기화
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# 6. 학습 시작
trainer.train()

# 7. 모델 저장
trainer.save_model("./t5-custom-model")
tokenizer.save_pretrained("./t5-custom-model")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 244/244 [00:00<00:00, 1418.13 examples/s]
Map: 100%|██████████| 35/35 [00:00<00:00, 1427.58 examples/s]
Map: 100%|██████████| 71/71 [00:00<00:00, 1577.32 examples/s]
  0%|          | 0/93 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


RuntimeError: MPS backend out of memory (MPS allocated: 17.85 GB, other allocations: 272.78 MB, max allowed: 18.13 GB). Tried to allocate 256.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [9]:
# 8. 모델 로드 및 테스트 함수 구현
def load_and_test_model(dataset):
    """
    dataset: 테스트에 사용할 데이터셋 (Dataset 객체)
    """
    model = T5ForConditionalGeneration.from_pretrained("./t5-custom-model")
    tokenizer = T5Tokenizer.from_pretrained("./t5-custom-model")
    metric = evaluate.load("accuracy")

    def predict_function(examples):
        inputs = tokenizer(["fix: " + i for i in examples["input"]], return_tensors="pt", max_length=1024, padding="max_length", truncation=True)
        predictions = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        return {"predictions": decoded_predictions}

    predictions = dataset.map(predict_function, batched=True, batch_size=8)
    references = predictions["output"]
    results = metric.compute(predictions=predictions["predictions"], references=references)
    print(f"Accuracy: {results['accuracy']}")

    return results["accuracy"]  # 정확도 반환


if __name__ == "__main__":
    test_sentence = "I is a good boy." # 여기에 변화를 줄것
    corrected = load_and_test_model(tokenized_datasets["test"])  # 테스트 데이터셋 사용
    print("입력 문장:", test_sentence)
    print("Accuracy on test data:", corrected)

OSError: Incorrect path_or_model_id: './t5-custom-model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.