In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np

def prepare_dataset(file_path):
    """
    file_path: Îç∞Ïù¥ÌÑ∞Î•º Ìè¨Ìï®Ìïú CSV ÌååÏùº Í≤ΩÎ°ú
    ÌååÏùºÏùÄ 'input'Í≥º 'output' Ïó¥ÏùÑ Ìè¨Ìï®Ìï¥Ïïº Ìï©ÎãàÎã§.
    """
    # utf-8-sig ÎòêÎäî Îã§Î•∏ Ïù∏ÏΩîÎî© Î∞©ÏãùÏúºÎ°ú Îç∞Ïù¥ÌÑ∞ ÏùΩÍ∏∞
    try:
        data = pd.read_csv(file_path, encoding='utf-8-sig')  # utf-8-sigÎ°ú ÏãúÎèÑ
    except UnicodeDecodeError:
        data = pd.read_csv(file_path, encoding='ISO-8859-1')  # ISO-8859-1Î°ú Îã§Ïãú ÏãúÎèÑ

    dataset = Dataset.from_pandas(data)
    return dataset

# ÏÇ¨Ïö©Ïûê Ï†ïÏùò Îç∞Ïù¥ÌÑ∞ Í≤ΩÎ°ú
data_file = "./arxiv_abstracts_translated.csv"  # Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞ Í≤ΩÎ°ú

data = prepare_dataset(data_file)

# Îç∞Ïù¥ÌÑ∞ÏÖã Î∂ÑÌï† (7:1:2 ÎπÑÏú®Î°ú train, test, validation)
def split_dataset(dataset, train_ratio=0.7, test_ratio=0.1, seed=123):
    """
    dataset: ÏõêÎ≥∏ Îç∞Ïù¥ÌÑ∞ÏÖã
    train_ratio: ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ ÎπÑÏú®
    test_ratio: ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ ÎπÑÏú®
    ÎÇòÎ®∏ÏßÄÎäî Í≤ÄÏ¶ù Îç∞Ïù¥ÌÑ∞ ÎπÑÏú®Î°ú Í≥ÑÏÇ∞
    """
    shuffled = dataset.shuffle(seed=seed)
    total_size = len(shuffled)
    train_size = int(total_size * train_ratio)
    test_size = int(total_size * test_ratio)

    train_dataset = shuffled.select(range(train_size))
    test_dataset = shuffled.select(range(train_size, train_size + test_size))
    valid_dataset = shuffled.select(range(train_size + test_size, total_size))

    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
        "validation": valid_dataset
    })

dataset = split_dataset(data)

# 2. ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î∞è Î™®Îç∏ Î°úÎìú
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# 3. Îç∞Ïù¥ÌÑ∞ÏÖã Ï†ÑÏ≤òÎ¶¨
def preprocess_function(examples):
    """
    Îç∞Ïù¥ÌÑ∞Î•º T5 ÏûÖÎ†• ÌòïÏãùÏúºÎ°ú Î≥ÄÌôò
    """
    inputs = examples["input"]  # 'input' Ïó¥ ÏÇ¨Ïö©
    targets = examples["output"]  # 'output' Ïó¥ ÏÇ¨Ïö©
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=1024, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels

    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Îç∞Ïù¥ÌÑ∞ÏÖã ÏÖîÌîåÎßÅ
tokenized_datasets["train"] = tokenized_datasets["train"].shuffle(seed=42)

# 4. ÌïôÏäµ ÏÑ§Ï†ï
training_args = TrainingArguments(
    output_dir="./results",  # Ï∂úÎ†• ÎîîÎ†âÌÜ†Î¶¨
    evaluation_strategy="epoch",  # ÌèâÍ∞Ä Ï£ºÍ∏∞
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # Ï≤¥ÌÅ¨Ìè¨Ïù∏Ìä∏ Ï†úÌïú
    save_strategy="epoch",
    logging_dir="./logs",  # Î°úÍπÖ ÎîîÎ†âÌÜ†Î¶¨
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",  # Í≤ÄÏ¶ù Í∏∞Ï§Ä
)

# 5. Ìä∏Î†àÏù¥ÎÑà Ï¥àÍ∏∞Ìôî
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# 6. ÌïôÏäµ ÏãúÏûë
trainer.train()

# 7. Î™®Îç∏ Ï†ÄÏû•
trainer.save_model("./t5-custom-model")
tokenizer.save_pretrained("./t5-custom-model")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 244/244 [00:00<00:00, 1418.13 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [00:00<00:00, 1427.58 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 71/71 [00:00<00:00, 1577.32 examples/s]
  0%|          | 0/93 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


RuntimeError: MPS backend out of memory (MPS allocated: 17.85 GB, other allocations: 272.78 MB, max allowed: 18.13 GB). Tried to allocate 256.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [9]:
# 8. Î™®Îç∏ Î°úÎìú Î∞è ÌÖåÏä§Ìä∏ Ìï®Ïàò Íµ¨ÌòÑ
def load_and_test_model(dataset):
    """
    dataset: ÌÖåÏä§Ìä∏Ïóê ÏÇ¨Ïö©Ìï† Îç∞Ïù¥ÌÑ∞ÏÖã (Dataset Í∞ùÏ≤¥)
    """
    model = T5ForConditionalGeneration.from_pretrained("./t5-custom-model")
    tokenizer = T5Tokenizer.from_pretrained("./t5-custom-model")
    metric = evaluate.load("accuracy")

    def predict_function(examples):
        inputs = tokenizer(["fix: " + i for i in examples["input"]], return_tensors="pt", max_length=1024, padding="max_length", truncation=True)
        predictions = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        return {"predictions": decoded_predictions}

    predictions = dataset.map(predict_function, batched=True, batch_size=8)
    references = predictions["output"]
    results = metric.compute(predictions=predictions["predictions"], references=references)
    print(f"Accuracy: {results['accuracy']}")

    return results["accuracy"]  # Ï†ïÌôïÎèÑ Î∞òÌôò


if __name__ == "__main__":
    test_sentence = "I is a good boy." # Ïó¨Í∏∞Ïóê Î≥ÄÌôîÎ•º Ï§ÑÍ≤É
    corrected = load_and_test_model(tokenized_datasets["test"])  # ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÇ¨Ïö©
    print("ÏûÖÎ†• Î¨∏Ïû•:", test_sentence)
    print("Accuracy on test data:", corrected)

OSError: Incorrect path_or_model_id: './t5-custom-model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.