<a href="https://colab.research.google.com/github/dldmstj0531/GEC/blob/main/notebooks/model/baseline_T5%2BLoRA_251031.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================
# 1. Install dependencies (first run only)
# =========================================
print("--- Installing dependencies ---")
!pip -q install "transformers>=4.36" datasets peft accelerate evaluate sacrebleu python-Levenshtein
!pip -q install errant # install the ERRANT evaluation toolkit

# ERRANT requires the spaCy English model
!python -m spacy download en_core_web_sm
print("--- Dependencies installed ---")

In [None]:
# =========================================
# 2. Mount Google Drive
# =========================================
# The data files and the output directory used below live under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Copy the sentence-pair CSVs, the BEA-19 training CSV and the official test
# sentences from Drive to the local /content directory.
# NOTE(review): Config.OFFICIAL_TEST_M2 points at /content/ABCN.test.bea19.m2,
# which is not copied by this cell — confirm where that file comes from.
!cp "/content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/sentence_pairs_20K.csv" "/content/sentence_pairs_20K.csv"
!cp "/content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/sentence_pairs_190K.csv" "/content/sentence_pairs_190K.csv"
!cp "/content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/bea19_train.csv" "/content/bea19_train.csv"
!cp "/content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/wi+locness/test/ABCN.test.bea19.orig" "/content/ABCN.test.bea19.orig"

In [None]:
# =========================================
# 3. 경로 & 하이퍼파라미터 설정
# =========================================
import os
import pandas as pd
from dataclasses import dataclass
from typing import List

@dataclass
class Config:
    """Paths and hyperparameters shared by the notebook cells.

    Every field has a default, so ``Config()`` yields the full experiment
    setup; the notebook instantiates it once as ``CFG``.
    """

    # === 1. Paths ===
    # Source data paths
    C4_PATH: str            = "/content/sentence_pairs_190K.csv"
    BEA19_PATH: str         = "/content/bea19_train.csv"

    # Where the combined training CSV is written
    COMBINED_PATH: str      = "/content/c4_bea_combined_train.csv"

    # Official test set (one sentence per line)
    OFFICIAL_TEST_PATH: str = "/content/ABCN.test.bea19.orig"
    # (for ERRANT) path to the gold M2 file of the official test set
    OFFICIAL_TEST_M2: str   = "/content/ABCN.test.bea19.m2"

    # Model output directory
    OUTPUT_DIR: str         = "/content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/output_combined_t5base"

    # === 2. Data ===
    # Number of rows to sample from each dataset (<= 0 means use everything)
    C4_SAMPLE_N: int        = 60000
    BEA_SAMPLE_N: int       = 20000

    # === 3. Model ===
    MODEL_NAME: str = "t5-base" # t5-small -> t5-base
    SEED: int       = 42

    # === 4. LoRA ===
    LORA_R: int         = 64    # 32 -> 64 (more capacity to match t5-base)
    LORA_ALPHA: int     = 64    # 32 -> 64 (alpha set equal to r)
    LORA_DROPOUT: float = 0.1

    # === 5. Tokenization ===
    PREFIX: str             = "grammar correction: "
    MAX_INPUT_LENGTH: int   = 128
    MAX_TARGET_LENGTH: int  = 128

    # === 6. Train ===
    # Original note: C4 (190k), BEA (23k) -> 50k + 23k = 73k rows, 4-5 epochs.
    # NOTE(review): those figures do not match the current C4_SAMPLE_N /
    # BEA_SAMPLE_N values above — confirm the intended data size.
    EPOCHS: int       = 4
    LR: float         = 1e-4  # 1e-4 because this is first-stage training

    # Smaller batch and more accumulation to avoid OOM with t5-base
    TRAIN_BS: int     = 4     # 8 -> 4
    EVAL_BS: int      = 4     # 8 -> 4
    GRAD_ACCUM: int   = 8     # 4 -> 8 (keeps the effective batch at 4*8=32)

    WARMUP_STEPS: int     = 500
    WEIGHT_DECAY: float   = 0.01
    LOGGING_STEPS: int    = 100
    LABEL_SMOOTHING: float = 0.1
    FP16: bool = True

    # === 7. Generation (inference) ===
    NUM_BEAMS: int              = 8
    LENGTH_PENALTY: float       = 0.7
    MAX_NEW_TOKENS: int         = 128
    NO_REPEAT_NGRAM: int        = 3
    REPETITION_PENALTY: float   = 1.07
    PREDICT_BS: int             = 16 # a batch size of 16 is feasible for inference
    MIN_EDIT_RATE: float        = 0.01

# Instantiate the shared configuration object used by every later cell.
CFG = Config()
# Report the live LoRA values instead of hard-coded numbers so the banner
# cannot drift from the dataclass defaults.
print(f"Config loaded (LoRA r={CFG.LORA_R}, alpha={CFG.LORA_ALPHA}):")
print(CFG)

In [None]:
# =======================================================
# 4. 유틸/모델 함수 정의 (노트북 버전, argparse 없음)
# =======================================================
import os, re
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    DataCollatorForSeq2Seq, Trainer, TrainingArguments, set_seed,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.utils.data import DataLoader

# Optional dependency: `evaluate` supplies the sacreBLEU metric used during
# validation. _EVAL_OK records whether the import succeeded so that training
# can run without the metric.
try:
    import evaluate
    _EVAL_OK = True
except Exception:
    _EVAL_OK = False

# Optional dependency: python-Levenshtein supplies the edit distance used by
# force_min_edit. _LEV_OK records whether the import succeeded.
try:
    from Levenshtein import distance as lev_distance
    _LEV_OK = True
except Exception:
    _LEV_OK = False

# ----------------------
# 4-1. build_lora_t5
# ----------------------
def build_lora_t5(model_name: str, r: int, alpha: int, dropout: float):
    """Load a T5 checkpoint and wrap it with LoRA adapters.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        r: LoRA rank.
        alpha: LoRA scaling factor (``lora_alpha``).
        dropout: Dropout applied inside the LoRA layers.

    Returns:
        ``(tokenizer, model)`` where ``model`` is the PEFT-wrapped model.
        The trainable-parameter summary is printed as a side effect.
    """
    print("\n===== build_lora_t5 =====")

    # Load the tokenizer first, then the base weights.
    t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
    t5_base = T5ForConditionalGeneration.from_pretrained(model_name)

    # Adapters are attached to every module named q, k, v, o, wi or wo.
    adapter_settings = {
        "r": r,
        "lora_alpha": alpha,
        "lora_dropout": dropout,
        "bias": "none",
        "task_type": TaskType.SEQ_2_SEQ_LM,
        "target_modules": ["q", "k", "v", "o", "wi", "wo"],
    }
    peft_model = get_peft_model(t5_base, LoraConfig(**adapter_settings))
    peft_model.print_trainable_parameters()

    return t5_tokenizer, peft_model

# -------------------------
# 4-2. load_csv_dataset
# -------------------------
def load_csv_dataset(train_csv: str, val_split: float=0.01, seed: int=42) -> DatasetDict:
    """Load a sentence-pair CSV and split it into train/validation sets.

    Args:
        train_csv: Path to a CSV with at least the columns ``noise``
            (erroneous sentence) and ``clean`` (corrected sentence).
        val_split: Fraction (or absolute count) of rows held out for
            validation, forwarded to ``train_test_split(test_size=...)``.
        seed: Seed for the random split; defaults to the previously
            hard-coded value 42.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"validation"`` splits.

    Raises:
        ValueError: If a required column is missing. An explicit exception
            is used instead of ``assert`` so the check survives ``python -O``.
    """
    print("\n===== load_csv_dataset =====")
    raw = load_dataset("csv", data_files={"train": train_csv}, split="train")

    missing = [col for col in ("noise", "clean") if col not in raw.column_names]
    if missing:
        raise ValueError(
            f"CSV must contain column(s) {missing}; found {raw.column_names}"
        )

    split = raw.train_test_split(test_size=val_split, seed=seed)
    return DatasetDict({"train": split["train"], "validation": split["test"]})

# -------------------------
# 4-3. make_preprocess
# -------------------------
def make_preprocess(tokenizer: T5Tokenizer, prefix: str, max_src: int, max_tgt: int):
    """Build the batched tokenization function used with ``Dataset.map``.

    Args:
        tokenizer: Tokenizer called on the inputs and, via ``text_target``,
            on the references.
        prefix: Task prefix prepended to every ``noise`` sentence.
        max_src: Truncation length for the inputs.
        max_tgt: Truncation length for the targets.

    Returns:
        A function mapping a batch with ``noise``/``clean`` columns to the
        tokenizer output for the inputs plus a ``labels`` entry holding the
        target ``input_ids``.
    """
    print("\n===== make_preprocess =====")

    def _fn(examples):
        # Source side: task prefix + erroneous sentence.
        sources = [prefix + sentence for sentence in examples["noise"]]
        encoded = tokenizer(sources, max_length=max_src, truncation=True)

        # Target side: the corrected sentences become the labels.
        target_enc = tokenizer(
            text_target=examples["clean"], max_length=max_tgt, truncation=True
        )
        encoded["labels"] = target_enc["input_ids"]
        return encoded

    return _fn

# --------------------
# 4-4. post_detok
# --------------------
def post_detok(text: str) -> str:
    """Detokenize a sentence by tightening whitespace around punctuation.

    The previous version removed whitespace on *both* sides of every straight
    quote, which glued neighbouring words together
    (``said " hello " to`` became ``said"hello"to``). Quotes are now trimmed
    only on the inside of a matched pair, and an apostrophe is joined to the
    preceding word only when it starts a contraction suffix.

    Args:
        text: A (possibly tokenized) sentence.

    Returns:
        The sentence with spacing normalized around punctuation, hyphens,
        parentheses and quotes, and runs of whitespace collapsed.
    """
    text = text.strip()

    # --- Basic punctuation: drop the space before , . ! ? ; : ---
    text = re.sub(r" ([,.!?;:])", r"\1", text)

    # --- Contractions split by tokenization: do n't -> don't, it 's -> it's ---
    text = re.sub(r"\s+n(['’])t\b", r"n\1t", text)
    text = re.sub(r"\s+(['’](?:s|re|ve|ll|d|m)\b)", r"\1", text, flags=re.IGNORECASE)
    # The curly right quote is attached to the word on its left, as before.
    text = text.replace(" ’", "’")

    # --- Hyphens: medium - sized -> medium-sized ---
    # NOTE: this also joins a spaced dash used as punctuation (unchanged
    # behaviour from the original implementation).
    text = re.sub(r"\s*-\s*", "-", text)

    # --- Parentheses: ( word ) -> (word) ---
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)

    # --- Quotes: trim only inside a matched pair, keep the outer spaces ---
    text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
    # Single quotes count as a pair only when they are not part of a word.
    text = re.sub(r"(?<![\w'])'\s+([^']*?)\s+'(?!\w)", r"'\1'", text)

    # --- Collapse repeated whitespace ---
    text = re.sub(r"\s{2,}", " ", text)

    return text

# ------------------------
# 4-5. force_min_edit
# ------------------------
def force_min_edit(src_texts: list, hyp_texts: list, min_edit_rate: float) -> list:
    """Return the detokenized hypotheses, one per entry of ``hyp_texts``.

    The original implementation computed a Levenshtein edit rate per sentence
    but never used it, so ``min_edit_rate`` had no effect on the output. That
    dead computation is removed. As a result the function now always returns
    exactly ``len(hyp_texts)`` items; previously one branch silently truncated
    to the shorter of the two lists via ``zip``.

    NOTE(review): if a minimum-edit policy was intended (for example falling
    back to the source below the threshold), it still has to be specified.

    Args:
        src_texts: Source sentences. Kept for interface compatibility;
            currently unused.
        hyp_texts: Model outputs to detokenize.
        min_edit_rate: Kept for interface compatibility; currently unused.

    Returns:
        ``post_detok`` applied to each hypothesis, in order.
    """
    print("\n===== force_min_edit =====")
    return [post_detok(h) for h in hyp_texts]

# -------------------
# 4-6. train_gec
# -------------------
def train_gec(cfg: Config, n_val_samples=1000):
    """Fine-tune the LoRA-wrapped T5 model on the combined CSV.

    Args:
        cfg: Experiment configuration (paths, LoRA, training and generation
            settings).
        n_val_samples: Maximum number of validation rows to evaluate on.
            A falsy value (``None`` or 0) uses the whole validation split.

    Returns:
        ``(trainer, tokenizer)``. The model is also saved under
        ``<cfg.OUTPUT_DIR>/best_t5_lora_model``.
    """
    print("\n===== train_gec =====")
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
    set_seed(cfg.SEED)

    tokenizer, model = build_lora_t5(cfg.MODEL_NAME, cfg.LORA_R, cfg.LORA_ALPHA, cfg.LORA_DROPOUT)
    datasets = load_csv_dataset(cfg.COMBINED_PATH, val_split=0.01)
    preprocess_fn = make_preprocess(tokenizer, cfg.PREFIX, cfg.MAX_INPUT_LENGTH, cfg.MAX_TARGET_LENGTH)
    tokenized = datasets.map(preprocess_fn, batched=True, remove_columns=datasets["train"].column_names)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    print("--- Using full combined training data ---")
    if n_val_samples:
        print(f"--- Validation data sampling: {n_val_samples} rows ---")
        n_keep = min(n_val_samples, len(tokenized["validation"]))
        tokenized["validation"] = tokenized["validation"].shuffle(seed=42).select(range(n_keep))
    else:
        print("--- Using full validation data ---")

    # BLEU metric (quick sanity check during validation)
    metric = evaluate.load("sacrebleu") if _EVAL_OK else None
    def compute_metrics(eval_preds):
        if metric is None: return {}
        preds, labels = eval_preds
        if isinstance(preds, tuple): preds = preds[0]

        # Replace ids outside [0, vocab_size) with the pad id before decoding
        vocab_size = tokenizer.vocab_size
        preds = np.where(
            (preds >= 0) & (preds < vocab_size),
            preds,
            tokenizer.pad_token_id
        )

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # -100 marks ignored label positions; map it back to the pad id
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [post_detok(p) for p in decoded_preds]
        decoded_labels = [[post_detok(l)] for l in decoded_labels]
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    # Generation settings used by predict_with_generate during evaluation.
    # Seq2SeqTrainingArguments only defines generation_max_length,
    # generation_num_beams and generation_config, so the former
    # `args.generation_length_penalty = ...` style assignments were never read.
    # Setting the values on the model's own generation config makes
    # evaluation-time generate() calls actually use them.
    gen_cfg = model.generation_config
    gen_cfg.num_beams = cfg.NUM_BEAMS
    gen_cfg.length_penalty = cfg.LENGTH_PENALTY
    gen_cfg.max_new_tokens = cfg.MAX_NEW_TOKENS
    gen_cfg.no_repeat_ngram_size = cfg.NO_REPEAT_NGRAM
    gen_cfg.repetition_penalty = cfg.REPETITION_PENALTY

    # Training arguments taken directly from the Config object
    args = Seq2SeqTrainingArguments(
        output_dir = cfg.OUTPUT_DIR,
        num_train_epochs = cfg.EPOCHS,
        per_device_train_batch_size = cfg.TRAIN_BS,
        per_device_eval_batch_size = cfg.EVAL_BS,
        gradient_accumulation_steps = cfg.GRAD_ACCUM,
        warmup_steps = cfg.WARMUP_STEPS,
        weight_decay = cfg.WEIGHT_DECAY,
        logging_dir = os.path.join(cfg.OUTPUT_DIR, "logs"),
        logging_steps = cfg.LOGGING_STEPS,
        eval_strategy = "epoch",
        save_strategy = "epoch",
        load_best_model_at_end = True,
        metric_for_best_model = "loss",
        greater_is_better = False,
        optim = "adafactor",
        learning_rate = cfg.LR,
        label_smoothing_factor = cfg.LABEL_SMOOTHING,
        fp16 = cfg.FP16,
        report_to = "none",
        predict_with_generate=True,
        generation_num_beams = cfg.NUM_BEAMS,
        lr_scheduler_type="cosine"
    )

    # Train with Seq2SeqTrainer
    trainer = Seq2SeqTrainer(
        model = model,
        args = args,
        train_dataset = tokenized["train"],
        eval_dataset = tokenized["validation"],
        data_collator = data_collator,
        tokenizer = tokenizer,
        compute_metrics = compute_metrics if _EVAL_OK else None,
    )

    # Start training
    torch.cuda.empty_cache()
    print("\n--- LoRA fine-tuning start ---")
    trainer.train()
    print("--- Training finished ---\n")

    best_dir = os.path.join(cfg.OUTPUT_DIR, "best_t5_lora_model")
    trainer.save_model(best_dir)
    print(f"Best model saved to {best_dir}")
    return trainer, tokenizer

# -------------------------------
# 4-7. predict_official_test
# -------------------------------
def predict_official_test(trainer, tokenizer, cfg: Config):
    """Correct every line of the official test file and write ``submission.txt``.

    Args:
        trainer: Trainer whose ``model`` is used for generation.
        tokenizer: Tokenizer matching the model.
        cfg: Configuration providing the test path, generation settings,
            batch size and output directory.

    Returns:
        Path of the written ``submission.txt``, or ``None`` when
        ``cfg.OFFICIAL_TEST_PATH`` is unset or the file does not exist.
    """
    print("\n===== predict_official_test =====")
    if not cfg.OFFICIAL_TEST_PATH or not os.path.exists(cfg.OFFICIAL_TEST_PATH):
        print("[Skip] OFFICIAL_TEST_PATH not set or file not found.")
        return None

    test_ds = load_dataset("text", data_files={"test": cfg.OFFICIAL_TEST_PATH})["test"]
    # Keep the raw sentences from this single load; map() below returns a new
    # dataset, so the file does not need to be read a second time.
    raw_src = list(test_ds["text"])

    def preprocess_test(examples):
        inputs = [cfg.PREFIX + x for x in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=cfg.MAX_INPUT_LENGTH, truncation=True)
        return model_inputs
    tok_test = test_ds.map(preprocess_test, batched=True, remove_columns=["text"])
    tok_test.set_format(type="torch", columns=["input_ids","attention_mask"])

    model = trainer.model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, model=model, padding=True
    )
    dl = DataLoader(
        tok_test, batch_size=cfg.PREDICT_BS, collate_fn=data_collator
    )

    decoded_preds = []
    vocab_size = tokenizer.vocab_size
    with torch.no_grad():
        for batch in dl:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            gen = model.generate (
                input_ids = input_ids,
                attention_mask = attention_mask,
                num_beams = cfg.NUM_BEAMS,
                length_penalty = cfg.LENGTH_PENALTY,
                max_new_tokens = cfg.MAX_NEW_TOKENS,
                no_repeat_ngram_size = cfg.NO_REPEAT_NGRAM,
                repetition_penalty = cfg.REPETITION_PENALTY,
            )

            # Replace ids outside [0, vocab_size) with the pad id so that
            # decoding does not fail on them.
            gen = torch.where(
                (gen >= 0) & (gen < vocab_size),
                gen,
                tokenizer.pad_token_id
            )

            batch_preds = tokenizer.batch_decode(gen, skip_special_tokens=True)
            decoded_preds.extend([post_detok(p) for p in batch_preds])

    decoded_preds = force_min_edit(raw_src, decoded_preds, cfg.MIN_EDIT_RATE)

    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    out_path = os.path.join(cfg.OUTPUT_DIR, "submission.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        for line in decoded_preds:
            f.write(line.strip()+"\n")
    print(f"Saved predictions -> {out_path}")

    print("\n--- Samples ---")
    for i in range(min(5, len(raw_src))):
        print(f"Original ({i+1}): {raw_src[i]}")
        print(f"Corrected ({i+1}): {decoded_preds[i]}\n")
    # The path is returned so that a later evaluation step can pick it up.
    return out_path

# --------------------------
# 4-8. correct_sentence
# --------------------------
def correct_sentence(text: str, model, tokenizer, cfg: Config):
    """Correct a single sentence with the fine-tuned model.

    The original read a module-level ``device`` variable that is only defined
    by a later notebook cell, so calling this function before that cell ran
    raised ``NameError``. The device is now taken from the model itself.

    Args:
        text: Sentence to correct (without the task prefix).
        model: Model exposing ``generate`` and ``parameters``.
        tokenizer: Tokenizer matching the model.
        cfg: Configuration providing the prefix and generation settings.

    Returns:
        The corrected, detokenized sentence.
    """
    print("\n===== correct_sentence =====")
    # Run on whichever device the model's weights live on.
    device = next(model.parameters()).device

    inputs_text = cfg.PREFIX + text
    inputs = tokenizer(
        inputs_text,
        return_tensors="pt",
        max_length=cfg.MAX_INPUT_LENGTH,
        truncation=True
    ).to(device)

    with torch.no_grad():
        gen = model.generate (
            input_ids = inputs["input_ids"],
            attention_mask = inputs["attention_mask"],
            num_beams = cfg.NUM_BEAMS,
            length_penalty = cfg.LENGTH_PENALTY,
            max_new_tokens = cfg.MAX_NEW_TOKENS,
            no_repeat_ngram_size = cfg.NO_REPEAT_NGRAM,
            repetition_penalty = cfg.REPETITION_PENALTY,
        )

    # Replace ids outside [0, vocab_size) with the pad id before decoding.
    vocab_size = tokenizer.vocab_size
    gen = torch.where(
        (gen >= 0) & (gen < vocab_size),
        gen,
        tokenizer.pad_token_id
    )

    result_text = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
    final_result = post_detok(result_text)
    return final_result

print("Functions loaded.")

In [None]:
# ===================================================
# 5. (통합) 학습 데이터 준비 (C4 + BEA-19)
# ===================================================

print("--- (통합) 학습 데이터 준비 시작 ---")

# 1. Paths and sampling sizes (taken from the CFG object)
C4_PATH = CFG.C4_PATH
BEA19_PATH = CFG.BEA19_PATH
COMBINED_PATH = CFG.COMBINED_PATH
C4_SAMPLE_N = CFG.C4_SAMPLE_N
BEA_SAMPLE_N = CFG.BEA_SAMPLE_N
BEA_OVERSAMPLE_FACTOR = 3       # BEA-19 rows are repeated this many times


def clean_data(df, name):
    """Return a cleaned copy of a noise/clean sentence-pair frame.

    Drops rows with a missing ``noise`` or ``clean`` value, rows where both
    sides are identical, and rows where either side has fewer than
    ``min_words`` or more than ``max_words`` whitespace-separated words.
    The input frame is not modified (the original used ``inplace=True`` on
    the caller's frame).

    Args:
        df: Frame with ``noise`` and ``clean`` columns.
        name: Label used in the progress messages.

    Returns:
        The filtered frame.
    """
    print(f"\n--- Cleaning {name} data (Original: {len(df)}) ---")

    # 1. Remove rows with missing values
    df = df.dropna(subset=["noise", "clean"])
    print(f"After dropna: {len(df)}")

    # 2. Remove no-op pairs (noise identical to clean)
    df = df[df["noise"].astype(str) != df["clean"].astype(str)]
    print(f"After no-op removal: {len(df)}")

    # 3. Remove sentences that are too short or too long
    min_words = 3
    max_words = 128
    noise_words = df["noise"].astype(str).str.split().str.len()
    clean_words = df["clean"].astype(str).str.split().str.len()

    df = df[
        (noise_words >= min_words) & (noise_words <= max_words) &
        (clean_words >= min_words) & (clean_words <= max_words)
    ]
    print(f"After length filter ({min_words}-{max_words} words): {len(df)}")
    return df


# 2. Fail early when a source file is missing
if not os.path.exists(C4_PATH) or not os.path.exists(BEA19_PATH):
    raise FileNotFoundError(f"경고: {C4_PATH} 또는 {BEA19_PATH} 파일을 찾을 수 없습니다.")

# 3. Load C4 and sample it down to C4_SAMPLE_N rows
print(f"Loading C4 data from: {C4_PATH}")
df_c4 = pd.read_csv(C4_PATH)
print(f"C4 data (original): {len(df_c4)} rows")

if C4_SAMPLE_N > 0 and len(df_c4) > C4_SAMPLE_N:
    print(f"C4 data sampling: {len(df_c4)} -> {C4_SAMPLE_N} rows")
    df_c4 = df_c4.sample(n=C4_SAMPLE_N, random_state=CFG.SEED)
else:
    print(f"Using full C4 data: {len(df_c4)} rows")

# clean_data needs these columns; report a schema problem explicitly
if "noise" not in df_c4.columns or "clean" not in df_c4.columns:
    raise KeyError("C4 data must contain the columns 'noise' and 'clean'.")

# 4. Load BEA-19 and sample it down to BEA_SAMPLE_N rows
print(f"Loading BEA-19 data from: {BEA19_PATH}")
df_bea = pd.read_csv(BEA19_PATH)
print(f"BEA-19 data (original): {len(df_bea)} rows")

if BEA_SAMPLE_N > 0 and len(df_bea) > BEA_SAMPLE_N:
    print(f"BEA-19 data sampling: {len(df_bea)} -> {BEA_SAMPLE_N} rows")
    df_bea = df_bea.sample(n=BEA_SAMPLE_N, random_state=CFG.SEED)
else:
    # Not enough rows to sample from: just shuffle everything
    df_bea = df_bea.sample(n=len(df_bea), random_state=CFG.SEED)

# 5. Align the BEA-19 column names with C4 (noise / clean)
RENAME_DICT = {"source" : "noise", "target" : "clean"}
if "source" not in df_bea.columns and "incorrect" in df_bea.columns:
    RENAME_DICT = {"incorrect": "noise", "correct": "clean"}

print(f"Renaming BEA-19 columns: {RENAME_DICT}")
df_bea = df_bea.rename(columns=RENAME_DICT)

if "noise" not in df_bea.columns or "clean" not in df_bea.columns:
    raise KeyError("BEA-19 컬럼명을 'noise', 'clean'으로 변경하지 못했습니다.")

# 6. Data cleaning (the labels no longer claim a 50-row sample)
df_c4 = clean_data(df_c4, "C4")
df_bea = clean_data(df_bea, "BEA-19")

# 7. Oversample BEA-19 and concatenate with C4
# NOTE(review): the oversampling happens before load_csv_dataset's random
# train/validation split, so repeated BEA-19 rows can end up in both splits
# and make the validation loss look better than it is — confirm whether the
# split should be taken before oversampling.
print(f"\nConcatenating C4 + (BEA-19 * {BEA_OVERSAMPLE_FACTOR})...")

# List holding BEA_OVERSAMPLE_FACTOR references to the BEA-19 frame
bea_dfs = [df_bea] * BEA_OVERSAMPLE_FACTOR

df_combined = pd.concat([df_c4] + bea_dfs, ignore_index=True)

print(f"Combined data (C4: {len(df_c4)}, BEA x{BEA_OVERSAMPLE_FACTOR}: {len(df_bea)*BEA_OVERSAMPLE_FACTOR})")

# 8. Shuffle and save
df_combined = df_combined.sample(frac=1, random_state=CFG.SEED).reset_index(drop=True)
df_combined.to_csv(COMBINED_PATH, index=False)

print(f"\nTotal combined data (after cleaning & oversampling): {len(df_combined)} rows")
print(f"Combined data saved to: {COMBINED_PATH}")

In [None]:
# ===================================================
# 6. 통합 모델 학습
# ===================================================
print("\n--- (통합) 학습 시작 ---")

# Uses the CFG object defined in section 3 as-is.
# Config has no TRAIN_CSV field; train_gec reads CFG.COMBINED_PATH internally.
# n_val_samples=None makes train_gec evaluate on the whole validation split
# (pass an integer, e.g. 1000, to subsample it instead).

trainer_combined, tokenizer_combined = train_gec(
    CFG,
    n_val_samples=None
)

print("--- 통합 모델 학습 완료 ---")

In [None]:
import time  # used to time each correction

# =========================================
# 7. Quick test on arbitrary sentences
# =========================================

# Use the combined model
model = trainer_combined.model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sentences to try
test_sentences = [
    "He go to school every day.",
    "She has two child.",
    "She is teacher.",
    "He arrived to the airport on time.",
    "I every day go to school.",
    "He told that he was tired. ",
    "This is very importent information."
]

print("\n===============================")
print("--- 임의 문장 테스트 시작 (시간 측정) ---")
print("===============================\n")

# Running total of the per-sentence durations
total_duration = 0

for sentence in test_sentences:
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock changes.
    start_time = time.perf_counter()

    corrected = correct_sentence(sentence, model, tokenizer_combined, CFG)

    end_time = time.perf_counter()

    # Elapsed time for this sentence
    duration = end_time - start_time
    total_duration += duration

    print(f"Original:  {sentence}")
    print(f"Corrected: {corrected}")
    print(f"Time taken: {duration:.4f} seconds\n")

print("--- 테스트 완료 ---")
print(f"Total time for {len(test_sentences)} sentences: {total_duration:.4f} seconds")
print(f"Average time per sentence: {total_duration / len(test_sentences):.4f} seconds")

In [None]:
# =========================================
# 8. Predict the official test set -> save submission.txt
# =========================================
print("\n--- (최종 통합 모델) 공식 테스트셋 예측 시작 ---")

# predict_official_test returns the path of submission.txt, or None when the
# test file is missing (in which case the message below prints "None").
submission_file_path = predict_official_test(trainer_combined, tokenizer_combined, CFG)

print(f"--- 예측 완료: {submission_file_path} ---")

In [None]:
# Zip submission.txt next to it in the Drive output directory; -j stores the
# file without its directory path.
# NOTE(review): the path is hard-coded and must match Config.OUTPUT_DIR.
!zip -j /content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/output_combined_t5base/submission.zip /content/drive/MyDrive/Projects/LikeLion/실전프로젝트02/output_combined_t5base/submission.txt