In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# PyTorch
import torch

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'使用 GPU：{torch.cuda.get_device_name(0)}')
else:
    device = torch.device('cpu')
    print('使用 CPU')

使用 GPU：NVIDIA L4


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
os.chdir("/content/drive/MyDrive/Database")

In [6]:
!pip show peft

Name: peft
Version: 0.15.2
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: benjamin@huggingface.co
License: Apache
Location: /usr/local/lib/python3.11/dist-packages
Requires: accelerate, huggingface_hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: 


In [7]:
!pip install --upgrade peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [8]:
import os
import time
import torch
import random
import numpy as np
import pandas as pd

from typing import Dict, List
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType  # 只留 LoraConfig, get_peft_model, TaskType

# =============================================================================
# 1. 固定亂數種子，確保結果可重現
# =============================================================================
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# =============================================================================
# 2. 資料載入與 Label Processing
# =============================================================================
def load_and_prepare(subjects: List[str]) -> pd.DataFrame:
    """
    讀取多個 subject 的資料，合併後做 label encoding：
      - label_subject
      - label_chapter
      - label_section
    """
    def load_and_merge(subject: str) -> pd.DataFrame:
        base_path = f"{subject}_Database"
        qdf = pd.read_csv(f"{base_path}/{subject}_question_bank.csv")
        cdf = pd.read_csv(f"{base_path}/{subject}_chapter_list.csv")
        qdf.columns = qdf.columns.str.strip().str.lower().str.replace(" ", "_")
        cdf.columns = cdf.columns.str.strip().str.lower().str.replace(" ", "_")
        df = qdf.merge(cdf, on="section_name", how="left")
        df["subject"] = subject
        return df

    # 合併指定 subjects 的資料
    df = pd.concat([load_and_merge(s) for s in subjects], ignore_index=True)
    df = df[["subject", "chapter_name_x", "section_name", "ques_detl"]].dropna().reset_index(drop=True)

    # subject label
    df["label_str"] = df["subject"]
    label2id_subject = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_subject = {i: lab for lab, i in label2id_subject.items()}
    df["label_subject"] = df["label_str"].map(label2id_subject)

    # chapter label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"]
    label2id_chapter = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_chapter = {i: lab for lab, i in label2id_chapter.items()}
    df["label_chapter"] = df["label_str"].map(label2id_chapter)

    # section label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"] + "::" + df["section_name"]
    label2id_section = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_section = {i: lab for lab, i in label2id_section.items()}
    df["label_section"] = df["label_str"].map(label2id_section)

    # 只保留至少出現兩次的 section
    vc = df["label_section"].value_counts()
    valid_secs = set(vc[vc >= 2].index)
    df = df[df["label_section"].isin(valid_secs)].reset_index(drop=True)

    return df, (label2id_subject, id2label_subject), (label2id_chapter, id2label_chapter), (label2id_section, id2label_section)

# =============================================================================
# 3. 自訂 Dataset
# =============================================================================
class TextDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer,
        max_len: int = 128,
        mode: str = "flat_chapter",  # 'flat_chapter', 'flat_section', 'hierarchical'
        subject_labels: List[int] = None
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode
        self.subject_labels = subject_labels  # 只有 hierarchical 模式才需要

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.mode == "hierarchical":
            item["subject_labels"] = torch.tensor(self.subject_labels[idx], dtype=torch.long)
        return item

# =============================================================================
# 4. TextCNN Model 定義
# =============================================================================
class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_classes: int,
        kernel_sizes: List[int] = [3,4,5],
        num_filters: int = 100,
        dropout_p: float = 0.5
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids):
        """
        input_ids: (B, L)
        return: logits (B, num_classes)
        """
        x = self.embedding(input_ids)        # (B, L, D)
        x = x.permute(0, 2, 1)               # (B, D, L)
        conv_outs = []
        for conv in self.convs:
            c = conv(x)                      # (B, F, L - k + 1)
            c = torch.relu(c)                # (B, F, L - k + 1)
            c = torch.max_pool1d(c, kernel_size=c.size(2))  # (B, F, 1)
            conv_outs.append(c.squeeze(2))   # (B, F)
        cat = torch.cat(conv_outs, dim=1)    # (B, F * len(kernel_sizes))
        drop = self.dropout(cat)             # (B, F * len(kernel_sizes))
        logits = self.fc(drop)               # (B, num_classes)
        return logits

# =============================================================================
# 5. 基礎 MLP Model 定義 (只用於 flat 模式)
# =============================================================================
class BasicMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, features):
        """
        features: (B, input_dim)
        return: logits (B, num_classes)
        """
        x = self.fc1(features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

# =============================================================================
# 6. 訓練與評估函式
# =============================================================================
def compute_metrics(preds_and_labels) -> Dict[str, float]:
    logits, labels = preds_and_labels
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision_macro": prec,
        "recall_macro": rec,
        "f1_macro": f1
    }

def train_flat_transformer(
    model_name: str,
    num_labels: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    output_dir: str,
    device: torch.device,
    use_dora: bool = False
) -> Dict[str, float]:
    """
    訓練 flat (Subject+Chapter 或 flat Section) 模式的小型 Transformer (BERT/ RoBERTa)，回傳 metrics。
    如果 use_dora=True，會在模型上套 DoRA Adapter（使用 LoraConfig with use_dora=True）。
    """
    # 1. 選擇 tokenizer & base_model
    if "roberta" in model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        base_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # 2. 如果要用 DoRA，包成 PEFT 模型
    if use_dora:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["query", "value"],  # 對 BERT/RoBERTa 模型，"query" 和 "value" 即可
            use_dora=True,                      # 啟用 DoRA
            #distillation_loss_weight=0.2,
            #retention_loss_weight=0.1
        )
        model = get_peft_model(base_model, peft_config)
    else:
        model = base_model

    model.to(device)

    # 3. 構造 Dataset
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=128, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=128, mode="flat_chapter")

    # 4. TrainingArguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5 if not use_dora else 3e-4,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_strategy="epoch"
    )

    # 5. 初始化 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    # 6. 訓練 & 驗證
    start = time.time()
    trainer.train()
    elapsed = time.time() - start

    metrics = trainer.evaluate()

    # 7. 儲存 adapter（若有 DoRA）或整個模型
    #    如果 use_dora=True，Trainer.save_model() 會把 PEFT adapter 一併儲存到 output_dir
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # 8. 計算檔案大小
    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    metrics["eval_accuracy"],
        "precision":   metrics["eval_precision_macro"],
        "recall":      metrics["eval_recall_macro"],
        "f1":          metrics["eval_f1_macro"],
        "train_time":  elapsed,
        "model_size":  total_size
    }

def train_textcnn(
    vocab_size: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    tokenizer,
    num_labels: int,
    output_dir: str,
    device: torch.device
) -> Dict[str, float]:
    """
    訓練 TextCNN (flat_chapter) 模式，回傳 metrics。
    """
    max_len = 128
    batch_size = 32
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=max_len, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=max_len, mode="flat_chapter")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    embed_dim = 300
    model = TextCNN(vocab_size=vocab_size, embed_dim=embed_dim, num_classes=num_labels)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

    best_f1 = 0.0
    history = {"train_loss": [], "valid_loss": [], "train_f1": [], "valid_f1": []}

    for epoch in range(1, 6):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * input_ids.size(0)
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.detach().cpu().numpy())

        train_loss = total_loss / len(train_loader.dataset)
        _, _, train_f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="macro", zero_division=0
        )

        model.eval()
        total_vloss = 0.0
        v_preds, v_labels = [], []
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                logits = model(input_ids)
                loss = criterion(logits, labels)
                total_vloss += loss.item() * input_ids.size(0)
                preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
                v_preds.extend(preds)
                v_labels.extend(labels.detach().cpu().numpy())

        valid_loss = total_vloss / len(valid_loader.dataset)
        _, _, valid_f1, _ = precision_recall_fscore_support(
            v_labels, v_preds, average="macro", zero_division=0
        )

        scheduler.step()
        history["train_loss"].append(train_loss)
        history["valid_loss"].append(valid_loss)
        history["train_f1"].append(train_f1)
        history["valid_f1"].append(valid_f1)

        # 保存最佳模型
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            torch.save(model.state_dict(), f"{output_dir}/best_textcnn.pt")

        print(f"Epoch {epoch} | Train Loss {train_loss:.4f} | Valid Loss {valid_loss:.4f} | "
              f"Train F1 {train_f1:.4f} | Valid F1 {valid_f1:.4f}")

    # 存檔： tokenizer vocab
    tokenizer.save_pretrained(output_dir)

    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    None,
        "precision":   None,
        "recall":      None,
        "f1":          best_f1,
        "train_time":  None,
        "model_size":  total_size
    }

# =============================================================================
# 7. Pipeline 主程序：整合上述所有 case
# =============================================================================
def main_pipeline():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1. 載入資料與 Labels
    df, subj_map, chap_map, sect_map = load_and_prepare(["math", "science"]) # social
    label2id_subject, id2label_subject = subj_map
    label2id_chapter, id2label_chapter = chap_map
    label2id_section, id2label_section = sect_map

    # 2. 切 train/valid/test（90/10 → 再分81/9/10)
    base = df[["ques_detl", "label_subject", "label_chapter", "label_section", "subject"]].copy()

    # 先切出 10% 做最終 test
    rest, test = train_test_split(
        base, test_size=0.1, stratify=base["label_chapter"], random_state=42
    )

    # 再把 rest 切成 90/10 ≈ 81%/9% 做 train/valid
    train_df, valid_df = train_test_split(
        rest, test_size=0.1, stratify=rest["label_chapter"], random_state=42
    )

    # 取出 text 和各種 label lists
    train_texts = train_df["ques_detl"].tolist()
    valid_texts = valid_df["ques_detl"].tolist()
    test_texts  = test["ques_detl"].tolist()

    train_subj_labels = train_df["label_subject"].tolist()
    valid_subj_labels = valid_df["label_subject"].tolist()
    test_subj_labels  = test["label_subject"].tolist()

    train_chap_labels = train_df["label_chapter"].tolist()
    valid_chap_labels = valid_df["label_chapter"].tolist()
    test_chap_labels  = test["label_chapter"].tolist()

    train_sect_labels = train_df["label_section"].tolist()
    valid_sect_labels = valid_df["label_section"].tolist()
    test_sect_labels  = test["label_section"].tolist()

    # 3. 建立 tokenizer (給所有 TextCNN/Transformer 共用)
    bert_tok    = BertTokenizerFast.from_pretrained("bert-base-uncased")
    roberta_tok = RobertaTokenizerFast.from_pretrained("roberta-base")

    # 4. 建立詞彙大小 (TextCNN 用)
    vocab_size = bert_tok.vocab_size

    # 5. 實驗配置
    model_types = [
        ("DoRA_BERT",    "bert-base-uncased", True),
        ("DoRA_RoBERTa", "roberta-base",       True),
        ("BERT",         "bert-base-uncased", False),
        ("RoBERTa",      "roberta-base",       False),
    ]
    others = ["TextCNN", "MLP"]

    # 6. 三種分類策略
    strategies = [
        "flat_chapter",          # 直接分類 subject+chapter
        "flat_section_then_map", # 先分類 subject+chapter+section，再 map 回 chapter
        "hierarchical"           # 先分 subject，再分 chapter
    ]

    results = []

    for model_name, hf_model_name, use_dora in model_types:
        for strat in strategies:
            exp_name = f"{model_name}__{strat}"
            output_dir = f"./outputs/{exp_name}"
            os.makedirs(output_dir, exist_ok=True)

            print(f"\n▶開始實驗: {exp_name}")

            if strat == "flat_chapter":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_chapter),
                    train_texts=train_texts,
                    train_labels=train_chap_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_chap_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                results.append((exp_name, metrics))

            elif strat == "flat_section_then_map":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_section),
                    train_texts=train_texts,
                    train_labels=train_sect_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_sect_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                # 如需計算 chapter fine，須先批次推論再做 mapping
                results.append((exp_name, metrics))

            elif strat == "hierarchical":
                sub_output = os.path.join(output_dir, "subject_model")
                chap_output = os.path.join(output_dir, "chapter_model")
                os.makedirs(sub_output, exist_ok=True)
                os.makedirs(chap_output, exist_ok=True)

                # 1) 訓練 subject
                sub_metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_subject),
                    train_texts=train_texts,
                    train_labels=train_subj_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_subj_labels,
                    output_dir=sub_output,
                    device=device,
                    use_dora=use_dora
                )

                # 2) 針對每個 subject 訓練 chapter classifier
                chap_metrics = {}
                for subj_id, subj_name in id2label_subject.items():
                    idx_train = [i for i, s in enumerate(train_subj_labels) if s == subj_id]
                    idx_valid = [i for i, s in enumerate(valid_subj_labels) if s == subj_id]

                    sub_train_texts = [train_texts[i] for i in idx_train]
                    sub_train_chaps = [train_chap_labels[i] for i in idx_train]
                    sub_valid_texts = [valid_texts[i] for i in idx_valid]
                    sub_valid_chaps = [valid_chap_labels[i] for i in idx_valid]

                    unique_chaps = sorted({train_chap_labels[i] for i in idx_train})
                    if len(unique_chaps) < 2:
                        continue

                    sub_dir = os.path.join(chap_output, f"subj_{subj_id}")
                    os.makedirs(sub_dir, exist_ok=True)
                    sub_chap_m = train_flat_transformer(
                        model_name=hf_model_name,
                        num_labels=len(unique_chaps),
                        train_texts=sub_train_texts,
                        train_labels=sub_train_chaps,
                        valid_texts=sub_valid_texts,
                        valid_labels=sub_valid_chaps,
                        output_dir=sub_dir,
                        device=device,
                        use_dora=use_dora
                    )
                    chap_metrics[subj_id] = sub_chap_m

                results.append((exp_name, {"subject": sub_metrics, "chapter": chap_metrics}))

    # TextCNN + MLP 只負責 flat_chapter
    for model_name in others:
        strat = "flat_chapter"
        exp_name = f"{model_name}__{strat}"
        output_dir = f"./outputs/{exp_name}"
        os.makedirs(output_dir, exist_ok=True)
        print(f"\n▶開始實驗: {exp_name}")

        if model_name == "TextCNN":
            start = time.time()
            metrics = train_textcnn(
                vocab_size=vocab_size,
                train_texts=train_texts,
                train_labels=train_chap_labels,
                valid_texts=valid_texts,
                valid_labels=valid_chap_labels,
                tokenizer=bert_tok,
                num_labels=len(label2id_chapter),
                output_dir=output_dir,
                device=device
            )
            elapsed = time.time() - start
            metrics["train_time"] = elapsed
            results.append((exp_name, metrics))

        elif model_name == "MLP":
            train_enc = bert_tok(train_texts, padding=True, truncation=True, return_tensors="pt")
            valid_enc = bert_tok(valid_texts, padding=True, truncation=True, return_tensors="pt")

            bert_model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased", output_hidden_states=True
            )
            bert_model.to(device)
            bert_model.eval()

            def encode_avg(inputs):
                input_ids = inputs["input_ids"].to(device)
                attention_mask = inputs["attention_mask"].to(device)
                with torch.no_grad():
                    outputs = bert_model.bert(input_ids, attention_mask=attention_mask)
                    last_hidden = outputs.last_hidden_state  # (B, L, D)
                    mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
                    summed = torch.sum(last_hidden * mask, 1)
                    counts = torch.clamp(mask.sum(1), min=1e-9)
                    avg_pooled = summed / counts
                    return avg_pooled.cpu()

            train_feats = encode_avg(train_enc)
            valid_feats = encode_avg(valid_enc)

            input_dim = train_feats.size(1)
            hidden_dim = 256
            model = BasicMLP(input_dim, hidden_dim, num_classes=len(label2id_chapter))
            model.to(device)

            train_labels_tensor = torch.tensor(train_chap_labels)
            valid_labels_tensor = torch.tensor(valid_chap_labels)

            optimizer = optim.Adam(model.parameters(), lr=1e-4)
            criterion = nn.CrossEntropyLoss()

            start = time.time()
            best_f1 = 0.0
            for epoch in range(1, 6):
                model.train()
                optimizer.zero_grad()
                logits = model(train_feats.to(device))
                loss = criterion(logits, train_labels_tensor.to(device))
                loss.backward()
                optimizer.step()

                model.eval()
                with torch.no_grad():
                    v_logits = model(valid_feats.to(device))
                    v_preds = torch.argmax(v_logits, dim=1).cpu().numpy()
                    v_labels = valid_labels_tensor.numpy()
                    _, _, v_f1, _ = precision_recall_fscore_support(v_labels, v_preds, average="macro", zero_division=0)

                if v_f1 > best_f1:
                    best_f1 = v_f1
                    torch.save(model.state_dict(), f"{output_dir}/best_mlp.pt")
                print(f"Epoch {epoch} | Valid F1 {v_f1:.4f}")

            elapsed = time.time() - start
            total_size = 0
            for root, _, files in os.walk(output_dir):
                for fname in files:
                    total_size += os.path.getsize(os.path.join(root, fname))

            results.append((exp_name, {
                "accuracy":    None,
                "precision":   None,
                "recall":      None,
                "f1":          best_f1,
                "train_time":  elapsed,
                "model_size":  total_size
            }))

    # 8. 把所有實驗結果匯出
    out_df = pd.DataFrame([{"experiment": name, **metrics} for name, metrics in results])
    out_df.to_csv("experiment_results.csv", index=False)
    print("\n所有實驗完成，結果已存到 experiment_results.csv")

if __name__ == "__main__":
    main_pipeline()



▶開始實驗: DoRA_BERT__flat_chapter


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,3.1492,2.792263,0.207081,0.076804,0.105434,0.074639
2,2.5389,2.272581,0.311289,0.174444,0.191384,0.154444
3,2.2093,2.02157,0.410154,0.264218,0.273097,0.240184
4,1.9597,1.791495,0.476954,0.386619,0.339415,0.328136
5,1.7875,1.73393,0.478958,0.349968,0.351053,0.325956
6,1.668,1.606124,0.523714,0.384601,0.3927,0.369475
7,1.5858,1.561748,0.544422,0.407062,0.411827,0.393817
8,1.5269,1.552831,0.539078,0.417687,0.405769,0.389944



▶開始實驗: DoRA_BERT__flat_section_then_map


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,4.209,3.709156,0.130929,0.032137,0.055551,0.031257
2,3.5359,3.280009,0.189045,0.051447,0.091451,0.055788
3,3.1668,2.994778,0.236473,0.112187,0.121835,0.094315
4,2.9114,2.767603,0.303273,0.162794,0.173778,0.14812
5,2.7098,2.61762,0.343353,0.194172,0.205708,0.174717
6,2.5614,2.516973,0.363393,0.21487,0.223808,0.195473
7,2.4483,2.469521,0.376754,0.225238,0.240891,0.208715
8,2.3846,2.422845,0.405478,0.260401,0.259907,0.231681



▶開始實驗: DoRA_BERT__hierarchical


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.1181,0.062055,0.984636,0.983145,0.984623,0.983871
2,0.0495,0.049061,0.98664,0.984337,0.987817,0.986004
3,0.033,0.041935,0.989312,0.990348,0.98721,0.988727
4,0.0247,0.040434,0.992652,0.993052,0.991498,0.992262
5,0.0196,0.031873,0.992652,0.993052,0.991498,0.992262
6,0.0153,0.036008,0.992652,0.993378,0.991187,0.992257
7,0.0093,0.036319,0.99332,0.993597,0.992355,0.992968
8,0.0076,0.040116,0.992652,0.993052,0.991498,0.992262


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,2.5212,1.973681,0.363239,0.302326,0.25006,0.198539
2,1.8954,1.649032,0.459519,0.39614,0.360661,0.332703
3,1.6347,1.475816,0.506565,0.483296,0.417393,0.409331
4,1.4612,1.35781,0.551422,0.508349,0.478792,0.477227
5,1.3077,1.272494,0.583151,0.560277,0.529086,0.52615
6,1.217,1.207253,0.612691,0.603191,0.565923,0.561131
7,1.1417,1.153723,0.626915,0.591334,0.57872,0.573086
8,1.0981,1.145888,0.636761,0.599352,0.592535,0.585592


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [7]:
import os
import time
import torch
import random
import numpy as np
import pandas as pd

from typing import Dict, List
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType  # 只留 LoraConfig, get_peft_model, TaskType

# =============================================================================
# 1. 固定亂數種子，確保結果可重現
# =============================================================================
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# =============================================================================
# 2. 資料載入與 Label Processing
# =============================================================================
def load_and_prepare(subjects: List[str]) -> pd.DataFrame:
    """
    讀取多個 subject 的資料，合併後做 label encoding：
      - label_subject
      - label_chapter
      - label_section
    """
    def load_and_merge(subject: str) -> pd.DataFrame:
        base_path = f"{subject}_Database"
        qdf = pd.read_csv(f"{base_path}/{subject}_question_bank.csv")
        cdf = pd.read_csv(f"{base_path}/{subject}_chapter_list.csv")
        qdf.columns = qdf.columns.str.strip().str.lower().str.replace(" ", "_")
        cdf.columns = cdf.columns.str.strip().str.lower().str.replace(" ", "_")
        df = qdf.merge(cdf, on="section_name", how="left")
        df["subject"] = subject
        return df

    # 合併指定 subjects 的資料
    df = pd.concat([load_and_merge(s) for s in subjects], ignore_index=True)
    df = df[["subject", "chapter_name_x", "section_name", "ques_detl"]].dropna().reset_index(drop=True)

    # subject label
    df["label_str"] = df["subject"]
    label2id_subject = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_subject = {i: lab for lab, i in label2id_subject.items()}
    df["label_subject"] = df["label_str"].map(label2id_subject)

    # chapter label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"]
    label2id_chapter = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_chapter = {i: lab for lab, i in label2id_chapter.items()}
    df["label_chapter"] = df["label_str"].map(label2id_chapter)

    # section label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"] + "::" + df["section_name"]
    label2id_section = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_section = {i: lab for lab, i in label2id_section.items()}
    df["label_section"] = df["label_str"].map(label2id_section)

    # 只保留至少出現兩次的 section
    vc = df["label_section"].value_counts()
    valid_secs = set(vc[vc >= 2].index)
    df = df[df["label_section"].isin(valid_secs)].reset_index(drop=True)

    return df, (label2id_subject, id2label_subject), (label2id_chapter, id2label_chapter), (label2id_section, id2label_section)

# =============================================================================
# 3. 自訂 Dataset
# =============================================================================
class TextDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer,
        max_len: int = 128,
        mode: str = "flat_chapter",  # 'flat_chapter', 'flat_section', 'hierarchical'
        subject_labels: List[int] = None
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode
        self.subject_labels = subject_labels  # 只有 hierarchical 模式才需要

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.mode == "hierarchical":
            item["subject_labels"] = torch.tensor(self.subject_labels[idx], dtype=torch.long)
        return item

# =============================================================================
# 4. TextCNN Model 定義
# =============================================================================
class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_classes: int,
        kernel_sizes: List[int] = [3,4,5],
        num_filters: int = 100,
        dropout_p: float = 0.5
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids):
        """
        input_ids: (B, L)
        return: logits (B, num_classes)
        """
        x = self.embedding(input_ids)        # (B, L, D)
        x = x.permute(0, 2, 1)               # (B, D, L)
        conv_outs = []
        for conv in self.convs:
            c = conv(x)                      # (B, F, L - k + 1)
            c = torch.relu(c)                # (B, F, L - k + 1)
            c = torch.max_pool1d(c, kernel_size=c.size(2))  # (B, F, 1)
            conv_outs.append(c.squeeze(2))   # (B, F)
        cat = torch.cat(conv_outs, dim=1)    # (B, F * len(kernel_sizes))
        drop = self.dropout(cat)             # (B, F * len(kernel_sizes))
        logits = self.fc(drop)               # (B, num_classes)
        return logits

# =============================================================================
# 5. 基礎 MLP Model 定義 (只用於 flat 模式)
# =============================================================================
class BasicMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, features):
        """
        features: (B, input_dim)
        return: logits (B, num_classes)
        """
        x = self.fc1(features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

# =============================================================================
# 6. 訓練與評估函式
# =============================================================================
def compute_metrics(preds_and_labels) -> Dict[str, float]:
    logits, labels = preds_and_labels
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision_macro": prec,
        "recall_macro": rec,
        "f1_macro": f1
    }

def train_flat_transformer(
    model_name: str,
    num_labels: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    output_dir: str,
    device: torch.device,
    use_dora: bool = False
) -> Dict[str, float]:
    """
    訓練 flat (Subject+Chapter 或 flat Section) 模式的小型 Transformer (BERT/ RoBERTa)，回傳 metrics。
    如果 use_dora=True，會在模型上套 DoRA Adapter（使用 LoraConfig with use_dora=True）。
    """
    # 1. 選擇 tokenizer & base_model
    if "roberta" in model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        base_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # 2. 如果要用 DoRA，包成 PEFT 模型
    if use_dora:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["query", "value"],  # 對 BERT/RoBERTa 模型，"query" 和 "value" 即可
            use_dora=True,                      # 啟用 DoRA
            #distillation_loss_weight=0.2,
            #retention_loss_weight=0.1
        )
        model = get_peft_model(base_model, peft_config)
    else:
        model = base_model

    model.to(device)

    # 3. 構造 Dataset
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=128, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=128, mode="flat_chapter")

    # 4. TrainingArguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5 if not use_dora else 3e-4,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_strategy="epoch"
    )

    # 5. 初始化 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        #label_names=[str(i) for i in range(num_labels)] #新增
    )

    # 6. 訓練 & 驗證
    start = time.time()
    trainer.train()
    elapsed = time.time() - start

    metrics = trainer.evaluate()

    # 7. 儲存 adapter（若有 DoRA）或整個模型
    #    如果 use_dora=True，Trainer.save_model() 會把 PEFT adapter 一併儲存到 output_dir
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # 8. 計算檔案大小
    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    metrics["eval_accuracy"],
        "precision":   metrics["eval_precision_macro"],
        "recall":      metrics["eval_recall_macro"],
        "f1":          metrics["eval_f1_macro"],
        "train_time":  elapsed,
        "model_size":  total_size
    }

def train_textcnn(
    vocab_size: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    tokenizer,
    num_labels: int,
    output_dir: str,
    device: torch.device
) -> Dict[str, float]:
    """
    訓練 TextCNN (flat_chapter) 模式，回傳 metrics。
    """
    max_len = 128
    batch_size = 32
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=max_len, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=max_len, mode="flat_chapter")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    embed_dim = 300
    model = TextCNN(vocab_size=vocab_size, embed_dim=embed_dim, num_classes=num_labels)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

    best_f1 = 0.0
    history = {"train_loss": [], "valid_loss": [], "train_f1": [], "valid_f1": []}

    for epoch in range(1, 6):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * input_ids.size(0)
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.detach().cpu().numpy())

        train_loss = total_loss / len(train_loader.dataset)
        _, _, train_f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="macro", zero_division=0
        )

        model.eval()
        total_vloss = 0.0
        v_preds, v_labels = [], []
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                logits = model(input_ids)
                loss = criterion(logits, labels)
                total_vloss += loss.item() * input_ids.size(0)
                preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
                v_preds.extend(preds)
                v_labels.extend(labels.detach().cpu().numpy())

        valid_loss = total_vloss / len(valid_loader.dataset)
        _, _, valid_f1, _ = precision_recall_fscore_support(
            v_labels, v_preds, average="macro", zero_division=0
        )

        scheduler.step()
        history["train_loss"].append(train_loss)
        history["valid_loss"].append(valid_loss)
        history["train_f1"].append(train_f1)
        history["valid_f1"].append(valid_f1)

        # 保存最佳模型
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            torch.save(model.state_dict(), f"{output_dir}/best_textcnn.pt")

        print(f"Epoch {epoch} | Train Loss {train_loss:.4f} | Valid Loss {valid_loss:.4f} | "
              f"Train F1 {train_f1:.4f} | Valid F1 {valid_f1:.4f}")

    # 存檔： tokenizer vocab
    tokenizer.save_pretrained(output_dir)

    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    None,
        "precision":   None,
        "recall":      None,
        "f1":          best_f1,
        "train_time":  None,
        "model_size":  total_size
    }

# =============================================================================
# 7. Pipeline 主程序：整合上述所有 case
# =============================================================================
def main_pipeline():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1. 載入資料與 Labels
    df, subj_map, chap_map, sect_map = load_and_prepare(["math", "science"]) # social
    label2id_subject, id2label_subject = subj_map
    label2id_chapter, id2label_chapter = chap_map
    label2id_section, id2label_section = sect_map

    # 2. 切 train/valid/test（90/10 → 再分81/9/10)
    base = df[["ques_detl", "label_subject", "label_chapter", "label_section", "subject"]].copy()

    # 先切出 10% 做最終 test
    rest, test = train_test_split(
        base, test_size=0.1, stratify=base["label_chapter"], random_state=42
    )

    # 再把 rest 切成 90/10 ≈ 81%/9% 做 train/valid
    train_df, valid_df = train_test_split(
        rest, test_size=0.1, stratify=rest["label_chapter"], random_state=42
    )

    # 取出 text 和各種 label lists
    train_texts = train_df["ques_detl"].tolist()
    valid_texts = valid_df["ques_detl"].tolist()
    test_texts  = test["ques_detl"].tolist()

    train_subj_labels = train_df["label_subject"].tolist()
    valid_subj_labels = valid_df["label_subject"].tolist()
    test_subj_labels  = test["label_subject"].tolist()

    train_chap_labels = train_df["label_chapter"].tolist()
    valid_chap_labels = valid_df["label_chapter"].tolist()
    test_chap_labels  = test["label_chapter"].tolist()

    train_sect_labels = train_df["label_section"].tolist()
    valid_sect_labels = valid_df["label_section"].tolist()
    test_sect_labels  = test["label_section"].tolist()

    # 3. 建立 tokenizer (給所有 TextCNN/Transformer 共用)
    bert_tok    = BertTokenizerFast.from_pretrained("bert-base-uncased")
    roberta_tok = RobertaTokenizerFast.from_pretrained("roberta-base")

    # 4. 建立詞彙大小 (TextCNN 用)
    vocab_size = bert_tok.vocab_size

    # 5. 實驗配置
    model_types = [
        #("DoRA_BERT",    "bert-base-uncased", True),
        ("DoRA_RoBERTa", "roberta-base",       True),
        #("BERT",         "bert-base-uncased", False),
        #("RoBERTa",      "roberta-base",       False),
    ]
    others = ["TextCNN", "MLP"]

    # 6. 三種分類策略
    strategies = [
        "flat_chapter",          # 直接分類 subject+chapter
        "flat_section_then_map", # 先分類 subject+chapter+section，再 map 回 chapter
        "hierarchical"           # 先分 subject，再分 chapter
    ]

    results = []

    for model_name, hf_model_name, use_dora in model_types:
        for strat in strategies:
            exp_name = f"{model_name}__{strat}"
            output_dir = f"./outputs/{exp_name}"
            os.makedirs(output_dir, exist_ok=True)

            print(f"\n▶開始實驗: {exp_name}")

            if strat == "flat_chapter":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_chapter),
                    train_texts=train_texts,
                    train_labels=train_chap_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_chap_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                results.append((exp_name, metrics))

            elif strat == "flat_section_then_map":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_section),
                    train_texts=train_texts,
                    train_labels=train_sect_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_sect_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                # 如需計算 chapter fine，須先批次推論再做 mapping
                results.append((exp_name, metrics))

            elif strat == "hierarchical":
                sub_output = os.path.join(output_dir, "subject_model")
                chap_output = os.path.join(output_dir, "chapter_model")
                os.makedirs(sub_output, exist_ok=True)
                os.makedirs(chap_output, exist_ok=True)

                # 1) 訓練 subject
                sub_metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_subject),
                    train_texts=train_texts,
                    train_labels=train_subj_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_subj_labels,
                    output_dir=sub_output,
                    device=device,
                    use_dora=use_dora
                )

                # 2) 針對每個 subject 訓練 chapter classifier
                chap_metrics = {}
                for subj_id, subj_name in id2label_subject.items():
                    idx_train = [i for i, s in enumerate(train_subj_labels) if s == subj_id]
                    idx_valid = [i for i, s in enumerate(valid_subj_labels) if s == subj_id]

                    sub_train_texts = [train_texts[i] for i in idx_train]
                    sub_train_chaps = [train_chap_labels[i] for i in idx_train]
                    sub_valid_texts = [valid_texts[i] for i in idx_valid]
                    sub_valid_chaps = [valid_chap_labels[i] for i in idx_valid]

                    unique_chaps = sorted({train_chap_labels[i] for i in idx_train})
                    if len(unique_chaps) < 2:
                        continue

                    sub_dir = os.path.join(chap_output, f"subj_{subj_id}")
                    os.makedirs(sub_dir, exist_ok=True)
                    sub_chap_m = train_flat_transformer(
                        model_name=hf_model_name,
                        num_labels=len(unique_chaps),
                        train_texts=sub_train_texts,
                        train_labels=sub_train_chaps,
                        valid_texts=sub_valid_texts,
                        valid_labels=sub_valid_chaps,
                        output_dir=sub_dir,
                        device=device,
                        use_dora=use_dora
                    )
                    chap_metrics[subj_id] = sub_chap_m

                results.append((exp_name, {"subject": sub_metrics, "chapter": chap_metrics}))

    # TextCNN + MLP 只負責 flat_chapter
    for model_name in others:
        strat = "flat_chapter"
        exp_name = f"{model_name}__{strat}"
        output_dir = f"./outputs/{exp_name}"
        os.makedirs(output_dir, exist_ok=True)
        print(f"\n▶開始實驗: {exp_name}")

        if model_name == "TextCNN":
            start = time.time()
            metrics = train_textcnn(
                vocab_size=vocab_size,
                train_texts=train_texts,
                train_labels=train_chap_labels,
                valid_texts=valid_texts,
                valid_labels=valid_chap_labels,
                tokenizer=bert_tok,
                num_labels=len(label2id_chapter),
                output_dir=output_dir,
                device=device
            )
            elapsed = time.time() - start
            metrics["train_time"] = elapsed
            results.append((exp_name, metrics))

        elif model_name == "MLP":
            train_enc = bert_tok(train_texts, padding=True, truncation=True, return_tensors="pt")
            valid_enc = bert_tok(valid_texts, padding=True, truncation=True, return_tensors="pt")

            bert_model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased", output_hidden_states=True
            )
            bert_model.to(device)
            bert_model.eval()

            def encode_avg(inputs):
                input_ids = inputs["input_ids"].to(device)
                attention_mask = inputs["attention_mask"].to(device)
                with torch.no_grad():
                    outputs = bert_model.bert(input_ids, attention_mask=attention_mask)
                    last_hidden = outputs.last_hidden_state  # (B, L, D)
                    mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
                    summed = torch.sum(last_hidden * mask, 1)
                    counts = torch.clamp(mask.sum(1), min=1e-9)
                    avg_pooled = summed / counts
                    return avg_pooled.cpu()

            train_feats = encode_avg(train_enc)
            valid_feats = encode_avg(valid_enc)

            input_dim = train_feats.size(1)
            hidden_dim = 256
            model = BasicMLP(input_dim, hidden_dim, num_classes=len(label2id_chapter))
            model.to(device)

            train_labels_tensor = torch.tensor(train_chap_labels)
            valid_labels_tensor = torch.tensor(valid_chap_labels)

            optimizer = optim.Adam(model.parameters(), lr=1e-4)
            criterion = nn.CrossEntropyLoss()

            start = time.time()
            best_f1 = 0.0
            for epoch in range(1, 6):
                model.train()
                optimizer.zero_grad()
                logits = model(train_feats.to(device))
                loss = criterion(logits, train_labels_tensor.to(device))
                loss.backward()
                optimizer.step()

                model.eval()
                with torch.no_grad():
                    v_logits = model(valid_feats.to(device))
                    v_preds = torch.argmax(v_logits, dim=1).cpu().numpy()
                    v_labels = valid_labels_tensor.numpy()
                    _, _, v_f1, _ = precision_recall_fscore_support(v_labels, v_preds, average="macro", zero_division=0)

                if v_f1 > best_f1:
                    best_f1 = v_f1
                    torch.save(model.state_dict(), f"{output_dir}/best_mlp.pt")
                print(f"Epoch {epoch} | Valid F1 {v_f1:.4f}")

            elapsed = time.time() - start
            total_size = 0
            for root, _, files in os.walk(output_dir):
                for fname in files:
                    total_size += os.path.getsize(os.path.join(root, fname))

            results.append((exp_name, {
                "accuracy":    None,
                "precision":   None,
                "recall":      None,
                "f1":          best_f1,
                "train_time":  elapsed,
                "model_size":  total_size
            }))

    # 8. 把所有實驗結果匯出
    out_df = pd.DataFrame([{"experiment": name, **metrics} for name, metrics in results])
    out_df.to_csv("experiment_results.csv", index=False)
    print("\n所有實驗完成，結果已存到 experiment_results.csv")

if __name__ == "__main__":
    main_pipeline()



▶開始實驗: DoRA_RoBERTa__flat_chapter


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,2.6415,1.824183,0.468938,0.425457,0.345104,0.331096
2,1.6069,1.364425,0.596526,0.524921,0.474058,0.470217
3,1.2697,1.174923,0.650635,0.570335,0.543957,0.534305
4,1.0876,1.056894,0.680027,0.583057,0.568988,0.566207
5,0.9739,1.013577,0.698063,0.642917,0.591359,0.59609
6,0.8878,0.964561,0.711423,0.629691,0.609841,0.609068
7,0.8024,0.932254,0.727455,0.644827,0.625918,0.628278
8,0.7567,0.910026,0.728791,0.647678,0.629052,0.630642



▶開始實驗: DoRA_RoBERTa__flat_section_then_map


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,3.8403,3.161893,0.267201,0.15312,0.133818,0.118461
2,2.9222,2.598913,0.382098,0.22627,0.21032,0.198403
3,2.3462,2.030571,0.480294,0.332805,0.310361,0.291222
4,1.904,1.780325,0.53507,0.401491,0.391488,0.37105
5,1.632,1.6057,0.57515,0.449016,0.436769,0.414048
6,1.4661,1.508339,0.590514,0.477029,0.458082,0.442144
7,1.3396,1.451455,0.611222,0.503476,0.480708,0.463024
8,1.2564,1.421235,0.621242,0.509115,0.490593,0.47342



▶開始實驗: DoRA_RoBERTa__hierarchical


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.1682,0.097383,0.96994,0.966574,0.970724,0.968538
2,0.0828,0.0945,0.970608,0.973685,0.964749,0.968814
3,0.0623,0.06212,0.9833,0.983501,0.981354,0.982403
4,0.045,0.07838,0.9833,0.983188,0.981665,0.982414
5,0.0376,0.09318,0.982632,0.983279,0.980186,0.981681
6,0.0321,0.113129,0.979292,0.980927,0.975587,0.978109
7,0.0242,0.113351,0.981296,0.982534,0.97816,0.980247
8,0.0236,0.108515,0.982632,0.983279,0.980186,0.981681


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,1.9449,1.268734,0.603939,0.629555,0.565879,0.557811
2,1.1727,0.978309,0.683807,0.680773,0.65238,0.656122
3,0.9096,0.811539,0.743982,0.736183,0.711048,0.718288
4,0.7597,0.731603,0.758206,0.755476,0.729023,0.735986
5,0.6607,0.668726,0.78884,0.781517,0.771958,0.771718
6,0.5873,0.652965,0.805252,0.807286,0.791714,0.791014
7,0.5369,0.62675,0.811816,0.814444,0.803152,0.806837
8,0.5062,0.620007,0.811816,0.814242,0.796511,0.802098


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [12]:
import os
import time
import torch
import random
import numpy as np
import pandas as pd

from typing import Dict, List
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType  # 只留 LoraConfig, get_peft_model, TaskType

# =============================================================================
# 1. 固定亂數種子，確保結果可重現
# =============================================================================
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# =============================================================================
# 2. 資料載入與 Label Processing
# =============================================================================
def load_and_prepare(subjects: List[str]) -> pd.DataFrame:
    """
    讀取多個 subject 的資料，合併後做 label encoding：
      - label_subject
      - label_chapter
      - label_section
    """
    def load_and_merge(subject: str) -> pd.DataFrame:
        base_path = f"{subject}_Database"
        qdf = pd.read_csv(f"{base_path}/{subject}_question_bank.csv")
        cdf = pd.read_csv(f"{base_path}/{subject}_chapter_list.csv")
        qdf.columns = qdf.columns.str.strip().str.lower().str.replace(" ", "_")
        cdf.columns = cdf.columns.str.strip().str.lower().str.replace(" ", "_")
        df = qdf.merge(cdf, on="section_name", how="left")
        df["subject"] = subject
        return df

    # 合併指定 subjects 的資料
    df = pd.concat([load_and_merge(s) for s in subjects], ignore_index=True)
    df = df[["subject", "chapter_name_x", "section_name", "ques_detl"]].dropna().reset_index(drop=True)

    # subject label
    df["label_str"] = df["subject"]
    label2id_subject = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_subject = {i: lab for lab, i in label2id_subject.items()}
    df["label_subject"] = df["label_str"].map(label2id_subject)

    # chapter label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"]
    label2id_chapter = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_chapter = {i: lab for lab, i in label2id_chapter.items()}
    df["label_chapter"] = df["label_str"].map(label2id_chapter)

    # section label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"] + "::" + df["section_name"]
    label2id_section = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_section = {i: lab for lab, i in label2id_section.items()}
    df["label_section"] = df["label_str"].map(label2id_section)

    # 只保留至少出現兩次的 section
    vc = df["label_section"].value_counts()
    valid_secs = set(vc[vc >= 2].index)
    df = df[df["label_section"].isin(valid_secs)].reset_index(drop=True)

    return df, (label2id_subject, id2label_subject), (label2id_chapter, id2label_chapter), (label2id_section, id2label_section)

# =============================================================================
# 3. 自訂 Dataset
# =============================================================================
class TextDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer,
        max_len: int = 128,
        mode: str = "flat_chapter",  # 'flat_chapter', 'flat_section', 'hierarchical'
        subject_labels: List[int] = None
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode
        self.subject_labels = subject_labels  # 只有 hierarchical 模式才需要

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.mode == "hierarchical":
            item["subject_labels"] = torch.tensor(self.subject_labels[idx], dtype=torch.long)
        return item

# =============================================================================
# 4. TextCNN Model 定義
# =============================================================================
class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_classes: int,
        kernel_sizes: List[int] = [3,4,5],
        num_filters: int = 100,
        dropout_p: float = 0.5
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids):
        """
        input_ids: (B, L)
        return: logits (B, num_classes)
        """
        x = self.embedding(input_ids)        # (B, L, D)
        x = x.permute(0, 2, 1)               # (B, D, L)
        conv_outs = []
        for conv in self.convs:
            c = conv(x)                      # (B, F, L - k + 1)
            c = torch.relu(c)                # (B, F, L - k + 1)
            c = torch.max_pool1d(c, kernel_size=c.size(2))  # (B, F, 1)
            conv_outs.append(c.squeeze(2))   # (B, F)
        cat = torch.cat(conv_outs, dim=1)    # (B, F * len(kernel_sizes))
        drop = self.dropout(cat)             # (B, F * len(kernel_sizes))
        logits = self.fc(drop)               # (B, num_classes)
        return logits

# =============================================================================
# 5. 基礎 MLP Model 定義 (只用於 flat 模式)
# =============================================================================
class BasicMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, features):
        """
        features: (B, input_dim)
        return: logits (B, num_classes)
        """
        x = self.fc1(features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

# =============================================================================
# 6. 訓練與評估函式
# =============================================================================
def compute_metrics(preds_and_labels) -> Dict[str, float]:
    logits, labels = preds_and_labels
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    }

def train_flat_transformer(
    model_name: str,
    num_labels: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    output_dir: str,
    device: torch.device,
    use_dora: bool = False
) -> Dict[str, float]:
    """
    訓練 flat (Chapter or Section) 模式的小型 Transformer (BERT/ RoBERTa)，回傳 metrics。
    如果 use_dora=True，會在模型上套 DoRA Adapter（LoraConfig）。
    """
    # 1. 選擇 tokenizer & base_model
    if "roberta" in model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        base_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # 2. 如果要用 DoRA，包成 PEFT 模型
    if use_dora:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["query", "value"],
            use_dora=True,
        )
        model = get_peft_model(base_model, peft_config)
    else:
        model = base_model

    model.to(device)

    # 3. 構造 Dataset
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=128, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=128, mode="flat_chapter")

    # 4. TrainingArguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5 if not use_dora else 3e-4,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_strategy="epoch"
    )

    # 5. 初始化 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # 6. 訓練 & 驗證
    start = time.time()
    trainer.train()
    elapsed = time.time() - start

    metrics = trainer.evaluate()

    # 7. 儲存 adapter（若有 DoRA）或整個模型
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # 8. 計算檔案大小
    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    metrics["eval_accuracy"],
        "precision":   metrics["eval_precision"],
        "recall":      metrics["eval_recall"],
        "f1":          metrics["eval_f1"],
        "train_time":  elapsed,
        "model_size":  total_size
    }

def train_textcnn(
    vocab_size: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    tokenizer,
    num_labels: int,
    output_dir: str,
    device: torch.device
) -> Dict[str, float]:
    """
    訓練 TextCNN (flat 模式)，回傳 metrics (accuracy, precision, recall, f1, train_time, model_size)。
    """
    max_len = 128
    batch_size = 32
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=max_len, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=max_len, mode="flat_chapter")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    embed_dim = 300
    model = TextCNN(vocab_size=vocab_size, embed_dim=embed_dim, num_classes=num_labels)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

    best_f1 = 0.0
    best_metrics = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
    start_time = time.time()

    for epoch in range(1, 6):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * input_ids.size(0)
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.detach().cpu().numpy())

        train_loss = total_loss / len(train_loader.dataset)
        train_acc = accuracy_score(all_labels, all_preds)
        train_prec, train_rec, train_f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="macro", zero_division=0
        )

        model.eval()
        total_vloss = 0.0
        v_preds, v_labels = [], []
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                logits = model(input_ids)
                loss = criterion(logits, labels)
                total_vloss += loss.item() * input_ids.size(0)
                preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
                v_preds.extend(preds)
                v_labels.extend(labels.detach().cpu().numpy())

        valid_loss = total_vloss / len(valid_loader.dataset)
        valid_acc = accuracy_score(v_labels, v_preds)
        valid_prec, valid_rec, valid_f1, _ = precision_recall_fscore_support(
            v_labels, v_preds, average="macro", zero_division=0
        )

        scheduler.step()

        if valid_f1 > best_f1:
            best_f1 = valid_f1
            best_metrics = {
                "accuracy":  valid_acc,
                "precision": valid_prec,
                "recall":    valid_rec,
                "f1":        valid_f1
            }
            # 存模型前先確保 output_dir 存在
            os.makedirs(output_dir, exist_ok=True)
            torch.save(model.state_dict(), f"{output_dir}/best_textcnn.pt")

        print(
            f"[TextCNN] Epoch {epoch} | "
            f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f} | "
            f"Valid Loss: {valid_loss:.4f} | Valid Acc: {valid_acc:.4f} | Valid F1: {valid_f1:.4f}"
        )

    elapsed = time.time() - start_time

    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    best_metrics["accuracy"],
        "precision":   best_metrics["precision"],
        "recall":      best_metrics["recall"],
        "f1":          best_metrics["f1"],
        "train_time":  elapsed,
        "model_size":  total_size
    }

# =============================================================================
# 7. Pipeline 主程序：整合上述所有 case
# =============================================================================
def main_pipeline():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1. 載入資料與 Labels
    df, subj_map, chap_map, sect_map = load_and_prepare(["math", "science"]) # 社會科改成合適的 subject
    label2id_subject, id2label_subject = subj_map
    label2id_chapter, id2label_chapter = chap_map
    label2id_section, id2label_section = sect_map

    # 2. 切 train/valid/test（90/10 → 再分81/9/10)
    base = df[["ques_detl", "label_subject", "label_chapter", "label_section", "subject"]].copy()

    # 先切出 10% 做最終 test
    rest, test = train_test_split(
        base, test_size=0.1, stratify=base["label_chapter"], random_state=42
    )

    # 再把 rest 切成 90/10 ≈ 81%/9% 做 train/valid
    train_df, valid_df = train_test_split(
        rest, test_size=0.1, stratify=rest["label_chapter"], random_state=42
    )

    # 取出 text 和各種 label lists
    train_texts = train_df["ques_detl"].tolist()
    valid_texts = valid_df["ques_detl"].tolist()
    test_texts  = test["ques_detl"].tolist()

    train_subj_labels = train_df["label_subject"].tolist()
    valid_subj_labels = valid_df["label_subject"].tolist()
    test_subj_labels  = test["label_subject"].tolist()

    train_chap_labels = train_df["label_chapter"].tolist()
    valid_chap_labels = valid_df["label_chapter"].tolist()
    test_chap_labels  = test["label_chapter"].tolist()

    train_sect_labels = train_df["label_section"].tolist()
    valid_sect_labels = valid_df["label_section"].tolist()
    test_sect_labels  = test["label_section"].tolist()

    # 3. 建立 tokenizer (給所有 TextCNN/Transformer 共用)
    bert_tok    = BertTokenizerFast.from_pretrained("bert-base-uncased")
    roberta_tok = RobertaTokenizerFast.from_pretrained("roberta-base")

    # 4. 建立詞彙大小 (TextCNN 用)
    vocab_size = bert_tok.vocab_size

    # 5. 實驗配置
    model_types = [
        # 範例：("BERT", "bert-base-uncased", False),
        #        ("RoBERTa", "roberta-base", False),
        #        ("DoRA_BERT", "bert-base-uncased", True),
        #        ("DoRA_RoBERTa", "roberta-base", True),
    ]
    others = ["TextCNN", "MLP"]

    # 6. 三種分類策略
    strategies = [
        "flat_chapter",          # 直接分類 subject+chapter
        "flat_section",          # 直接分類 subject+chapter+section
        "hierarchical"           # 先分 subject，再分 chapter
    ]

    results = []

    # (A) 先跑 Transformer 部分 (示例)
    for model_name, hf_model_name, use_dora in model_types:
        for strat in strategies:
            exp_name = f"{model_name}__{strat}"
            output_dir = f"./outputs/{exp_name}"
            os.makedirs(output_dir, exist_ok=True)

            print(f"\n▶開始實驗 (Transformer): {exp_name}")

            if strat == "flat_chapter":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_chapter),
                    train_texts=train_texts,
                    train_labels=train_chap_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_chap_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                results.append((exp_name, metrics))

            elif strat == "flat_section":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_section),
                    train_texts=train_texts,
                    train_labels=train_sect_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_sect_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                results.append((exp_name, metrics))

            elif strat == "hierarchical":
                # hierarchical: 先訓練 subject，再分 subj_id 針對 chapter
                sub_output = os.path.join(output_dir, "subj_model")
                chap_output = os.path.join(output_dir, "chapter_model")
                os.makedirs(sub_output, exist_ok=True)
                os.makedirs(chap_output, exist_ok=True)

                print("  [Hierarchical] Step 1: 訓練 subject 模型")
                sub_metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_subject),
                    train_texts=train_texts,
                    train_labels=train_subj_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_subj_labels,
                    output_dir=sub_output,
                    device=device,
                    use_dora=use_dora
                )

                chap_metrics = {}
                print("  [Hierarchical] Step 2: 針對每個 subject 訓練 chapter 分類器")
                for subj_id, subj_name in id2label_subject.items():
                    idx_train = [i for i, s in enumerate(train_subj_labels) if s == subj_id]
                    idx_valid = [i for i, s in enumerate(valid_subj_labels) if s == subj_id]

                    sub_train_texts = [train_texts[i] for i in idx_train]
                    sub_train_chaps = [train_chap_labels[i] for i in idx_train]
                    sub_valid_texts = [valid_texts[i] for i in idx_valid]
                    sub_valid_chaps = [valid_chap_labels[i] for i in idx_valid]

                    unique_chaps = sorted({train_chap_labels[i] for i in idx_train})
                    if len(unique_chaps) < 2:
                        continue

                    sub_dir = os.path.join(chap_output, f"subj_{subj_id}")
                    os.makedirs(sub_dir, exist_ok=True)

                    print(f"    [Subject {subj_id}] Training chapter classifier (num_labels={len(unique_chaps)})")
                    sub_chap_m = train_flat_transformer(
                        model_name=hf_model_name,
                        num_labels=len(unique_chaps),
                        train_texts=sub_train_texts,
                        train_labels=sub_train_chaps,
                        valid_texts=sub_valid_texts,
                        valid_labels=sub_valid_chaps,
                        output_dir=sub_dir,
                        device=device,
                        use_dora=use_dora
                    )
                    chap_metrics[subj_id] = sub_chap_m

                results.append((exp_name, {"subject_metrics": sub_metrics, "chapter_metrics": chap_metrics}))

    # =============================================================================
    # (B) TextCNN + MLP 只負責三種策略 (flat_chapter, flat_section, hierarchical)
    # =============================================================================
    for model_name in others:
        for strat in strategies:
            exp_name = f"{model_name}__{strat}"
            output_dir = f"./outputs/{exp_name}"
            os.makedirs(output_dir, exist_ok=True)
            print(f"\n▶開始實驗 (TextCNN/MLP): {exp_name}")

            # 如果是 hierarchical，先在這裡幫它建好 subdir
            if strat == "hierarchical":
                os.makedirs(os.path.join(output_dir, "subj_model"), exist_ok=True)
                os.makedirs(os.path.join(output_dir, "chapter_model"), exist_ok=True)

            if strat == "flat_chapter":
                cur_train_labels = train_chap_labels
                cur_valid_labels = valid_chap_labels
                num_labels = len(label2id_chapter)

            elif strat == "flat_section":
                cur_train_labels = train_sect_labels
                cur_valid_labels = valid_sect_labels
                num_labels = len(label2id_section)

            elif strat == "hierarchical":
                cur_results = {"subject": {}, "chapter": {}}

                # Step1: 訓練 subject 分類器
                print("  [Hierarchical] Step1: 訓練 subject 分類器")
                # 先確保 subj_model 存在
                subj_model_dir = os.path.join(output_dir, "subj_model")
                os.makedirs(subj_model_dir, exist_ok=True)

                if model_name == "TextCNN":
                    # 用 chapter labels 當作 proxy 來當作 subject 分類（TextCNN 只支援 flat_chapter）
                    sub_metrics = train_textcnn(
                        vocab_size=vocab_size,
                        train_texts=train_texts,
                        train_labels=train_subj_labels,
                        valid_texts=valid_texts,
                        valid_labels=valid_subj_labels,
                        tokenizer=bert_tok,
                        num_labels=len(label2id_subject),
                        output_dir=subj_model_dir,
                        device=device
                    )
                else:  # MLP 版 subject
                    enc_train = bert_tok(train_texts, padding=True, truncation=True, return_tensors="pt")
                    enc_valid = bert_tok(valid_texts, padding=True, truncation=True, return_tensors="pt")

                    bert_model = BertForSequenceClassification.from_pretrained(
                        "bert-base-uncased", output_hidden_states=True
                    ).to(device)
                    bert_model.eval()

                    def encode_avg(inputs, batch_size=32):
                        input_ids = inputs["input_ids"]
                        attention_mask = inputs["attention_mask"]
                        feats = []
                        with torch.no_grad():
                            for i in range(0, input_ids.size(0), batch_size):
                                end_i = min(i + batch_size, input_ids.size(0))
                                b_ids = input_ids[i:end_i].to(device)
                                b_att = attention_mask[i:end_i].to(device)
                                outs = bert_model.bert(b_ids, attention_mask=b_att).last_hidden_state
                                mask = b_att.unsqueeze(-1).expand_as(outs).float()
                                summed = torch.sum(outs * mask, dim=1)
                                counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                                avg = (summed / counts).cpu()
                                feats.append(avg)
                                del b_ids, b_att, outs, mask, summed, counts
                                torch.cuda.empty_cache()
                        return torch.cat(feats, dim=0)

                    subj_train_feats = encode_avg(enc_train)
                    subj_valid_feats = encode_avg(enc_valid)

                    subj_model = BasicMLP(subj_train_feats.size(1), 256, len(label2id_subject)).to(device)
                    subj_train_ds = TensorDataset(subj_train_feats, torch.tensor(train_subj_labels))
                    subj_valid_ds = TensorDataset(subj_valid_feats, torch.tensor(valid_subj_labels))
                    subj_train_loader = DataLoader(subj_train_ds, batch_size=64, shuffle=True)
                    subj_valid_loader = DataLoader(subj_valid_ds, batch_size=64, shuffle=False)

                    opt = optim.Adam(subj_model.parameters(), lr=1e-4)
                    crit = nn.CrossEntropyLoss()

                    best_f1_s = 0.0
                    best_metrics_s = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
                    st = time.time()
                    for epoch in range(1, 6):
                        subj_model.train()
                        train_preds, train_labels_acc = [], []
                        run_loss = 0.0
                        for xb, yb in subj_train_loader:
                            xb, yb = xb.to(device), yb.to(device)
                            opt.zero_grad()
                            logits = subj_model(xb)
                            loss = crit(logits, yb)
                            loss.backward()
                            opt.step()
                            run_loss += loss.item() * xb.size(0)
                            pr = torch.argmax(logits, dim=1).cpu().numpy()
                            train_preds.extend(pr)
                            train_labels_acc.extend(yb.cpu().numpy())

                        train_acc_s = accuracy_score(train_labels_acc, train_preds)
                        _, _, train_f1_s, _ = precision_recall_fscore_support(
                            train_labels_acc, train_preds, average="macro", zero_division=0
                        )

                        subj_model.eval()
                        val_preds_s, val_labels_s = [], []
                        vloss_s = 0.0
                        with torch.no_grad():
                            for xb, yb in subj_valid_loader:
                                xb, yb = xb.to(device), yb.to(device)
                                logits = subj_model(xb)
                                loss = crit(logits, yb)
                                vloss_s += loss.item() * xb.size(0)
                                pr = torch.argmax(logits, dim=1).cpu().numpy()
                                val_preds_s.extend(pr)
                                val_labels_s.extend(yb.cpu().numpy())

                        valid_acc_s = accuracy_score(val_labels_s, val_preds_s)
                        _, _, valid_f1_s, _ = precision_recall_fscore_support(
                            val_labels_s, val_preds_s, average="macro", zero_division=0
                        )

                        if valid_f1_s > best_f1_s:
                            best_f1_s = valid_f1_s
                            best_metrics_s = {
                                "accuracy":  valid_acc_s,
                                "precision": precision_score(val_labels_s, val_preds_s, average="macro", zero_division=0),
                                "recall":    recall_score(val_labels_s, val_preds_s, average="macro", zero_division=0),
                                "f1":        valid_f1_s
                            }
                            # 確保子目錄存在
                            save_dir_s = os.path.join(output_dir, "subj_model")
                            os.makedirs(save_dir_s, exist_ok=True)
                            torch.save(subj_model.state_dict(), os.path.join(save_dir_s, "best_subj_mlp.pt"))

                        print(
                            f"    [Subject MLP Epoch {epoch}] "
                            f"Train Acc: {train_acc_s:.4f} | Train F1: {train_f1_s:.4f} | "
                            f"Valid Acc: {valid_acc_s:.4f} | Valid F1: {valid_f1_s:.4f}"
                        )

                    elapsed_s = time.time() - st
                    total_sz_s = 0
                    save_dir_s = os.path.join(output_dir, "subj_model")
                    for root, _, files in os.walk(save_dir_s):
                        for fname in files:
                            total_sz_s += os.path.getsize(os.path.join(root, fname))
                    sub_metrics = {
                        "accuracy":    best_metrics_s["accuracy"],
                        "precision":   best_metrics_s["precision"],
                        "recall":      best_metrics_s["recall"],
                        "f1":          best_metrics_s["f1"],
                        "train_time":  elapsed_s,
                        "model_size":  total_sz_s
                    }

                cur_results["subject"][model_name] = sub_metrics  # TextCNN 或 MLP 的 subject 部分

                # Step2: 針對每個 subject 訓練 chapter 分類器
                print("  [Hierarchical] Step2: 針對每個 subject 訓練 chapter 分類器")
                for subj_id, subj_name in id2label_subject.items():
                    idx_train = [i for i, s in enumerate(train_subj_labels) if s == subj_id]
                    idx_valid = [i for i, s in enumerate(valid_subj_labels) if s == subj_id]

                    sub_train_texts = [train_texts[i] for i in idx_train]
                    sub_train_chaps = [train_chap_labels[i] for i in idx_train]
                    sub_valid_texts = [valid_texts[i] for i in idx_valid]
                    sub_valid_chaps = [valid_chap_labels[i] for i in idx_valid]

                    unique_chaps = sorted({train_chap_labels[i] for i in idx_train})
                    if len(unique_chaps) < 2:
                        continue

                    chap_dir = os.path.join(output_dir, "chapter_model", f"subj_{subj_id}")
                    os.makedirs(chap_dir, exist_ok=True)

                    if model_name == "TextCNN":
                        chap_m = train_textcnn(
                            vocab_size=vocab_size,
                            train_texts=sub_train_texts,
                            train_labels=sub_train_chaps,
                            valid_texts=sub_valid_texts,
                            valid_labels=sub_valid_chaps,
                            tokenizer=bert_tok,
                            num_labels=len(unique_chaps),
                            output_dir=chap_dir,
                            device=device
                        )
                    else:  # MLP
                        enc_tr = bert_tok(sub_train_texts, padding=True, truncation=True, return_tensors="pt")
                        enc_vd = bert_tok(sub_valid_texts, padding=True, truncation=True, return_tensors="pt")
                        # 載入 BERT 作特徵擷取
                        bert_model = BertForSequenceClassification.from_pretrained(
                            "bert-base-uncased", output_hidden_states=True
                        ).to(device)
                        bert_model.eval()

                        def encode_avg(inputs, batch_size=32):
                            input_ids = inputs["input_ids"]
                            attention_mask = inputs["attention_mask"]
                            feats = []
                            with torch.no_grad():
                                for i in range(0, input_ids.size(0), batch_size):
                                    end_i = min(i + batch_size, input_ids.size(0))
                                    b_ids = input_ids[i:end_i].to(device)
                                    b_att = attention_mask[i:end_i].to(device)
                                    outs = bert_model.bert(b_ids, attention_mask=b_att).last_hidden_state
                                    mask = b_att.unsqueeze(-1).expand_as(outs).float()
                                    summed = torch.sum(outs * mask, dim=1)
                                    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                                    avg = (summed / counts).cpu()
                                    feats.append(avg)
                                    del b_ids, b_att, outs, mask, summed, counts
                                    torch.cuda.empty_cache()
                            return torch.cat(feats, dim=0)

                        tr_feats = encode_avg(enc_tr)
                        vd_feats = encode_avg(enc_vd)

                        mlp_model = BasicMLP(tr_feats.size(1), 256, len(unique_chaps)).to(device)
                        tr_ds = TensorDataset(tr_feats, torch.tensor(sub_train_chaps))
                        vd_ds = TensorDataset(vd_feats, torch.tensor(sub_valid_chaps))
                        tr_loader = DataLoader(tr_ds, batch_size=64, shuffle=True)
                        vd_loader = DataLoader(vd_ds, batch_size=64, shuffle=False)

                        opt_mc = optim.Adam(mlp_model.parameters(), lr=1e-4)
                        crit_mc = nn.CrossEntropyLoss()

                        best_f1_c = 0.0
                        best_metrics_c = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
                        st_c = time.time()
                        for epoch in range(1, 6):
                            mlp_model.train()
                            preds_tr, labels_tr = [], []
                            runloss = 0.0
                            for xb, yb in tr_loader:
                                xb, yb = xb.to(device), yb.to(device)
                                opt_mc.zero_grad()
                                logits = mlp_model(xb)
                                loss = crit_mc(logits, yb)
                                loss.backward()
                                opt_mc.step()
                                runloss += loss.item() * xb.size(0)
                                pr = torch.argmax(logits, dim=1).cpu().numpy()
                                preds_tr.extend(pr)
                                labels_tr.extend(yb.cpu().numpy())
                            acc_tr_c = accuracy_score(labels_tr, preds_tr)
                            _, _, f1_tr_c, _ = precision_recall_fscore_support(
                                labels_tr, preds_tr, average="macro", zero_division=0
                            )

                            mlp_model.eval()
                            preds_vd, labels_vd = [], []
                            vl = 0.0
                            with torch.no_grad():
                                for xb, yb in vd_loader:
                                    xb, yb = xb.to(device), yb.to(device)
                                    logits = mlp_model(xb)
                                    loss = crit_mc(logits, yb)
                                    vl += loss.item() * xb.size(0)
                                    pr = torch.argmax(logits, dim=1).cpu().numpy()
                                    preds_vd.extend(pr)
                                    labels_vd.extend(yb.cpu().numpy())
                            acc_vd_c = accuracy_score(labels_vd, preds_vd)
                            _, _, f1_vd_c, _ = precision_recall_fscore_support(
                                labels_vd, preds_vd, average="macro", zero_division=0
                            )
                            if f1_vd_c > best_f1_c:
                                best_f1_c = f1_vd_c
                                best_metrics_c = {
                                    "accuracy":  acc_vd_c,
                                    "precision": precision_score(labels_vd, preds_vd, average="macro", zero_division=0),
                                    "recall":    recall_score(labels_vd, preds_vd, average="macro", zero_division=0),
                                    "f1":        f1_vd_c
                                }
                                # 確保 chap_dir 存在
                                os.makedirs(chap_dir, exist_ok=True)
                                torch.save(mlp_model.state_dict(), os.path.join(chap_dir, "best_chap_mlp.pt"))

                            print(
                                f"    [Chapter MLP subj_{subj_id} Epoch {epoch}] "
                                f"Train Acc: {acc_tr_c:.4f} | Train F1: {f1_tr_c:.4f} | "
                                f"Valid Acc: {acc_vd_c:.4f} | Valid F1: {f1_vd_c:.4f}"
                            )

                        elapsed_c = time.time() - st_c
                        total_sz_c = 0
                        for root, _, files in os.walk(chap_dir):
                            for fname in files:
                                total_sz_c += os.path.getsize(os.path.join(root, fname))

                        chap_m = {
                            "accuracy":    best_metrics_c["accuracy"],
                            "precision":   best_metrics_c["precision"],
                            "recall":      best_metrics_c["recall"],
                            "f1":          best_metrics_c["f1"],
                            "train_time":  elapsed_c,
                            "model_size":  total_sz_c
                        }

                    cur_results["chapter"][subj_id] = chap_m

                results.append((exp_name, cur_results))
                continue  # 跳過後面的 flat 處理

            # 以下為 flat_chapter 或 flat_section 的 TextCNN/MLP 處理
            if strat in ["flat_chapter", "flat_section"]:
                if strat == "flat_chapter":
                    cur_train_labels = train_chap_labels
                    cur_valid_labels = valid_chap_labels
                    num_labels = len(label2id_chapter)
                else:  # flat_section
                    cur_train_labels = train_sect_labels
                    cur_valid_labels = valid_sect_labels
                    num_labels = len(label2id_section)

                if model_name == "TextCNN":
                    print("  [TextCNN] Step1: 開始訓練")
                    start = time.time()

                    metrics = train_textcnn(
                        vocab_size=vocab_size,
                        train_texts=train_texts,
                        train_labels=cur_train_labels,
                        valid_texts=valid_texts,
                        valid_labels=cur_valid_labels,
                        tokenizer=bert_tok,
                        num_labels=num_labels,
                        output_dir=output_dir,
                        device=device
                    )
                    elapsed = time.time() - start
                    metrics["train_time"] = elapsed

                    print(f"    → accuracy:  {metrics['accuracy']:.4f}")
                    print(f"    → precision: {metrics['precision']:.4f}")
                    print(f"    → recall:    {metrics['recall']:.4f}")
                    print(f"    → f1:        {metrics['f1']:.4f}")
                    print(f"    → model_size: {metrics['model_size']} bytes")

                    results.append((exp_name, metrics))

                else:  # MLP
                    print("  [MLP] Step1: 用 tokenizer 編碼到 CPU")
                    train_enc = bert_tok(
                        train_texts,
                        padding=True,
                        truncation=True,
                        return_tensors="pt"
                    )
                    valid_enc = bert_tok(
                        valid_texts,
                        padding=True,
                        truncation=True,
                        return_tensors="pt"
                    )
                    print(f"    → train_enc.shape: {train_enc['input_ids'].shape}")
                    print(f"    → valid_enc.shape: {train_enc['input_ids'].shape}")

                    print("  [MLP] Step2: 載入 BERT 做特徵擷取")
                    bert_model = BertForSequenceClassification.from_pretrained(
                        "bert-base-uncased", output_hidden_states=True
                    ).to(device)
                    bert_model.eval()

                    def encode_avg_dataset(inputs, batch_size: int = 32):
                        input_ids = inputs["input_ids"]
                        attention_mask = inputs["attention_mask"]
                        features_list = []

                        with torch.no_grad():
                            for i in range(0, input_ids.size(0), batch_size):
                                end = min(i + batch_size, input_ids.size(0))
                                b_ids = input_ids[i:end].to(device)
                                b_att = attention_mask[i:end].to(device)
                                outs = bert_model.bert(b_ids, attention_mask=b_att).last_hidden_state
                                mask = b_att.unsqueeze(-1).expand_as(outs).float()
                                summed = torch.sum(outs * mask, dim=1)
                                counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                                avg = (summed / counts).cpu()
                                features_list.append(avg)
                                del b_ids, b_att, outs, mask, summed, counts
                                torch.cuda.empty_cache()
                        return torch.cat(features_list, dim=0)

                    print("  [MLP] Step3: 計算 train_feats")
                    train_feats = encode_avg_dataset(train_enc, batch_size=32)
                    print(f"    → train_feats.shape = {train_feats.shape}")

                    print("  [MLP] Step4: 計算 valid_feats")
                    valid_feats = encode_avg_dataset(valid_enc, batch_size=32)
                    print(f"    → valid_feats.shape = {valid_feats.shape}")

                    print("  [MLP] Step5: 建立 MLP 並訓練")
                    input_dim = train_feats.size(1)
                    hidden_dim = 256
                    model = BasicMLP(input_dim, hidden_dim, num_labels).to(device)

                    train_labels_tensor = torch.tensor(cur_train_labels, dtype=torch.long)
                    valid_labels_tensor = torch.tensor(cur_valid_labels, dtype=torch.long)

                    train_ds = TensorDataset(train_feats, train_labels_tensor)
                    valid_ds = TensorDataset(valid_feats, valid_labels_tensor)
                    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
                    valid_loader = DataLoader(valid_ds, batch_size=64, shuffle=False)

                    optimizer = optim.Adam(model.parameters(), lr=1e-4)
                    criterion = nn.CrossEntropyLoss()

                    start = time.time()
                    best_f1 = 0.0
                    best_metrics = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

                    for epoch in range(1, 6):
                        model.train()
                        epoch_loss = 0.0
                        train_preds, train_labels_acc = [], []

                        for feats_batch, labels_batch in train_loader:
                            feats_batch = feats_batch.to(device)
                            labels_batch = labels_batch.to(device)

                            optimizer.zero_grad()
                            logits = model(feats_batch)
                            loss = criterion(logits, labels_batch)
                            loss.backward()
                            optimizer.step()

                            epoch_loss += loss.item() * feats_batch.size(0)
                            preds = torch.argmax(logits, dim=1).cpu().numpy()
                            train_preds.extend(preds)
                            train_labels_acc.extend(labels_batch.cpu().numpy())

                        train_acc = accuracy_score(train_labels_acc, train_preds)
                        train_prec, train_rec, train_f1, _ = precision_recall_fscore_support(
                            train_labels_acc, train_preds, average="macro", zero_division=0
                        )
                        avg_train_loss = epoch_loss / len(train_loader.dataset)

                        model.eval()
                        valid_preds, valid_labels_acc = [], []
                        vloss = 0.0
                        with torch.no_grad():
                            for feats_batch, labels_batch in valid_loader:
                                feats_batch = feats_batch.to(device)
                                labels_batch = labels_batch.to(device)
                                logits = model(feats_batch)
                                loss = criterion(logits, labels_batch)
                                vloss += loss.item() * feats_batch.size(0)
                                preds = torch.argmax(logits, dim=1).cpu().numpy()
                                valid_preds.extend(preds)
                                valid_labels_acc.extend(labels_batch.cpu().numpy())

                        valid_acc = accuracy_score(valid_labels_acc, valid_preds)
                        valid_prec, valid_rec, valid_f1, _ = precision_recall_fscore_support(
                            valid_labels_acc, valid_preds, average="macro", zero_division=0
                        )

                        if valid_f1 > best_f1:
                            best_f1 = valid_f1
                            best_metrics = {
                                "accuracy":  valid_acc,
                                "precision": valid_prec,
                                "recall":    valid_rec,
                                "f1":        valid_f1
                            }
                            # 確保 output_dir 存在
                            os.makedirs(output_dir, exist_ok=True)
                            torch.save(model.state_dict(), f"{output_dir}/best_mlp.pt")

                        print(
                            f"[MLP Epoch {epoch}] "
                            f"Train Loss: {avg_train_loss:.4f} | "
                            f"Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f} | "
                            f"Valid Acc: {valid_acc:.4f} | Valid F1: {valid_f1:.4f}"
                        )

                    elapsed = time.time() - start
                    total_size = 0
                    for root, _, files in os.walk(output_dir):
                        for fname in files:
                            total_size += os.path.getsize(os.path.join(root, fname))

                    best_metrics["train_time"] = elapsed
                    best_metrics["model_size"] = total_size
                    print(f"    → accuracy:  {best_metrics['accuracy']:.4f}")
                    print(f"    → precision: {best_metrics['precision']:.4f}")
                    print(f"    → recall:    {best_metrics['recall']:.4f}")
                    print(f"    → f1:        {best_metrics['f1']:.4f}")
                    print(f"    → model_size: {best_metrics['model_size']} bytes")

                    results.append((exp_name, best_metrics))

    # =============================================================================
    # 8. 把所有實驗結果匯出
    # =============================================================================
    out_df = pd.DataFrame([{"experiment": name, **metrics} for name, metrics in results])
    out_df.to_csv("experiment_results.csv", index=False)
    print("\n所有實驗完成，結果已存到 experiment_results.csv")

if __name__ == "__main__":
    main_pipeline()



▶開始實驗 (TextCNN/MLP): TextCNN__flat_chapter
  [TextCNN] Step1: 開始訓練
[TextCNN] Epoch 1 | Train Loss: 3.3066 | Train Acc: 0.1835 | Train F1: 0.0907 | Valid Loss: 2.5390 | Valid Acc: 0.3721 | Valid F1: 0.2135
[TextCNN] Epoch 2 | Train Loss: 2.4418 | Train Acc: 0.3646 | Train F1: 0.2351 | Valid Loss: 2.0105 | Valid Acc: 0.4830 | Valid F1: 0.3436
[TextCNN] Epoch 3 | Train Loss: 2.0279 | Train Acc: 0.4582 | Train F1: 0.3359 | Valid Loss: 1.7285 | Valid Acc: 0.5458 | Valid F1: 0.4022
[TextCNN] Epoch 4 | Train Loss: 1.7774 | Train Acc: 0.5206 | Train F1: 0.4002 | Valid Loss: 1.6180 | Valid Acc: 0.5731 | Valid F1: 0.4379
[TextCNN] Epoch 5 | Train Loss: 1.6678 | Train Acc: 0.5443 | Train F1: 0.4231 | Valid Loss: 1.5335 | Valid Acc: 0.5959 | Valid F1: 0.4621
    → accuracy:  0.5959
    → precision: 0.4931
    → recall:    0.4644
    → f1:        0.4621
    → model_size: 39089975 bytes

▶開始實驗 (TextCNN/MLP): TextCNN__flat_section
  [TextCNN] Step1: 開始訓練
[TextCNN] Epoch 1 | Train Loss: 4.5078 | Trai

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [5]:
import os
import time
import torch
import random
import numpy as np
import pandas as pd

from typing import Dict, List
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType  # 只留 LoraConfig, get_peft_model, TaskType

# =============================================================================
# 1. 固定亂數種子，確保結果可重現
# =============================================================================
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# =============================================================================
# 2. 資料載入與 Label Processing
# =============================================================================
def load_and_prepare(subjects: List[str]) -> pd.DataFrame:
    """
    讀取多個 subject 的資料，合併後做 label encoding：
      - label_subject
      - label_chapter
      - label_section
    """
    def load_and_merge(subject: str) -> pd.DataFrame:
        base_path = f"{subject}_Database"
        qdf = pd.read_csv(f"{base_path}/{subject}_question_bank.csv")
        cdf = pd.read_csv(f"{base_path}/{subject}_chapter_list.csv")
        qdf.columns = qdf.columns.str.strip().str.lower().str.replace(" ", "_")
        cdf.columns = cdf.columns.str.strip().str.lower().str.replace(" ", "_")
        df = qdf.merge(cdf, on="section_name", how="left")
        df["subject"] = subject
        return df

    # 合併指定 subjects 的資料
    df = pd.concat([load_and_merge(s) for s in subjects], ignore_index=True)
    df = df[["subject", "chapter_name_x", "section_name", "ques_detl"]].dropna().reset_index(drop=True)

    # subject label
    df["label_str"] = df["subject"]
    label2id_subject = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_subject = {i: lab for lab, i in label2id_subject.items()}
    df["label_subject"] = df["label_str"].map(label2id_subject)

    # chapter label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"]
    label2id_chapter = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_chapter = {i: lab for lab, i in label2id_chapter.items()}
    df["label_chapter"] = df["label_str"].map(label2id_chapter)

    # section label
    df["label_str"] = df["subject"] + "::" + df["chapter_name_x"] + "::" + df["section_name"]
    label2id_section = {lab: i for i, lab in enumerate(sorted(df["label_str"].unique()))}
    id2label_section = {i: lab for lab, i in label2id_section.items()}
    df["label_section"] = df["label_str"].map(label2id_section)

    # 只保留至少出現兩次的 section
    vc = df["label_section"].value_counts()
    valid_secs = set(vc[vc >= 2].index)
    df = df[df["label_section"].isin(valid_secs)].reset_index(drop=True)

    return df, (label2id_subject, id2label_subject), (label2id_chapter, id2label_chapter), (label2id_section, id2label_section)

# =============================================================================
# 3. 自訂 Dataset
# =============================================================================
class TextDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer,
        max_len: int = 128,
        mode: str = "flat_chapter",  # 'flat_chapter', 'flat_section', 'hierarchical'
        subject_labels: List[int] = None
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode
        self.subject_labels = subject_labels  # 只有 hierarchical 模式才需要

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.mode == "hierarchical":
            item["subject_labels"] = torch.tensor(self.subject_labels[idx], dtype=torch.long)
        return item

# =============================================================================
# 4. TextCNN Model 定義
# =============================================================================
class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_classes: int,
        kernel_sizes: List[int] = [3,4,5],
        num_filters: int = 100,
        dropout_p: float = 0.5
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids):
        """
        input_ids: (B, L)
        return: logits (B, num_classes)
        """
        x = self.embedding(input_ids)        # (B, L, D)
        x = x.permute(0, 2, 1)               # (B, D, L)
        conv_outs = []
        for conv in self.convs:
            c = conv(x)                      # (B, F, L - k + 1)
            c = torch.relu(c)                # (B, F, L - k + 1)
            c = torch.max_pool1d(c, kernel_size=c.size(2))  # (B, F, 1)
            conv_outs.append(c.squeeze(2))   # (B, F)
        cat = torch.cat(conv_outs, dim=1)    # (B, F * len(kernel_sizes))
        drop = self.dropout(cat)             # (B, F * len(kernel_sizes))
        logits = self.fc(drop)               # (B, num_classes)
        return logits

# =============================================================================
# 5. 基礎 MLP Model 定義 (只用於 flat 模式)
# =============================================================================
class BasicMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, features):
        """
        features: (B, input_dim)
        return: logits (B, num_classes)
        """
        x = self.fc1(features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

# =============================================================================
# 6. 訓練與評估函式
# =============================================================================
def compute_metrics(preds_and_labels) -> Dict[str, float]:
    logits, labels = preds_and_labels
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    }

def train_flat_transformer(
    model_name: str,
    num_labels: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    output_dir: str,
    device: torch.device,
    use_dora: bool = False
) -> Dict[str, float]:
    """
    訓練 flat (Chapter or Section) 模式的小型 Transformer (BERT/ RoBERTa)，回傳 metrics。
    如果 use_dora=True，會在模型上套 DoRA Adapter（LoraConfig）。
    """
    # 1. 選擇 tokenizer & base_model
    if "roberta" in model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        base_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # 2. 如果要用 DoRA，包成 PEFT 模型
    if use_dora:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["query", "value"],
            use_dora=True,
        )
        model = get_peft_model(base_model, peft_config)
    else:
        model = base_model

    model.to(device)

    # 3. 構造 Dataset
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=128, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=128, mode="flat_chapter")

    # 4. TrainingArguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5 if not use_dora else 3e-4,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_strategy="epoch"
    )

    # 5. 初始化 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # 6. 訓練 & 驗證
    start = time.time()
    trainer.train()
    elapsed = time.time() - start

    metrics = trainer.evaluate()

    # 7. 儲存 adapter（若有 DoRA）或整個模型
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # 8. 計算檔案大小
    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    metrics["eval_accuracy"],
        "precision":   metrics["eval_precision"],
        "recall":      metrics["eval_recall"],
        "f1":          metrics["eval_f1"],
        "train_time":  elapsed,
        "model_size":  total_size
    }

def train_textcnn(
    vocab_size: int,
    train_texts: List[str],
    train_labels: List[int],
    valid_texts: List[str],
    valid_labels: List[int],
    tokenizer,
    num_labels: int,
    output_dir: str,
    device: torch.device
) -> Dict[str, float]:
    """
    訓練 TextCNN (flat 模式)，回傳 metrics (accuracy, precision, recall, f1, train_time, model_size)。
    """
    max_len = 128
    batch_size = 32
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=max_len, mode="flat_chapter")
    valid_dataset = TextDataset(valid_texts, valid_labels, tokenizer, max_len=max_len, mode="flat_chapter")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    embed_dim = 300
    model = TextCNN(vocab_size=vocab_size, embed_dim=embed_dim, num_classes=num_labels)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

    best_f1 = 0.0
    best_metrics = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
    start_time = time.time()

    for epoch in range(1, 6):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * input_ids.size(0)
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.detach().cpu().numpy())

        train_loss = total_loss / len(train_loader.dataset)
        train_acc = accuracy_score(all_labels, all_preds)
        train_prec, train_rec, train_f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="macro", zero_division=0
        )

        model.eval()
        total_vloss = 0.0
        v_preds, v_labels = [], []
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                logits = model(input_ids)
                loss = criterion(logits, labels)
                total_vloss += loss.item() * input_ids.size(0)
                preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
                v_preds.extend(preds)
                v_labels.extend(labels.detach().cpu().numpy())

        valid_loss = total_vloss / len(valid_loader.dataset)
        valid_acc = accuracy_score(v_labels, v_preds)
        valid_prec, valid_rec, valid_f1, _ = precision_recall_fscore_support(
            v_labels, v_preds, average="macro", zero_division=0
        )

        scheduler.step()

        if valid_f1 > best_f1:
            best_f1 = valid_f1
            best_metrics = {
                "accuracy":  valid_acc,
                "precision": valid_prec,
                "recall":    valid_rec,
                "f1":        valid_f1
            }
            # 存模型前先確保 output_dir 存在
            os.makedirs(output_dir, exist_ok=True)
            torch.save(model.state_dict(), f"{output_dir}/best_textcnn.pt")

        print(
            f"[TextCNN] Epoch {epoch} | "
            f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f} | "
            f"Valid Loss: {valid_loss:.4f} | Valid Acc: {valid_acc:.4f} | Valid F1: {valid_f1:.4f}"
        )

    elapsed = time.time() - start_time

    total_size = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_size += os.path.getsize(os.path.join(root, fname))

    return {
        "accuracy":    best_metrics["accuracy"],
        "precision":   best_metrics["precision"],
        "recall":      best_metrics["recall"],
        "f1":          best_metrics["f1"],
        "train_time":  elapsed,
        "model_size":  total_size
    }

# =============================================================================
# 7. Pipeline 主程序：整合上述所有 case
# =============================================================================
def main_pipeline():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1. 載入資料與 Labels
    df, subj_map, chap_map, sect_map = load_and_prepare(["math", "science"]) # 社會科改成合適的 subject
    label2id_subject, id2label_subject = subj_map
    label2id_chapter, id2label_chapter = chap_map
    label2id_section, id2label_section = sect_map

    # 2. 切 train/valid/test（90/10 → 再分81/9/10)
    base = df[["ques_detl", "label_subject", "label_chapter", "label_section", "subject"]].copy()

    # 先切出 10% 做最終 test
    rest, test = train_test_split(
        base, test_size=0.1, stratify=base["label_chapter"], random_state=42
    )

    # 再把 rest 切成 90/10 ≈ 81%/9% 做 train/valid
    train_df, valid_df = train_test_split(
        rest, test_size=0.1, stratify=rest["label_chapter"], random_state=42
    )

    # 取出 text 和各種 label lists
    train_texts = train_df["ques_detl"].tolist()
    valid_texts = valid_df["ques_detl"].tolist()
    test_texts  = test["ques_detl"].tolist()

    train_subj_labels = train_df["label_subject"].tolist()
    valid_subj_labels = valid_df["label_subject"].tolist()
    test_subj_labels  = test["label_subject"].tolist()

    train_chap_labels = train_df["label_chapter"].tolist()
    valid_chap_labels = valid_df["label_chapter"].tolist()
    test_chap_labels  = test["label_chapter"].tolist()

    train_sect_labels = train_df["label_section"].tolist()
    valid_sect_labels = valid_df["label_section"].tolist()
    test_sect_labels  = test["label_section"].tolist()

    # 3. 建立 tokenizer (給所有 TextCNN/Transformer 共用)
    bert_tok    = BertTokenizerFast.from_pretrained("bert-base-uncased")
    roberta_tok = RobertaTokenizerFast.from_pretrained("roberta-base")

    # 4. 建立詞彙大小 (TextCNN 用)
    vocab_size = bert_tok.vocab_size

    # 5. 實驗配置
    model_types = [
        # 範例：("BERT", "bert-base-uncased", False),
        #        ("RoBERTa", "roberta-base", False),
        #        ("DoRA_BERT", "bert-base-uncased", True),
        #        ("DoRA_RoBERTa", "roberta-base", True),
    ]
    others = ["MLP"] #"TextCNN",

    # 6. 三種分類策略
    strategies = [
        "flat_chapter",          # 直接分類 subject+chapter
        "flat_section",          # 直接分類 subject+chapter+section
        "hierarchical"           # 先分 subject，再分 chapter
    ]

    results = []

    # (A) 先跑 Transformer 部分 (示例)
    for model_name, hf_model_name, use_dora in model_types:
        for strat in strategies:
            exp_name = f"{model_name}__{strat}"
            output_dir = f"./outputs/{exp_name}"
            os.makedirs(output_dir, exist_ok=True)

            print(f"\n▶開始實驗 (Transformer): {exp_name}")

            if strat == "flat_chapter":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_chapter),
                    train_texts=train_texts,
                    train_labels=train_chap_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_chap_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                results.append((exp_name, metrics))

            elif strat == "flat_section":
                metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_section),
                    train_texts=train_texts,
                    train_labels=train_sect_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_sect_labels,
                    output_dir=output_dir,
                    device=device,
                    use_dora=use_dora
                )
                results.append((exp_name, metrics))

            elif strat == "hierarchical":
                # hierarchical: 先訓練 subject，再分 subj_id 針對 chapter
                sub_output = os.path.join(output_dir, "subj_model")
                chap_output = os.path.join(output_dir, "chapter_model")
                os.makedirs(sub_output, exist_ok=True)
                os.makedirs(chap_output, exist_ok=True)

                print("  [Hierarchical] Step 1: 訓練 subject 模型")
                sub_metrics = train_flat_transformer(
                    model_name=hf_model_name,
                    num_labels=len(label2id_subject),
                    train_texts=train_texts,
                    train_labels=train_subj_labels,
                    valid_texts=valid_texts,
                    valid_labels=valid_subj_labels,
                    output_dir=sub_output,
                    device=device,
                    use_dora=use_dora
                )

                chap_metrics = {}
                print("  [Hierarchical] Step 2: 針對每個 subject 訓練 chapter 分類器")
                for subj_id, subj_name in id2label_subject.items():
                    idx_train = [i for i, s in enumerate(train_subj_labels) if s == subj_id]
                    idx_valid = [i for i, s in enumerate(valid_subj_labels) if s == subj_id]

                    sub_train_texts = [train_texts[i] for i in idx_train]
                    sub_train_chaps = [train_chap_labels[i] for i in idx_train]
                    sub_valid_texts = [valid_texts[i] for i in idx_valid]
                    sub_valid_chaps = [valid_chap_labels[i] for i in idx_valid]

                    unique_chaps = sorted({train_chap_labels[i] for i in idx_train})
                    if len(unique_chaps) < 2:
                        continue

                    sub_dir = os.path.join(chap_output, f"subj_{subj_id}")
                    os.makedirs(sub_dir, exist_ok=True)

                    print(f"    [Subject {subj_id}] Training chapter classifier (num_labels={len(unique_chaps)})")
                    sub_chap_m = train_flat_transformer(
                        model_name=hf_model_name,
                        num_labels=len(unique_chaps),
                        train_texts=sub_train_texts,
                        train_labels=sub_train_chaps,
                        valid_texts=sub_valid_texts,
                        valid_labels=sub_valid_chaps,
                        output_dir=sub_dir,
                        device=device,
                        use_dora=use_dora
                    )
                    chap_metrics[subj_id] = sub_chap_m

                results.append((exp_name, {"subject_metrics": sub_metrics, "chapter_metrics": chap_metrics}))

    # =============================================================================
    # (B) TextCNN + MLP 只負責三種策略 (flat_chapter, flat_section, hierarchical)
    # =============================================================================
    for model_name in others:
        for strat in strategies:
            exp_name = f"{model_name}__{strat}"
            output_dir = f"./outputs/{exp_name}"
            os.makedirs(output_dir, exist_ok=True)
            print(f"\n▶開始實驗 (TextCNN/MLP): {exp_name}")

            # 如果是 hierarchical，先在這裡幫它建好 subdir
            if strat == "hierarchical":
                os.makedirs(os.path.join(output_dir, "subj_model"), exist_ok=True)
                os.makedirs(os.path.join(output_dir, "chapter_model"), exist_ok=True)

            if strat == "flat_chapter":
                cur_train_labels = train_chap_labels
                cur_valid_labels = valid_chap_labels
                num_labels = len(label2id_chapter)

            elif strat == "flat_section":
                cur_train_labels = train_sect_labels
                cur_valid_labels = valid_sect_labels
                num_labels = len(label2id_section)

            elif strat == "hierarchical":
                cur_results = {"subject": {}, "chapter": {}}

                # Step1: 訓練 subject 分類器
                print("  [Hierarchical] Step1: 訓練 subject 分類器")
                # 先確保 subj_model 存在
                subj_model_dir = os.path.join(output_dir, "subj_model")
                os.makedirs(subj_model_dir, exist_ok=True)

                if model_name == "TextCNN":
                    # 用 chapter labels 當作 proxy 來當作 subject 分類（TextCNN 只支援 flat_chapter）
                    sub_metrics = train_textcnn(
                        vocab_size=vocab_size,
                        train_texts=train_texts,
                        train_labels=train_subj_labels,
                        valid_texts=valid_texts,
                        valid_labels=valid_subj_labels,
                        tokenizer=bert_tok,
                        num_labels=len(label2id_subject),
                        output_dir=subj_model_dir,
                        device=device
                    )
                else:  # MLP 版 subject
                    enc_train = bert_tok(train_texts, padding=True, truncation=True, return_tensors="pt")
                    enc_valid = bert_tok(valid_texts, padding=True, truncation=True, return_tensors="pt")

                    bert_model = BertForSequenceClassification.from_pretrained(
                        "bert-base-uncased", output_hidden_states=True
                    ).to(device)
                    bert_model.eval()

                    def encode_avg(inputs, batch_size=32):
                        input_ids = inputs["input_ids"]
                        attention_mask = inputs["attention_mask"]
                        feats = []
                        with torch.no_grad():
                            for i in range(0, input_ids.size(0), batch_size):
                                end_i = min(i + batch_size, input_ids.size(0))
                                b_ids = input_ids[i:end_i].to(device)
                                b_att = attention_mask[i:end_i].to(device)
                                outs = bert_model.bert(b_ids, attention_mask=b_att).last_hidden_state
                                mask = b_att.unsqueeze(-1).expand_as(outs).float()
                                summed = torch.sum(outs * mask, dim=1)
                                counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                                avg = (summed / counts).cpu()
                                feats.append(avg)
                                del b_ids, b_att, outs, mask, summed, counts
                                torch.cuda.empty_cache()
                        return torch.cat(feats, dim=0)

                    subj_train_feats = encode_avg(enc_train)
                    subj_valid_feats = encode_avg(enc_valid)

                    subj_model = BasicMLP(subj_train_feats.size(1), 256, len(label2id_subject)).to(device)
                    subj_train_ds = TensorDataset(subj_train_feats, torch.tensor(train_subj_labels))
                    subj_valid_ds = TensorDataset(subj_valid_feats, torch.tensor(valid_subj_labels))
                    subj_train_loader = DataLoader(subj_train_ds, batch_size=64, shuffle=True)
                    subj_valid_loader = DataLoader(subj_valid_ds, batch_size=64, shuffle=False)

                    opt = optim.Adam(subj_model.parameters(), lr=1e-4)
                    crit = nn.CrossEntropyLoss()

                    best_f1_s = 0.0
                    best_metrics_s = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
                    st = time.time()
                    for epoch in range(1, 6):
                        subj_model.train()
                        train_preds, train_labels_acc = [], []
                        run_loss = 0.0
                        for xb, yb in subj_train_loader:
                            xb, yb = xb.to(device), yb.to(device)
                            opt.zero_grad()
                            logits = subj_model(xb)
                            loss = crit(logits, yb)
                            loss.backward()
                            opt.step()
                            run_loss += loss.item() * xb.size(0)
                            pr = torch.argmax(logits, dim=1).cpu().numpy()
                            train_preds.extend(pr)
                            train_labels_acc.extend(yb.cpu().numpy())

                        train_acc_s = accuracy_score(train_labels_acc, train_preds)
                        _, _, train_f1_s, _ = precision_recall_fscore_support(
                            train_labels_acc, train_preds, average="macro", zero_division=0
                        )

                        subj_model.eval()
                        val_preds_s, val_labels_s = [], []
                        vloss_s = 0.0
                        with torch.no_grad():
                            for xb, yb in subj_valid_loader:
                                xb, yb = xb.to(device), yb.to(device)
                                logits = subj_model(xb)
                                loss = crit(logits, yb)
                                vloss_s += loss.item() * xb.size(0)
                                pr = torch.argmax(logits, dim=1).cpu().numpy()
                                val_preds_s.extend(pr)
                                val_labels_s.extend(yb.cpu().numpy())

                        valid_acc_s = accuracy_score(val_labels_s, val_preds_s)
                        _, _, valid_f1_s, _ = precision_recall_fscore_support(
                            val_labels_s, val_preds_s, average="macro", zero_division=0
                        )

                        if valid_f1_s > best_f1_s:
                            best_f1_s = valid_f1_s
                            best_metrics_s = {
                                "accuracy":  valid_acc_s,
                                "precision": precision_score(val_labels_s, val_preds_s, average="macro", zero_division=0),
                                "recall":    recall_score(val_labels_s, val_preds_s, average="macro", zero_division=0),
                                "f1":        valid_f1_s
                            }
                            # 確保子目錄存在
                            save_dir_s = os.path.join(output_dir, "subj_model")
                            os.makedirs(save_dir_s, exist_ok=True)
                            torch.save(subj_model.state_dict(), os.path.join(save_dir_s, "best_subj_mlp.pt"))

                        print(
                            f"    [Subject MLP Epoch {epoch}] "
                            f"Train Acc: {train_acc_s:.4f} | Train F1: {train_f1_s:.4f} | "
                            f"Valid Acc: {valid_acc_s:.4f} | Valid F1: {valid_f1_s:.4f}"
                        )

                    elapsed_s = time.time() - st
                    total_sz_s = 0
                    save_dir_s = os.path.join(output_dir, "subj_model")
                    for root, _, files in os.walk(save_dir_s):
                        for fname in files:
                            total_sz_s += os.path.getsize(os.path.join(root, fname))
                    sub_metrics = {
                        "accuracy":    best_metrics_s["accuracy"],
                        "precision":   best_metrics_s["precision"],
                        "recall":      best_metrics_s["recall"],
                        "f1":          best_metrics_s["f1"],
                        "train_time":  elapsed_s,
                        "model_size":  total_sz_s
                    }

                cur_results["subject"][model_name] = sub_metrics  # TextCNN 或 MLP 的 subject 部分

                # Step2: 針對每個 subject 訓練 chapter 分類器
                print("  [Hierarchical] Step2: 針對每個 subject 訓練 chapter 分類器")
                for subj_id, subj_name in id2label_subject.items():
                    idx_train = [i for i, s in enumerate(train_subj_labels) if s == subj_id]
                    idx_valid = [i for i, s in enumerate(valid_subj_labels) if s == subj_id]

                    sub_train_texts = [train_texts[i] for i in idx_train]
                    sub_train_chaps = [train_chap_labels[i] for i in idx_train]
                    sub_valid_texts = [valid_texts[i] for i in idx_valid]
                    sub_valid_chaps = [valid_chap_labels[i] for i in idx_valid]

                    unique_chaps = sorted({train_chap_labels[i] for i in idx_train})
                    if len(unique_chaps) < 2:
                        continue

                    chap_dir = os.path.join(output_dir, "chapter_model", f"subj_{subj_id}")
                    os.makedirs(chap_dir, exist_ok=True)

                    if model_name == "TextCNN":
                        chap_m = train_textcnn(
                            vocab_size=vocab_size,
                            train_texts=sub_train_texts,
                            train_labels=sub_train_chaps,
                            valid_texts=sub_valid_texts,
                            valid_labels=sub_valid_chaps,
                            tokenizer=bert_tok,
                            num_labels=len(unique_chaps),
                            output_dir=chap_dir,
                            device=device
                        )
                    else:  # MLP
                        enc_tr = bert_tok(sub_train_texts, padding=True, truncation=True, return_tensors="pt")
                        enc_vd = bert_tok(sub_valid_texts, padding=True, truncation=True, return_tensors="pt")
                        # 載入 BERT 作特徵擷取
                        bert_model = BertForSequenceClassification.from_pretrained(
                            "bert-base-uncased", output_hidden_states=True
                        ).to(device)
                        bert_model.eval()

                        def encode_avg(inputs, batch_size=32):
                            input_ids = inputs["input_ids"]
                            attention_mask = inputs["attention_mask"]
                            feats = []
                            with torch.no_grad():
                                for i in range(0, input_ids.size(0), batch_size):
                                    end_i = min(i + batch_size, input_ids.size(0))
                                    b_ids = input_ids[i:end_i].to(device)
                                    b_att = attention_mask[i:end_i].to(device)
                                    outs = bert_model.bert(b_ids, attention_mask=b_att).last_hidden_state
                                    mask = b_att.unsqueeze(-1).expand_as(outs).float()
                                    summed = torch.sum(outs * mask, dim=1)
                                    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                                    avg = (summed / counts).cpu()
                                    feats.append(avg)
                                    del b_ids, b_att, outs, mask, summed, counts
                                    torch.cuda.empty_cache()
                            return torch.cat(feats, dim=0)

                        tr_feats = encode_avg(enc_tr)
                        vd_feats = encode_avg(enc_vd)

                        mlp_model = BasicMLP(tr_feats.size(1), 256, len(unique_chaps)).to(device)
                        tr_ds = TensorDataset(tr_feats, torch.tensor(sub_train_chaps))
                        vd_ds = TensorDataset(vd_feats, torch.tensor(sub_valid_chaps))
                        tr_loader = DataLoader(tr_ds, batch_size=64, shuffle=True)
                        vd_loader = DataLoader(vd_ds, batch_size=64, shuffle=False)

                        opt_mc = optim.Adam(mlp_model.parameters(), lr=1e-4)
                        crit_mc = nn.CrossEntropyLoss()

                        best_f1_c = 0.0
                        best_metrics_c = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
                        st_c = time.time()
                        for epoch in range(1, 6):
                            mlp_model.train()
                            preds_tr, labels_tr = [], []
                            runloss = 0.0
                            for xb, yb in tr_loader:
                                xb, yb = xb.to(device), yb.to(device)
                                opt_mc.zero_grad()
                                logits = mlp_model(xb)
                                loss = crit_mc(logits, yb)
                                loss.backward()
                                opt_mc.step()
                                runloss += loss.item() * xb.size(0)
                                pr = torch.argmax(logits, dim=1).cpu().numpy()
                                preds_tr.extend(pr)
                                labels_tr.extend(yb.cpu().numpy())
                            acc_tr_c = accuracy_score(labels_tr, preds_tr)
                            _, _, f1_tr_c, _ = precision_recall_fscore_support(
                                labels_tr, preds_tr, average="macro", zero_division=0
                            )

                            mlp_model.eval()
                            preds_vd, labels_vd = [], []
                            vl = 0.0
                            with torch.no_grad():
                                for xb, yb in vd_loader:
                                    xb, yb = xb.to(device), yb.to(device)
                                    logits = mlp_model(xb)
                                    loss = crit_mc(logits, yb)
                                    vl += loss.item() * xb.size(0)
                                    pr = torch.argmax(logits, dim=1).cpu().numpy()
                                    preds_vd.extend(pr)
                                    labels_vd.extend(yb.cpu().numpy())
                            acc_vd_c = accuracy_score(labels_vd, preds_vd)
                            _, _, f1_vd_c, _ = precision_recall_fscore_support(
                                labels_vd, preds_vd, average="macro", zero_division=0
                            )
                            if f1_vd_c > best_f1_c:
                                best_f1_c = f1_vd_c
                                best_metrics_c = {
                                    "accuracy":  acc_vd_c,
                                    "precision": precision_score(labels_vd, preds_vd, average="macro", zero_division=0),
                                    "recall":    recall_score(labels_vd, preds_vd, average="macro", zero_division=0),
                                    "f1":        f1_vd_c
                                }
                                # 確保 chap_dir 存在
                                os.makedirs(chap_dir, exist_ok=True)
                                torch.save(mlp_model.state_dict(), os.path.join(chap_dir, "best_chap_mlp.pt"))

                            print(
                                f"    [Chapter MLP subj_{subj_id} Epoch {epoch}] "
                                f"Train Acc: {acc_tr_c:.4f} | Train F1: {f1_tr_c:.4f} | "
                                f"Valid Acc: {acc_vd_c:.4f} | Valid F1: {f1_vd_c:.4f}"
                            )

                        elapsed_c = time.time() - st_c
                        total_sz_c = 0
                        for root, _, files in os.walk(chap_dir):
                            for fname in files:
                                total_sz_c += os.path.getsize(os.path.join(root, fname))

                        chap_m = {
                            "accuracy":    best_metrics_c["accuracy"],
                            "precision":   best_metrics_c["precision"],
                            "recall":      best_metrics_c["recall"],
                            "f1":          best_metrics_c["f1"],
                            "train_time":  elapsed_c,
                            "model_size":  total_sz_c
                        }

                    cur_results["chapter"][subj_id] = chap_m

                results.append((exp_name, cur_results))
                continue  # 跳過後面的 flat 處理

            # 以下為 flat_chapter 或 flat_section 的 TextCNN/MLP 處理
            if strat in ["flat_chapter", "flat_section"]:
                if strat == "flat_chapter":
                    cur_train_labels = train_chap_labels
                    cur_valid_labels = valid_chap_labels
                    num_labels = len(label2id_chapter)
                else:  # flat_section
                    cur_train_labels = train_sect_labels
                    cur_valid_labels = valid_sect_labels
                    num_labels = len(label2id_section)

                if model_name == "TextCNN":
                    print("  [TextCNN] Step1: 開始訓練")
                    start = time.time()

                    metrics = train_textcnn(
                        vocab_size=vocab_size,
                        train_texts=train_texts,
                        train_labels=cur_train_labels,
                        valid_texts=valid_texts,
                        valid_labels=cur_valid_labels,
                        tokenizer=bert_tok,
                        num_labels=num_labels,
                        output_dir=output_dir,
                        device=device
                    )
                    elapsed = time.time() - start
                    metrics["train_time"] = elapsed

                    print(f"    → accuracy:  {metrics['accuracy']:.4f}")
                    print(f"    → precision: {metrics['precision']:.4f}")
                    print(f"    → recall:    {metrics['recall']:.4f}")
                    print(f"    → f1:        {metrics['f1']:.4f}")
                    print(f"    → model_size: {metrics['model_size']} bytes")

                    results.append((exp_name, metrics))

                else:  # MLP
                    print("  [MLP] Step1: 用 tokenizer 編碼到 CPU")
                    train_enc = bert_tok(
                        train_texts,
                        padding=True,
                        truncation=True,
                        return_tensors="pt"
                    )
                    valid_enc = bert_tok(
                        valid_texts,
                        padding=True,
                        truncation=True,
                        return_tensors="pt"
                    )
                    print(f"    → train_enc.shape: {train_enc['input_ids'].shape}")
                    print(f"    → valid_enc.shape: {train_enc['input_ids'].shape}")

                    print("  [MLP] Step2: 載入 BERT 做特徵擷取")
                    bert_model = BertForSequenceClassification.from_pretrained(
                        "bert-base-uncased", output_hidden_states=True
                    ).to(device)
                    bert_model.eval()

                    def encode_avg_dataset(inputs, batch_size: int = 32):
                        input_ids = inputs["input_ids"]
                        attention_mask = inputs["attention_mask"]
                        features_list = []

                        with torch.no_grad():
                            for i in range(0, input_ids.size(0), batch_size):
                                end = min(i + batch_size, input_ids.size(0))
                                b_ids = input_ids[i:end].to(device)
                                b_att = attention_mask[i:end].to(device)
                                outs = bert_model.bert(b_ids, attention_mask=b_att).last_hidden_state
                                mask = b_att.unsqueeze(-1).expand_as(outs).float()
                                summed = torch.sum(outs * mask, dim=1)
                                counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                                avg = (summed / counts).cpu()
                                features_list.append(avg)
                                del b_ids, b_att, outs, mask, summed, counts
                                torch.cuda.empty_cache()
                        return torch.cat(features_list, dim=0)

                    print("  [MLP] Step3: 計算 train_feats")
                    train_feats = encode_avg_dataset(train_enc, batch_size=32)
                    print(f"    → train_feats.shape = {train_feats.shape}")

                    print("  [MLP] Step4: 計算 valid_feats")
                    valid_feats = encode_avg_dataset(valid_enc, batch_size=32)
                    print(f"    → valid_feats.shape = {valid_feats.shape}")

                    print("  [MLP] Step5: 建立 MLP 並訓練")
                    input_dim = train_feats.size(1)
                    hidden_dim = 256
                    model = BasicMLP(input_dim, hidden_dim, num_labels).to(device)

                    train_labels_tensor = torch.tensor(cur_train_labels, dtype=torch.long)
                    valid_labels_tensor = torch.tensor(cur_valid_labels, dtype=torch.long)

                    train_ds = TensorDataset(train_feats, train_labels_tensor)
                    valid_ds = TensorDataset(valid_feats, valid_labels_tensor)
                    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
                    valid_loader = DataLoader(valid_ds, batch_size=64, shuffle=False)

                    optimizer = optim.Adam(model.parameters(), lr=1e-4)
                    criterion = nn.CrossEntropyLoss()

                    start = time.time()
                    best_f1 = 0.0
                    best_metrics = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

                    for epoch in range(1, 6):
                        model.train()
                        epoch_loss = 0.0
                        train_preds, train_labels_acc = [], []

                        for feats_batch, labels_batch in train_loader:
                            feats_batch = feats_batch.to(device)
                            labels_batch = labels_batch.to(device)

                            optimizer.zero_grad()
                            logits = model(feats_batch)
                            loss = criterion(logits, labels_batch)
                            loss.backward()
                            optimizer.step()

                            epoch_loss += loss.item() * feats_batch.size(0)
                            preds = torch.argmax(logits, dim=1).cpu().numpy()
                            train_preds.extend(preds)
                            train_labels_acc.extend(labels_batch.cpu().numpy())

                        train_acc = accuracy_score(train_labels_acc, train_preds)
                        train_prec, train_rec, train_f1, _ = precision_recall_fscore_support(
                            train_labels_acc, train_preds, average="macro", zero_division=0
                        )
                        avg_train_loss = epoch_loss / len(train_loader.dataset)

                        model.eval()
                        valid_preds, valid_labels_acc = [], []
                        vloss = 0.0
                        with torch.no_grad():
                            for feats_batch, labels_batch in valid_loader:
                                feats_batch = feats_batch.to(device)
                                labels_batch = labels_batch.to(device)
                                logits = model(feats_batch)
                                loss = criterion(logits, labels_batch)
                                vloss += loss.item() * feats_batch.size(0)
                                preds = torch.argmax(logits, dim=1).cpu().numpy()
                                valid_preds.extend(preds)
                                valid_labels_acc.extend(labels_batch.cpu().numpy())

                        valid_acc = accuracy_score(valid_labels_acc, valid_preds)
                        valid_prec, valid_rec, valid_f1, _ = precision_recall_fscore_support(
                            valid_labels_acc, valid_preds, average="macro", zero_division=0
                        )

                        if valid_f1 > best_f1:
                            best_f1 = valid_f1
                            best_metrics = {
                                "accuracy":  valid_acc,
                                "precision": valid_prec,
                                "recall":    valid_rec,
                                "f1":        valid_f1
                            }
                            # 確保 output_dir 存在
                            os.makedirs(output_dir, exist_ok=True)
                            torch.save(model.state_dict(), f"{output_dir}/best_mlp.pt")

                        print(
                            f"[MLP Epoch {epoch}] "
                            f"Train Loss: {avg_train_loss:.4f} | "
                            f"Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f} | "
                            f"Valid Acc: {valid_acc:.4f} | Valid F1: {valid_f1:.4f}"
                        )

                    elapsed = time.time() - start
                    total_size = 0
                    for root, _, files in os.walk(output_dir):
                        for fname in files:
                            total_size += os.path.getsize(os.path.join(root, fname))

                    best_metrics["train_time"] = elapsed
                    best_metrics["model_size"] = total_size
                    print(f"    → accuracy:  {best_metrics['accuracy']:.4f}")
                    print(f"    → precision: {best_metrics['precision']:.4f}")
                    print(f"    → recall:    {best_metrics['recall']:.4f}")
                    print(f"    → f1:        {best_metrics['f1']:.4f}")
                    print(f"    → model_size: {best_metrics['model_size']} bytes")

                    results.append((exp_name, best_metrics))

    # =============================================================================
    # 8. 把所有實驗結果匯出
    # =============================================================================
    out_df = pd.DataFrame([{"experiment": name, **metrics} for name, metrics in results])
    out_df.to_csv("experiment_results.csv", index=False)
    print("\n所有實驗完成，結果已存到 experiment_results.csv")

if __name__ == "__main__":
    main_pipeline()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



▶開始實驗 (TextCNN/MLP): MLP__flat_chapter
  [MLP] Step1: 用 tokenizer 編碼到 CPU
    → train_enc.shape: torch.Size([13469, 512])
    → valid_enc.shape: torch.Size([13469, 512])
  [MLP] Step2: 載入 BERT 做特徵擷取


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [MLP] Step3: 計算 train_feats
    → train_feats.shape = torch.Size([13469, 768])
  [MLP] Step4: 計算 valid_feats
    → valid_feats.shape = torch.Size([1497, 768])
  [MLP] Step5: 建立 MLP 並訓練
[MLP Epoch 1] Train Loss: 3.8882 | Train Acc: 0.0771 | Train F1: 0.0159 | Valid Acc: 0.1149 | Valid F1: 0.0187
[MLP Epoch 2] Train Loss: 3.5708 | Train Acc: 0.1150 | Train F1: 0.0306 | Valid Acc: 0.1630 | Valid F1: 0.0372
[MLP Epoch 3] Train Loss: 3.3779 | Train Acc: 0.1402 | Train F1: 0.0440 | Valid Acc: 0.1790 | Valid F1: 0.0433
[MLP Epoch 4] Train Loss: 3.2432 | Train Acc: 0.1634 | Train F1: 0.0567 | Valid Acc: 0.1884 | Valid F1: 0.0522
[MLP Epoch 5] Train Loss: 3.1460 | Train Acc: 0.1728 | Train F1: 0.0628 | Valid Acc: 0.2098 | Valid F1: 0.0642
    → accuracy:  0.2098
    → precision: 0.0738
    → recall:    0.1001
    → f1:        0.0642
    → model_size: 853272 bytes

▶開始實驗 (TextCNN/MLP): MLP__flat_section
  [MLP] Step1: 用 tokenizer 編碼到 CPU
    → train_enc.shape: torch.Size([13469, 512])
    → va

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [MLP] Step3: 計算 train_feats
    → train_feats.shape = torch.Size([13469, 768])
  [MLP] Step4: 計算 valid_feats
    → valid_feats.shape = torch.Size([1497, 768])
  [MLP] Step5: 建立 MLP 並訓練
[MLP Epoch 1] Train Loss: 5.0378 | Train Acc: 0.0279 | Train F1: 0.0031 | Valid Acc: 0.0508 | Valid F1: 0.0028
[MLP Epoch 2] Train Loss: 4.7460 | Train Acc: 0.0511 | Train F1: 0.0065 | Valid Acc: 0.0735 | Valid F1: 0.0052
[MLP Epoch 3] Train Loss: 4.5397 | Train Acc: 0.0679 | Train F1: 0.0104 | Valid Acc: 0.0828 | Valid F1: 0.0103
[MLP Epoch 4] Train Loss: 4.4081 | Train Acc: 0.0760 | Train F1: 0.0137 | Valid Acc: 0.0908 | Valid F1: 0.0119
[MLP Epoch 5] Train Loss: 4.2942 | Train Acc: 0.0852 | Train F1: 0.0171 | Valid Acc: 0.1129 | Valid F1: 0.0214
    → accuracy:  0.1129
    → precision: 0.0185
    → recall:    0.0419
    → f1:        0.0214
    → model_size: 980760 bytes

▶開始實驗 (TextCNN/MLP): MLP__hierarchical
  [Hierarchical] Step1: 訓練 subject 分類器


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'precision_score' is not defined