## Packages import

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
%pip install pytesseract pandas scikit-learn transformers json
%pip install torch

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import json

# Optional OCR tools
import pytesseract
from PIL import Image
# or use EasyOCR:
# import easyocr

[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for json[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# PyTorch
import torch

# Step 1: 自動選擇 device
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print("使用 Apple M1 GPU (MPS)")
else:
    device = torch.device("cpu")
    print("使用 CPU（找不到 MPS）")
# # Check GPU
# if torch.cuda.is_available():
#     device = torch.device('mps')
#     print(f'使用 GPU：{torch.cuda.get_device_name(0)}')
# else:
#     device = torch.device('cpu')
#     print('使用 CPU')

使用 Apple M1 GPU (MPS)


## Pre Processing

In [4]:
def load_and_merge(subject: str) -> pd.DataFrame:
    """
    Load question bank and chapter list CSVs for a given subject from Database/{subject}_Database and merge on chapter ID.

    Args:
        subject: 'math' or 'social'
    Returns:
        DataFrame with columns: question_text, chapter_name, subject
    """
    base_path = f"Database/{subject}_Database"
    qbank_path = f"{base_path}/{subject}_question_bank.csv"
    chap_path = f"{base_path}/{subject}_chapter_list.csv"
    
    qdf = pd.read_csv(qbank_path)
    cdf = pd.read_csv(chap_path)
    qdf.columns = qdf.columns.str.strip().str.lower().str.replace(' ', '_')
    cdf.columns = cdf.columns.str.strip().str.lower().str.replace(' ', '_')

    print("Question Bank Columns:", qdf.columns.tolist())
    print("Chapter List Columns:", cdf.columns.tolist())


    df = qdf.merge(cdf, on='section_name', how='left')
    df['subject'] = subject
    return df

# Load and combine data from correct paths
df = pd.concat([load_and_merge('math'), load_and_merge('social')], ignore_index=True)

df.count()
df.head()


Question Bank Columns: ['subject', 'ques_no', 'chapter_name', 'section_name', 'ques_detl']
Chapter List Columns: ['year_grade', 'book', 'chapter_num', 'chapter_name', 'section_num', 'section_name']
Question Bank Columns: ['subject', 'ques_no', 'chapter_name', 'section_name', 'ques_detl']
Chapter List Columns: ['year_grade', 'book', 'chapter_num', 'chapter_name', 'section_num', 'section_name']


Unnamed: 0,subject,ques_no,chapter_name_x,section_name,ques_detl,year_grade,book,chapter_num,chapter_name_y,section_num
0,math,2MA0810001,整數的運算,負數與數線,以中午12時為基準，下午3時記作＋3，那麼上午10時可以記作多少？(A)＋10(B)－10(...,8.0,1.0,1.0,整數的運算,1.0
1,math,2MA0810002,整數的運算,負數與數線,若數學科成績以30分為基準，得40分記作＋10，那麼考25分可記作多少？(A)－25　(B...,8.0,1.0,1.0,整數的運算,1.0
2,math,2MA0810003,整數的運算,負數與數線,如果以正午12時為基準，當日下午4時用＋4表示，則當日上午8時應記為多少？(A)－8　(B...,8.0,1.0,1.0,整數的運算,1.0
3,math,2MA0810004,整數的運算,負數與數線,如果以中午12時為基準，當日下午2時記為＋8，則當日上午6時應記為多少？(A)－24(B)－...,8.0,1.0,1.0,整數的運算,1.0
4,math,2MA0810005,整數的運算,負數與數線,如果＋40°表示北緯40°，則南緯30°該如何表示？(A)30°　(B)－30° (C)...,8.0,1.0,1.0,整數的運算,1.0


In [5]:
df = df[['subject','chapter_name_x','section_name','ques_detl']]
df.head()

Unnamed: 0,subject,chapter_name_x,section_name,ques_detl
0,math,整數的運算,負數與數線,以中午12時為基準，下午3時記作＋3，那麼上午10時可以記作多少？(A)＋10(B)－10(...
1,math,整數的運算,負數與數線,若數學科成績以30分為基準，得40分記作＋10，那麼考25分可記作多少？(A)－25　(B...
2,math,整數的運算,負數與數線,如果以正午12時為基準，當日下午4時用＋4表示，則當日上午8時應記為多少？(A)－8　(B...
3,math,整數的運算,負數與數線,如果以中午12時為基準，當日下午2時記為＋8，則當日上午6時應記為多少？(A)－24(B)－...
4,math,整數的運算,負數與數線,如果＋40°表示北緯40°，則南緯30°該如何表示？(A)30°　(B)－30° (C)...


In [6]:
df.count()

subject           33839
chapter_name_x    33839
section_name      33839
ques_detl         33839
dtype: int64

In [7]:
len(df)

33839

In [8]:
# 2. Label Encoding
# Combine subject and chapter_name as classification label

df['label_str'] = df['subject'] + '::' + df['chapter_name_x']
label2id = {lab: i for i, lab in enumerate(sorted(df['label_str'].unique()))}
id2label = {i: lab for lab, i in label2id.items()}
df['label'] = df['label_str'].map(label2id)

In [9]:
# 3. Train/Test Split
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)

# 4. Tokenization and Dataset Definition
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class QDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(text,
                                 padding='max_length',
                                 truncation=True,
                                 max_length=self.max_len,
                                 return_tensors='pt')
        item = {k: v.squeeze() for k, v in inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = QDataset(train_df['ques_detl'].tolist(), train_df['label'].tolist(), tokenizer)
test_dataset = QDataset(test_df['ques_detl'].tolist(), test_df['label'].tolist(), tokenizer)


In [10]:
# LoRA/DoRA 都在 PEFT 裏
%pip install -U peft           # >= 0.10 才有 use_dora 參數
%pip install -U transformers   # 你本來就有
%pip install -U accelerate     # 與 Trainer 配合

from peft import LoraConfig, get_peft_model


# 5. Model Initialization

# 給 BERT 做分類任務常見的幾個線性層；偷懶也可以 target_modules="all-linear"
dora_config = LoraConfig(
    r=8,                     # rank；小模型 r=8~16 通常夠用
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",             # 只訓練 LoRA / DoRA 權重
    task_type="SEQ_CLS",     # 句子分類
    target_modules=["query", "key", "value", "dense"],
    use_dora=True            # ✨ 關鍵開關
)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels= len(label2id)
)
model = get_peft_model(model, dora_config)
model.print_trainable_parameters()  # 檢查只有 DoRA 權重在訓練


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,453,095 || all params: 110,965,326 || trainable%: 1.3095


In [11]:
%pip uninstall pytorch  # ← 先移除錯的包
%pip install torch --upgrade

%pip install -U accelerate
%pip install -U transformers
%pip install --upgrade transformers


# 6. Training Arguments and Trainer Setup
training_args = TrainingArguments(
    output_dir='./outputs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
    
)

# Define metric computation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = (preds == labels).mean()
    return {'accuracy': acc}

def clean_collate_fn(batch):
    # 只保留模型需要的欄位
    valid_keys = {"input_ids", "attention_mask", "token_type_ids", "labels"}
    collated = {}
    for key in valid_keys:
        if key in batch[0]:
            collated[key] = torch.stack([item[key] for item in batch])
    return collated

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=clean_collate_fn  
)


[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Note: you may need to restart the kernel to use updated packages.


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# 7. Train and Evaluate
if __name__ == '__main__':
    trainer.train()
    metrics = trainer.evaluate()
    print(f"Test Accuracy: {metrics['eval_accuracy']:.4f}")

# 8. Save Model and Label Mappings
    model.merge_and_unload()
    os.makedirs('./saved_model', exist_ok=True)
    model.save_pretrained('./saved_model')
    tokenizer.save_pretrained('./saved_model')
    with open('./saved_model/label2id.json', 'w') as f:
        json.dump(label2id, f)
    with open('./saved_model/id2label.json', 'w') as f:
        json.dump(id2label, f)



Epoch,Training Loss,Validation Loss


In [12]:
# 9. Inference Pipeline
def ocr_to_text(image_path: str) -> str:
    # Using pytesseract
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img, lang='eng')
    return text.strip()
    # Or using EasyOCR:
    # reader = easyocr.Reader(['en'])
    # result = reader.readtext(image_path, detail=0)
    # return ' '.join(result)


def load_classifier(model_dir: str = './saved_model'):
    tokenizer = BertTokenizerFast.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    return tokenizer, model


def predict_text(text: str, tokenizer, model):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred_id = logits.argmax(dim=-1).item()
    label = id2label[pred_id]
    subject, chapter = label.split('::')
    return subject, chapter


def test_image(image_path: str):
    print("Processing:", image_path)
    text = ocr_to_text(image_path)
    print("OCR Result:\n", text)
    tokenizer, model = load_classifier()
    subject, chapter_name = predict_text(text, tokenizer, model)
    print(f"Predicted Subject: {subject}\nPredicted Chapter: {chapter_name}")

def test_text(text):
    tokenizer, model = load_classifier()
    subject, chapter_name = predict_text(text, tokenizer, model)
    print(f"Predicted Subject: {subject}\nPredicted Chapter: {chapter_name}")

# Usage examples
if __name__ == '__main__':
    # 1. Train and save
    #train()
    #save_model()

    # 2. Test with an example image
    test_image('test_math.png')

    test = "根據經濟部水利署的統計，截至2019年底，臺灣40座主要水庫中，淤積率超過30%的共有15座，例如霧社水庫淤積率達74.8%、烏山頭水庫達49.2%。顯示臺灣水庫淤積程度嚴重，影響水庫蓄水功能。下列何項策略最能有效改善上述現象"
    test_text(test)


Processing: test_math.png


FileNotFoundError: [Errno 2] No such file or directory: 'test_math.png'

## textCNN pipeline

In [13]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast
import json
from tqdm import tqdm

In [16]:
# 3. TextCNN Model Definition
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=[3,4,5], num_filters=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim,
                      out_channels=num_filters,
                      kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)  # [B, L, D]
        x = x.permute(0, 2, 1)        # [B, D, L]
        convs = [torch.relu(conv(x)) for conv in self.convs]  # list of [B, F, L-k+1]
        pools = [torch.max(c, dim=2)[0] for c in convs]     # list of [B, F]
        out = torch.cat(pools, dim=1)                       # [B, F*len]
        out = self.dropout(out)
        return self.fc(out)

# 4. Training and Evaluation Functions
# 4. Tokenization and Dataset Definition
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
model_cnn = TextCNN(vocab_size=len(tokenizer), embed_dim=128,
                    num_classes=len(label2id)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_cnn.parameters(), lr=1e-3)

from torch.utils.data import DataLoader

# Create DataLoaders for TextCNN training
class TextCNNDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(text, padding='max_length', truncation=True, 
                               max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets and loaders
train_dataset_cnn = TextCNNDataset(train_df['ques_detl'].tolist(), train_df['label'].tolist(), tokenizer)
test_dataset_cnn = TextCNNDataset(test_df['ques_detl'].tolist(), test_df['label'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset_cnn, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset_cnn, batch_size=32, shuffle=False)

def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_correct = 0, 0
    for batch in tqdm(loader, desc='Train'):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * input_ids.size(0)
        preds = outputs.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
    return total_loss/len(loader.dataset), total_correct/len(loader.dataset)

def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(loader, desc='Eval'):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * input_ids.size(0)
            preds = outputs.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
    return total_loss/len(loader.dataset), total_correct/len(loader.dataset)

# 5. Run Training and Compare
epochs = 5
best_acc = 0
for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model_cnn, train_loader, optimizer, criterion)
    val_loss, val_acc = eval_epoch(model_cnn, test_loader, criterion)
    print(f"Epoch {epoch+1}/{epochs} | "
          f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model_cnn.state_dict(), 'best_textcnn.pt')

print(f"Best TextCNN Val Acc: {best_acc:.4f}")

# 6. Save Mappings for CNN
with open('tokenizer_vocab.json', 'w') as f:
    json.dump(tokenizer.vocab, f)
with open('label2id.json', 'w') as f:
    json.dump(label2id, f)

# DoRA-Enhanced TextCNN Pipeline
from peft import LoraConfig, get_peft_model

# DoRA-Enhanced TextCNN Model
class DoRATextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=[3,4,5], num_filters=100):
        super().__init__()
        # Base embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        
        # Convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        
        # DoRA-inspired domain adaptation layers
        self.domain_adapters = nn.ModuleList([
            nn.Linear(num_filters, num_filters // 4)  # Low-rank adaptation
            for _ in kernel_sizes
        ])
        
        self.domain_scalers = nn.ModuleList([
            nn.Linear(num_filters // 4, num_filters)  # Scale back up
            for _ in kernel_sizes
        ])
        
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        
        # Initialize domain adaptation weights
        for adapter in self.domain_adapters:
            nn.init.xavier_uniform_(adapter.weight)
        for scaler in self.domain_scalers:
            nn.init.xavier_uniform_(scaler.weight)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)  # [B, L, D]
        x = x.permute(0, 2, 1)        # [B, D, L]
        
        conv_outputs = []
        for i, conv in enumerate(self.convs):
            conv_out = torch.relu(conv(x))  # [B, F, L-k+1]
            
            # Apply DoRA-style domain adaptation
            pooled = torch.max(conv_out, dim=2)[0]  # [B, F]
            adapted = self.domain_adapters[i](pooled)  # [B, F//4]
            scaled = self.domain_scalers[i](adapted)   # [B, F]
            
            # Residual connection with original pooled features
            enhanced = pooled + scaled
            conv_outputs.append(enhanced)
        
        out = torch.cat(conv_outputs, dim=1)  # [B, F*len]
        out = self.dropout(out)
        return self.fc(out)

# Initialize DoRA-enhanced model
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
model_dora_cnn = DoRATextCNN(
    vocab_size=len(tokenizer), 
    embed_dim=128,
    num_classes=len(label2id)
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_dora_cnn.parameters(), lr=1e-3)

print(f"DoRA-TextCNN initialized with {sum(p.numel() for p in model_dora_cnn.parameters())} parameters")
print(f"Device: {device}")

Train: 100%|██████████| 1904/1904 [01:12<00:00, 26.14it/s]
Train: 100%|██████████| 1904/1904 [01:12<00:00, 26.14it/s]
Eval: 100%|██████████| 106/106 [00:05<00:00, 20.31it/s]



Epoch 1/5 | Train Loss: 1.9435, Acc: 0.4293 | Val Loss: 1.2461, Acc: 0.6250


Train: 100%|██████████| 1904/1904 [00:53<00:00, 35.88it/s]
Train: 100%|██████████| 1904/1904 [00:53<00:00, 35.88it/s]
Eval: 100%|██████████| 106/106 [00:02<00:00, 35.96it/s]



Epoch 2/5 | Train Loss: 1.3654, Acc: 0.5841 | Val Loss: 1.0608, Acc: 0.6678


Train: 100%|██████████| 1904/1904 [00:53<00:00, 35.46it/s]
Train: 100%|██████████| 1904/1904 [00:53<00:00, 35.46it/s]
Eval: 100%|██████████| 106/106 [00:02<00:00, 36.35it/s]



Epoch 3/5 | Train Loss: 1.1951, Acc: 0.6335 | Val Loss: 1.0054, Acc: 0.6838


Train: 100%|██████████| 1904/1904 [01:08<00:00, 27.84it/s]
Train: 100%|██████████| 1904/1904 [01:08<00:00, 27.84it/s]
Eval: 100%|██████████| 106/106 [00:03<00:00, 30.65it/s]



Epoch 4/5 | Train Loss: 1.0878, Acc: 0.6629 | Val Loss: 0.9571, Acc: 0.7004


Train: 100%|██████████| 1904/1904 [00:50<00:00, 37.99it/s]
Train: 100%|██████████| 1904/1904 [00:50<00:00, 37.99it/s]
Eval: 100%|██████████| 106/106 [00:02<00:00, 36.55it/s]



Epoch 5/5 | Train Loss: 1.0200, Acc: 0.6804 | Val Loss: 0.9266, Acc: 0.7069
Best TextCNN Val Acc: 0.7069
DoRA-TextCNN initialized with 4087830 parameters
Device: mps


In [None]:
# DoRA Training Functions
def train_epoch_dora(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_correct = 0, 0
    for batch in tqdm(loader, desc='DoRA Train'):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * input_ids.size(0)
        preds = outputs.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
    
    return total_loss/len(loader.dataset), total_correct/len(loader.dataset)

def eval_epoch_dora(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(loader, desc='DoRA Eval'):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item() * input_ids.size(0)
            preds = outputs.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
    
    return total_loss/len(loader.dataset), total_correct/len(loader.dataset)

# Training Loop for DoRA-TextCNN
print("\n=== Training DoRA-Enhanced TextCNN ===")
epochs = 5
best_dora_acc = 0

for epoch in range(epochs):
    train_loss, train_acc = train_epoch_dora(model_dora_cnn, train_loader, optimizer, criterion)
    val_loss, val_acc = eval_epoch_dora(model_dora_cnn, test_loader, criterion)
    
    print(f"Epoch {epoch+1}/{epochs} | "
          f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
    
    if val_acc > best_dora_acc:
        best_dora_acc = val_acc
        torch.save(model_dora_cnn.state_dict(), 'best_dora_textcnn.pt')
        print(f"  → New best model saved! Accuracy: {val_acc:.4f}")

print(f"\nBest DoRA-TextCNN Val Acc: {best_dora_acc:.4f}")

# Save DoRA model mappings
with open('dora_tokenizer_vocab.json', 'w') as f:
    json.dump(tokenizer.vocab, f)
with open('dora_label2id.json', 'w') as f:
    json.dump(label2id, f)

print("DoRA-TextCNN training completed and model saved!")

# Print model comparison summary
print("\n=== Model Architecture Summary ===")
print(f"DoRA-TextCNN Parameters: {sum(p.numel() for p in model_dora_cnn.parameters()):,}")
print(f"Domain Adaptation Layers: {len(model_dora_cnn.domain_adapters)}")
print(f"Low-rank dimension: {model_dora_cnn.domain_adapters[0].out_features}")
print(f"Full feature dimension: {model_dora_cnn.domain_scalers[0].out_features}")
print(f"Compression ratio: {model_dora_cnn.domain_scalers[0].out_features / model_dora_cnn.domain_adapters[0].out_features:.1f}x")

In [None]:
# DoRA Inference Pipeline
def load_dora_classifier(model_path='best_dora_textcnn.pt'):
    """
    Load the trained DoRA-TextCNN model for inference
    """
    model = DoRATextCNN(
        vocab_size=len(tokenizer),
        embed_dim=128,
        num_classes=len(label2id)
    )
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

def predict_with_dora(text: str, model, tokenizer, top_k=3):
    """
    Predict using DoRA-TextCNN model with confidence scores
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, 
                      max_length=128, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids)
        probs = torch.softmax(outputs, dim=-1)
        top_probs, top_indices = torch.topk(probs, k=top_k, dim=-1)
    
    results = []
    for i in range(top_k):
        idx = top_indices[0][i].item()
        prob = top_probs[0][i].item()
        label = id2label[idx]
        subject, chapter = label.split('::')
        results.append({
            'subject': subject,
            'chapter': chapter,
            'confidence': prob
        })
    
    return results

def test_dora_model(test_texts=None):
    """
    Test the DoRA model with sample texts
    """
    if test_texts is None:
        test_texts = [
            "根據經濟部水利署的統計，截至2019年底，臺灣40座主要水庫中，淤積率超過30%的共有15座",
            "求解二次方程式 x² + 5x + 6 = 0 的解",
            "什麼是三角函數的基本性質？"
        ]
    
    print("\n=== DoRA Model Inference Testing ===")
    model = load_dora_classifier()
    
    for i, text in enumerate(test_texts, 1):
        print(f"\nTest {i}: {text[:50]}...")
        results = predict_with_dora(text, model, tokenizer, top_k=3)
        
        for j, result in enumerate(results, 1):
            print(f"  {j}. {result['subject']} - {result['chapter']} "
                  f"(Confidence: {result['confidence']:.3f})")

# Run the test
if __name__ == '__main__':
    test_dora_model()