# 模型訓練

- 資料載入
- Tokenizer
- Dataset 建立
- BERT 訓練
- 模型評估與儲存

In [5]:
from transformers import TrainingArguments
# Smoke test: if the installed transformers version rejected the
# `evaluation_strategy` keyword, the constructor call would raise before the
# success message below is printed.
args = TrainingArguments(output_dir="test", evaluation_strategy="epoch")
print("✅ 成功支援 evaluation_strategy")


✅ 成功支援 evaluation_strategy


In [6]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
print("CUDA 可用:", cuda_ready)
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "無"
print("GPU 名稱:", gpu_label)


CUDA 可用: True
GPU 名稱: NVIDIA GeForce RTX 2070 SUPER


In [7]:
# ✅ 1. 套件與資料匯入
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ 使用裝置:", device)

# Load the pre-merged dataset (make sure the path is correct).
df = pd.read_csv("../data/processed/combined_multilang.csv")  # expected to contain 'text' and 'label' columns
df.head()

✅ 使用裝置: cuda


Unnamed: 0,text,label
0,awww thats a bummer you shoulda got david car...,0
1,is upset that he cant update his facebook by t...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no its not behaving at all im mad why am i her...,0


In [8]:
# ✅ 2. Train/validation split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# import re
# from tabulate import tabulate

# # ✅ 檢查是否為中文
# def is_chinese(text):
#     return bool(re.search(r'[\u4e00-\u9fff]', str(text)))

# # ✅ 計算語言類別（不修改原始 df）
# lang_series = df["text"].apply(lambda x: "ch" if is_chinese(x) else "en")

# # ✅ 切分資料（含語言標籤）
# train_texts, val_texts, train_labels, val_labels, train_langs, val_langs = train_test_split(
#     df["text"], df["label"], lang_series, test_size=0.2, random_state=42
# )

# # ✅ 建立 DataFrame 用於統計
# train_df = pd.DataFrame({"text": train_texts, "label": train_labels, "lang": train_langs})
# val_df = pd.DataFrame({"text": val_texts, "label": val_labels, "lang": val_langs})

# # ✅ 自訂統計函數
# def compute_custom_stats(df, kind):
#     stats = df.groupby(["lang", "label"]).size().unstack(fill_value=0)
#     stats[0] = stats.get(0, 0)
#     stats[1] = stats.get(1, 0)
#     stats["總數"] = stats[0] + stats[1]
#     stats["0(負面)占比"] = stats[0] / stats["總數"]
#     stats["1(正面)占比"] = stats[1] / stats["總數"]
#     stats["種類"] = kind
#     stats = stats.reset_index()
#     stats = stats.rename(columns={0: "0(負面)數量", 1: "1(正面)數量", "lang": "語言"})
#     return stats[["語言", "0(負面)數量", "1(正面)數量", "0(負面)占比", "1(正面)占比", "種類"]]

# # ✅ 合併統計資料
# train_stats = compute_custom_stats(train_df, "train")
# val_stats = compute_custom_stats(val_df, "val")
# all_stats = pd.concat([train_stats, val_stats], ignore_index=True)

# # ✅ 格式化小數點
# all_stats["0(負面)占比"] = all_stats["0(負面)占比"].round(4)
# all_stats["1(正面)占比"] = all_stats["1(正面)占比"].round(4)

# # ✅ 美化表格輸出
# print(tabulate(
#     all_stats,
#     headers=["語言", "0(負面)數量", "1(正面)數量", "0(負面)占比", "1(正面)占比", "種類"],
#     tablefmt="grid",
#     showindex=False
# ))


+--------+---------------+---------------+---------------+---------------+--------+
| 語言   |   0(負面)數量 |   1(正面)數量 |   0(負面)占比 |   1(正面)占比 | 種類   |
| ch     |          3842 |          3854 |        0.4992 |        0.5008 | train  |
+--------+---------------+---------------+---------------+---------------+--------+
| en     |        613870 |        602336 |        0.5047 |        0.4953 | train  |
+--------+---------------+---------------+---------------+---------------+--------+
| ch     |           957 |           944 |        0.5034 |        0.4966 | val    |
+--------+---------------+---------------+---------------+---------------+--------+
| en     |        153209 |        150866 |        0.5039 |        0.4961 | val    |
+--------+---------------+---------------+---------------+---------------+--------+


In [9]:
# ✅ 3. Load the multilingual BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


# Tokenize helper
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are enabled, with ``max_length=128``.
    Returns whatever the tokenizer call returns.
    """
    encode_options = {"padding": True, "truncation": True, "max_length": 128}
    return tokenizer(texts, **encode_options)



In [10]:
# ✅ 4. Tokenize the training and validation texts
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

In [11]:
# ✅ 5. Custom Dataset wrapper
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to an indexable of per-example
            values (``.items()`` is iterated and each value is indexed by
            position).
        labels: Indexable of per-example labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Cast to a Python int so the label tensor is always int64, even when
        # labels arrive as floats or NumPy scalars. This matches the later
        # SentimentDataset definitions in this notebook.
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap the tokenized splits together with their labels.
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [24]:
# ✅ 6. Load the model
# Multilingual BERT checkpoint with num_labels=2, moved to `device`.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=2
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# ✅ 7. Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Unpackable into ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    metrics = {"accuracy": accuracy_score(gold, predicted)}
    metrics["f1"] = prf[2]
    metrics["precision"] = prf[0]
    metrics["recall"] = prf[1]
    return metrics


In [None]:
# ✅ 8. Training arguments
# NOTE(review): evaluation, checkpointing and logging are all set to "epoch",
# so nothing is evaluated, saved or logged before the first epoch completes.
# An interrupted first epoch therefore leaves no automatic checkpoint.
training_args = TrainingArguments(
    output_dir="outputs/bert_model",
    do_train=True,
    num_train_epochs=1,  # ✅ start with 1 to check that the run completes and saves
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",               # ✅ when checkpoints are saved
    logging_dir="outputs/logs",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,                  # ✅ keep at most 2 checkpoints
    fp16=True,                           # ✅ mixed-precision training (requires a GPU)
    overwrite_output_dir=True            # ✅ overwrite the output directory if it already exists
)


In [38]:
# ✅ 9. Trainer
# Ties together the model, the training arguments, both dataset splits and the
# metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [45]:
# ✅ 10. Start training
trainer.train()

  3%|▎         | 2037/76494 [10:25<6:20:52,  3.26it/s]
  3%|▎         | 2130/76494 [00:40<23:35, 52.52it/s]  
  0%|          | 42/76494 [00:06<3:09:29,  6.72it/s]

KeyboardInterrupt: 

In [None]:
# ✅ 11. Save the model and tokenizer
# Both are written to the same directory.
model.save_pretrained("models/bert_multilang")
tokenizer.save_pretrained("models/bert_multilang")
print("✅ 模型與 tokenizer 已儲存至 models/bert_multilang")

# 使用checkpoint 繼續訓練
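前 5 個步驟（套件與資料匯入、資料切分、tokenizer、tokenization、Dataset 建立）以及第 7 步（定義 `compute_metrics`）照之前的方式執行即可；從第 6 步（載入模型）開始改用以下版本。若略過這些步驟，`train_dataset`、`val_dataset` 與 `compute_metrics` 不會存在，建立 Trainer 時會出現 `NameError`。
- 重新載入 tokenizer 與模型
- 重新定義 training_args，並在呼叫 `trainer.train(resume_from_checkpoint=...)` 時指定 checkpoint 路徑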

In [None]:
# Manually save the current model as a resumable checkpoint
import os

trainer.save_model("outputs/bert_model_manual_save")
save_path = "outputs/checkpoint-manual"

os.makedirs(save_path, exist_ok=True)
trainer.save_model(save_path)  # model weights + config
tokenizer.save_pretrained(save_path)  # tokenizer files

# Save optimizer / scheduler / trainer state.
# Either attribute may still be None if training never got far enough to create
# it; calling .state_dict() on None would raise AttributeError after the
# checkpoint was already partly written, so skip the file and warn instead.
if trainer.optimizer is not None:
    torch.save(trainer.optimizer.state_dict(), os.path.join(save_path, "optimizer.pt"))
else:
    print("⚠️ trainer.optimizer 尚未建立，略過 optimizer.pt")
if trainer.lr_scheduler is not None:
    torch.save(trainer.lr_scheduler.state_dict(), os.path.join(save_path, "scheduler.pt"))
else:
    print("⚠️ trainer.lr_scheduler 尚未建立，略過 scheduler.pt")
trainer.state.save_to_json(os.path.join(save_path, "trainer_state.json"))


In [40]:
# Dump the resolved TrainingArguments for inspection.
print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the model and tokenizer from the manually saved checkpoint directory.
model = BertForSequenceClassification.from_pretrained("outputs/checkpoint-manual")
tokenizer = BertTokenizer.from_pretrained("outputs/checkpoint-manual")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import TrainingArguments

# Training arguments for the resumed run.
training_args = TrainingArguments(
    output_dir="outputs/bert_model",  # may differ from the original run
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="outputs/logs",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    fp16=True,
    # This field takes a checkpoint *path* (str), not a bool, and per the
    # Transformers docs the Trainer does not read it; it exists for user
    # scripts. The resume itself is requested by passing the path to
    # trainer.train(resume_from_checkpoint=...).
    resume_from_checkpoint="outputs/checkpoint-manual",
)


In [4]:
from transformers import Trainer

# Rebuild the Trainer around the reloaded model.
# Prerequisite: `train_dataset`, `val_dataset` and `compute_metrics` are not
# defined in this cell and must already exist in the current kernel session.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Passing the checkpoint directory here is what requests the resume.
trainer.train(resume_from_checkpoint="outputs/checkpoint-manual")


NameError: name 'train_dataset' is not defined

# 先用小資料集測試
- 小資料集的載入與分析（1000筆）
- tokenizer 與 Dataset 包裝
- Trainer 建立與訓練執行

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# ✅ Detect whether a value contains Chinese characters
def is_chinese(text):
    """Return True if ``str(text)`` contains a character in U+4E00..U+9FFF."""
    found = re.search(r'[\u4e00-\u9fff]', str(text))
    return found is not None

# ✅ Load the raw data
df = pd.read_csv("../data/processed/combined_multilang.csv")

# ✅ Add a language column: "ch" if the text contains Chinese characters, else "en"
df["lang"] = df["text"].apply(lambda x: "ch" if is_chinese(x) else "en")

# ✅ Draw a small sample (1000 rows)
df_sampled = df.sample(n=1000, random_state=42).reset_index(drop=True)

# ✅ Split into training and validation sets
# Text, label and language are split in a single call so the three stay
# row-aligned by construction, instead of relying on two separate calls
# happening to use the same seed and input length.
(
    train_texts, val_texts,
    train_labels, val_labels,
    train_langs, val_langs,
) = train_test_split(
    df_sampled["text"], df_sampled["label"], df_sampled["lang"],
    test_size=0.2, random_state=42,
)

# ✅ Assemble the per-split DataFrames
train_df = pd.DataFrame({"text": train_texts, "label": train_labels, "lang": train_langs})
val_df = pd.DataFrame({"text": val_texts, "label": val_labels, "lang": val_langs})

# ✅ Statistics helper
def compute_custom_stats(df, kind):
    """Count label 0 / label 1 rows per language and their share of the two.

    Args:
        df: DataFrame with ``lang`` and ``label`` columns.
        kind: Value written to the ``種類`` column of every output row.

    Returns:
        DataFrame with one row per language and the columns 語言, 0(負面)數量,
        1(正面)數量, 0(負面)占比, 1(正面)占比, 種類.
    """
    counts = df.groupby(["lang", "label"]).size().unstack(fill_value=0)
    # Ensure both label columns exist even when one label never occurs.
    for label_value in (0, 1):
        counts[label_value] = counts.get(label_value, 0)
    total = counts[0] + counts[1]
    counts["總數"] = total
    counts["0(負面)占比"] = counts[0] / total
    counts["1(正面)占比"] = counts[1] / total
    counts["種類"] = kind
    column_names = {0: "0(負面)數量", 1: "1(正面)數量", "lang": "語言"}
    table = counts.reset_index().rename(columns=column_names)
    wanted = ["語言", "0(負面)數量", "1(正面)數量", "0(負面)占比", "1(正面)占比", "種類"]
    return table[wanted]

# ✅ Build the combined statistics table (train rows first, then val rows)
train_stats = compute_custom_stats(train_df, "train")
val_stats = compute_custom_stats(val_df, "val")
all_stats = pd.concat([train_stats, val_stats], ignore_index=True)
all_stats["0(負面)占比"] = all_stats["0(負面)占比"].round(4)
all_stats["1(正面)占比"] = all_stats["1(正面)占比"].round(4)

# ✅ Print the table in grid format
print(tabulate(
    all_stats,
    headers=["語言", "0(負面)數量", "1(正面)數量", "0(負面)占比", "1(正面)占比", "種類"],
    tablefmt="grid",
    showindex=False
))

+--------+---------------+---------------+---------------+---------------+--------+
| 語言   |   0(負面)數量 |   1(正面)數量 |   0(負面)占比 |   1(正面)占比 | 種類   |
| ch     |             5 |             2 |        0.7143 |        0.2857 | train  |
+--------+---------------+---------------+---------------+---------------+--------+
| en     |           381 |           412 |        0.4805 |        0.5195 | train  |
+--------+---------------+---------------+---------------+---------------+--------+
| ch     |             1 |             1 |        0.5    |        0.5    | val    |
+--------+---------------+---------------+---------------+---------------+--------+
| en     |           101 |            97 |        0.5101 |        0.4899 | val    |
+--------+---------------+---------------+---------------+---------------+--------+


In [3]:
import torch
from transformers import BertTokenizer

# ✅ Load the multilingual BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


# ✅ Tokenize helper
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    ``texts`` must provide ``.tolist()``; the resulting list is tokenized with
    padding and truncation enabled and ``max_length=128``.
    """
    sentences = texts.tolist()
    return tokenizer(sentences, padding=True, truncation=True, max_length=128)


# ✅ Tokenize the small training and validation splits
small_train_encodings = tokenize_function(train_df["text"])
small_val_encodings = tokenize_function(val_df["text"])

# ✅ Dataset wrapper
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(int(self.labels[idx]))  # force a Python int
        return sample

# ✅ Wrap the small splits together with their labels
small_train_dataset = SentimentDataset(small_train_encodings, train_df["label"].tolist())
small_val_dataset = SentimentDataset(small_val_encodings, val_df["label"].tolist())



In [6]:
from transformers import Trainer

# ✅ Load the tokenizer
# NOTE(review): this repeats the tokenizer load and tokenization from the
# previous cell with the same arguments, and relies on `torch` and
# `BertTokenizer` having been imported there.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# ✅ Tokenize the small splits
small_train_encodings = tokenizer(train_df["text"].tolist(), padding=True, truncation=True, max_length=128)
small_val_encodings = tokenizer(val_df["text"].tolist(), padding=True, truncation=True, max_length=128)

# ✅ Custom Dataset wrapper
class SentimentDataset(torch.utils.data.Dataset):
    """Expose tokenizer encodings and labels as per-example tensor dicts."""

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``."""
        tensors = dict(
            (name, torch.tensor(values[idx])) for name, values in self.encodings.items()
        )
        label_value = int(self.labels[idx])  # make sure the label is an int
        tensors["labels"] = torch.tensor(label_value)
        return tensors

# ✅ Wrap the small splits together with their labels
small_train_dataset = SentimentDataset(small_train_encodings, train_df["label"].tolist())
small_val_dataset = SentimentDataset(small_val_encodings, val_df["label"].tolist())

# ✅ Evaluation metric function (reused from the full-data section)
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Unpackable into ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    import numpy as np
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    raw_scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    guesses = np.argmax(raw_scores, axis=-1)
    p, r, f, _support = precision_recall_fscore_support(references, guesses, average="binary")
    return dict(
        accuracy=accuracy_score(references, guesses),
        f1=f,
        precision=p,
        recall=r,
    )

from transformers import BertForSequenceClassification

# ✅ Load the BERT model configured for 2 classes
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2
)

from transformers import TrainingArguments

# Training arguments for the small-sample trial run.
training_args = TrainingArguments(
    output_dir="./outputs/bert_model_test",     # where checkpoints are written
    num_train_epochs=1,                         # number of training epochs
    per_device_train_batch_size=8,              # training batch size per device
    per_device_eval_batch_size=8,               # evaluation batch size per device
    evaluation_strategy="epoch",                # evaluate once per epoch
    save_strategy="epoch",                      # save once per epoch
    logging_dir="./outputs/logs",               # where logs are written
    logging_strategy="epoch",                   # log once per epoch
    load_best_model_at_end=True,                # reload the best model at the end (author's note: judged by eval loss)
)


# ✅ Build the Trainer on the small dataset
trainer = Trainer(
    model=model,
    args=training_args,  # if this is too heavy, lower the epoch count or batch size
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)

# ✅ Start training
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:15<00:00,  6.86it/s]

{'loss': 0.6723, 'grad_norm': 5.030803680419922, 'learning_rate': 0.0, 'epoch': 1.0}


                                                 
100%|██████████| 100/100 [00:16<00:00,  6.86it/s]

{'eval_loss': 0.6522153615951538, 'eval_accuracy': 0.66, 'eval_f1': 0.728, 'eval_precision': 0.5986842105263158, 'eval_recall': 0.9285714285714286, 'eval_runtime': 0.9022, 'eval_samples_per_second': 221.679, 'eval_steps_per_second': 27.71, 'epoch': 1.0}


100%|██████████| 100/100 [00:36<00:00,  2.77it/s]

{'train_runtime': 36.106, 'train_samples_per_second': 22.157, 'train_steps_per_second': 2.77, 'train_loss': 0.6723469543457031, 'epoch': 1.0}





TrainOutput(global_step=100, training_loss=0.6723469543457031, metrics={'train_runtime': 36.106, 'train_samples_per_second': 22.157, 'train_steps_per_second': 2.77, 'total_flos': 52622211072000.0, 'train_loss': 0.6723469543457031, 'epoch': 1.0})

In [15]:
# Quick manual check on a few English and Chinese sentences
test_sentences = [
    "I am very happy with this product!",
    "這東西爛透了。",
    "The service was okay, but not great.",
    "我非常滿意這次的購買經驗。",
    "爛",
    "好"
]

test_encodings = tokenizer(
    test_sentences,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# ✅ Move the inputs to the device the model is on
test_encodings = {k: v.to(model.device) for k, v in test_encodings.items()}

# ✅ Predict
# Put the model in evaluation mode first: if it were still in training mode,
# dropout would stay active and predictions could vary from run to run.
model.eval()
with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = torch.argmax(outputs.logits, dim=1)

# ✅ Show the results (class 1 is reported as positive, anything else as negative)
for sentence, pred in zip(test_sentences, predictions):
    label = "正面" if pred.item() == 1 else "負面"
    print(f"句子：{sentence}\n→ 預測結果：{label}\n")


句子：I am very happy with this product!
→ 預測結果：正面

句子：這東西爛透了。
→ 預測結果：正面

句子：The service was okay, but not great.
→ 預測結果：正面

句子：我非常滿意這次的購買經驗。
→ 預測結果：正面

句子：爛
→ 預測結果：正面

句子：好
→ 預測結果：正面

