In [None]:
# 📌 Install required packages (needs to be re-run in every Colab session)
# !pip install -U transformers datasets

# 📌 Import required packages
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import accelerate
print(accelerate.__version__)

# Print the transformers package version to confirm version compatibility
import transformers
print(transformers.__version__)

# 📌 Load the SMS dataset (make sure the file has been uploaded first)
df = pd.read_csv("train/train_data_0711_2.csv")  # CSV must contain 'text' and 'label' columns

# Convert the pandas DataFrame into a Hugging Face Dataset, keeping only the
# two columns that are used below.
dataset = Dataset.from_pandas(df[["text", "label"]])

# Split into 80% train / 20% test. The fixed seed makes the shuffle
# deterministic, so the reported metrics refer to the same held-out rows on
# every run and results from different runs can be compared.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 📌 Initialise the tokenizer (bert-base-chinese)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# 📌 Tokenization helper: turns each SMS text into the format the BERT model reads
def tokenize(example):
    """Encode the ``text`` field of *example* with the module-level tokenizer.

    Text that is too long is truncated and shorter text is padded, so every
    encoding has the same fixed length of 128.

    Returns whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "truncation": True,         # cut off text that is too long
        "padding": "max_length",    # pad up to the fixed length
        "max_length": 128,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize both splits, adding the tokenizer's output fields (token ids,
# attention mask, ...) as new columns.
# batched=True hands `tokenize` a batch of rows per call instead of one row at
# a time, so the tokenizer encodes many texts at once. Because `tokenize` pads
# and truncates every text to the same fixed length, the resulting columns are
# the same as with row-by-row mapping.
tokenized = dataset.map(tokenize, batched=True)

# 📌 Load BERT for binary classification. num_labels=2 because the label has
# two possible values (e.g. contains a person name or not / scam or not).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

# 📌 Define the training configuration
training_args = TrainingArguments(
    output_dir="./results",                 # where checkpoints and other files are written
    per_device_train_batch_size=16,         # training batch size per device
    num_train_epochs=3,                     # number of training epochs
    eval_strategy="epoch",                 # evaluate on the test split once per epoch
    save_strategy="epoch",                 # save a checkpoint once per epoch
    # Log the training loss once per epoch. With the default step-based
    # logging interval, a run this short can finish before the first log is
    # written, which leaves the "Training Loss" column showing "No log".
    logging_strategy="epoch",
    logging_dir="./logs",                  # folder for training logs
    load_best_model_at_end=True,           # reload the best checkpoint (by eval loss) when training ends
    report_to="none"                       # do not upload training records to wandb
)

# 📌 Metric function (accuracy, precision, recall, F1)
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)`` holding the model's prediction
            scores and the true labels.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``.
    """
    scores, gold = eval_pred
    # The predicted class is the one with the highest score in each row.
    predicted = scores.argmax(axis=1)
    # average='binary' reports precision/recall/F1 for the positive class only.
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }

# 📌 Build the Trainer, bundling the model, the data splits and the training arguments
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,    # needed for accuracy and F1 to be reported
    eval_dataset=tokenized["test"],     # test split
    train_dataset=tokenized["train"],   # training split
)

# 📌 Run training
trainer.train()

# 📌 Evaluate on the test split and print every metric with four decimals
results = trainer.evaluate()
print("📊 評估結果：")
for metric_name in results:
    print(f"{metric_name}: {results[metric_name]:.4f}")

# 📌 Save the trained model and tokenizer (for later deployment or reloading)
model.save_pretrained("./results/final")
tokenizer.save_pretrained("./results/final")

# 📌 Pack the saved model into model_results.zip for easy download.
# The archive is built in-process with the standard library instead of an
# IPython-only `!zip` shell escape, so this also runs as a plain Python script
# and needs no external `zip` binary. It also avoids forking a subprocess
# after the tokenizer has been used (presumably what triggers the
# huggingface/tokenizers "process just got forked" warning).
# Entries are stored under results/final/, the same layout `zip -r` produces.
import shutil

shutil.make_archive("model_results", "zip", root_dir=".", base_dir="results/final")

print("已完成訓練")

1.8.1
4.53.1


Map:   0%|          | 0/1897 [00:00<?, ? examples/s]

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.00872,0.997895,0.995434,1.0,0.997712
2,No log,0.009801,0.997895,0.995434,1.0,0.997712
3,No log,0.017512,0.997895,0.995434,1.0,0.997712




📊 評估結果：
eval_loss: 0.0087
eval_accuracy: 0.9979
eval_precision: 0.9954
eval_recall: 1.0000
eval_f1: 0.9977
eval_runtime: 9.8179
eval_samples_per_second: 48.3810
eval_steps_per_second: 6.1110
epoch: 3.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: results/final/ (stored 0%)
  adding: results/final/model.safetensors (deflated 7%)
  adding: results/final/tokenizer_config.json (deflated 75%)
  adding: results/final/special_tokens_map.json (deflated 42%)
  adding: results/final/config.json (deflated 54%)
  adding: results/final/tokenizer.json (deflated 75%)
  adding: results/final/vocab.txt (deflated 48%)
已完成訓練
