# In [None]:  (Jupyter cell prompt left over from the notebook export)
# 📌 安裝必要套件（Colab 每次都需要執行）
# !pip install -U transformers datasets

# 📌 Step 1: 匯入套件
import pandas as pd
import re
# 📌 Step 2: 匯入 Hugging Face 的分類訓練工具
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# 📌 Step 3: Load the SMS data and convert it into a Hugging Face Dataset
# Replace the file name with your own CSV; it must provide both the "text"
# and the "label" columns, because only those two are kept below.
df = pd.read_csv("sms_gpt_f2.csv")
dataset = Dataset.from_pandas(df[["text", "label"]]).train_test_split(
    test_size=0.2,  # hold out 20% of the rows as the "test" split
)

# 📌 Step 4: Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

def tokenize(ex):
    """Run the module-level ``tokenizer`` over ``ex["text"]``.

    The text is truncated and padded to ``max_length`` so every encoded
    example comes out with a fixed length of 128.

    Args:
        ex: A mapping that provides a ``"text"`` entry.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(ex["text"], **encode_options)


# Tokenize in batches: `tokenize` forwards ex["text"] straight to the
# tokenizer, which accepts a list of texts as well as a single string, so
# batched=True produces the same fixed-length columns while avoiding one
# Python-level tokenizer call per row.
tokenized = dataset.map(tokenize, batched=True)

# 📌 Step 5: Build the BERT classification model
# Loads the "bert-base-chinese" checkpoint configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=2,
)

# 📌 Step 6: Configure the training arguments
import inspect

# Newer transformers releases call this option `eval_strategy`, while older
# ones only know `evaluation_strategy`. Pick whichever name the installed
# version accepts so the script runs on both instead of raising a TypeError.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
    # Requires the evaluation and save strategies to match (both "epoch").
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)

# 📌 Step 7: Train and evaluate the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)

trainer.train()

# Keep and print the evaluation metrics: a bare `trainer.evaluate()` that is
# not the last expression of a cell (or that runs in a plain script) would
# silently discard its return value.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# 📌 Step 8: Save the tokenizer and the model
# Both are written to the same directory, tokenizer first and then the model.
for artifact in (tokenizer, model):
    artifact.save_pretrained("./results")