In [None]:
# 📌 Install required packages (must be re-run in every Colab session)
!pip install -U transformers datasets

# 📌 匯入必要套件
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import transformers
# Show which transformers version is installed (the pip step uses -U, so the
# version can differ between sessions).
print(transformers.__version__)

# 📌 Load the dataset (make sure sms_gpt_f2.csv has been uploaded to Drive)
df = pd.read_csv("/content/drive/MyDrive/sms_gpt_f2.csv")  # must contain "text" and "label" columns
# Keep only the two columns the model consumes and drop incomplete rows: a
# missing text would reach the tokenizer as None, and a missing label turns the
# whole label column into floats. Labels are cast to int so they are treated as
# class ids. `df` itself is left untouched.
clean_df = (
    df[["text", "label"]]
    .dropna()
    .astype({"label": int})
    .reset_index(drop=True)
)
dataset = Dataset.from_pandas(clean_df, preserve_index=False)
# Fixed seed so the 80/20 train/test split is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 📌 Initialize the tokenizer
pretrained_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# 📌 Tokenize the data
def tokenize(example):
    """Encode the ``text`` field of ``example`` with the module-level tokenizer.

    Inputs are truncated and padded to a fixed length of 128 tokens.

    Args:
        example: Mapping with a ``"text"`` entry to encode.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# batched=True hands `tokenize` a batch of rows per call, so the tokenizer
# encodes many texts at once instead of being invoked once per example.
# The resulting columns are the same.
tokenized = dataset.map(tokenize, batched=True)

# 📌 Load the BERT model for binary classification
num_classes = 2
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=num_classes,
)

# 📌 Training arguments
training_config = {
    "output_dir": "./results",
    "per_device_train_batch_size": 16,
    "num_train_epochs": 3,
    # Evaluate and checkpoint once per epoch.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": "./logs",
    "load_best_model_at_end": True,
    "report_to": "none",  # ✅ turn off wandb
}
training_args = TrainingArguments(**training_config)

# 📌 Initialize the Trainer
def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Without this hook the evaluation output contains only the loss.

    Args:
        eval_pred: The (predictions, label_ids) pair that ``Trainer`` passes
            to its ``compute_metrics`` callback.

    Returns:
        dict: ``{"accuracy": fraction of examples whose arg-max class equals
        the label}``.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    logits, labels = eval_pred
    if isinstance(logits, tuple):  # keep only the first output if several are returned
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    compute_metrics=compute_metrics,
)

# 📌 Train and evaluate
trainer.train()
# evaluate() returns the metrics dict; capture and print it, because a bare
# expression in the middle of a notebook cell is not displayed.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# 📌 Save the model and tokenizer
export_dir = "./results/final"
# Model first, then tokenizer, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Archive the saved model directory into model_results.zip (notebook shell command).
!zip -r model_results.zip ./results/final

