將標籤轉換為數字（Label Encoding）

In [None]:
import pandas as pd
from datasets import Dataset

# Load the labelled examples from an Excel workbook. The code below relies on
# a 'label' column here; the tokenization step later reads a 'text' column.
df = pd.read_excel('your-data-path')  # replace with the path to your dataset

# Class name -> integer id. Replace the keys with the categories in your file.
label2id = {
    "時刻表": 0,
    "票價": 1,
    "乘車時間": 2,
    "其他問題": 3
}

# Series.map() turns every value that is not a key of label2id into NaN, which
# would silently produce float labels. Fail fast and list the offending values.
label_ids = df['label'].map(label2id)
unmapped = df.loc[label_ids.isna(), 'label']
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) have a label that is missing from label2id: "
        f"{sorted(unmapped.astype(str).unique())}"
    )
df['label'] = label_ids.astype('int64')

# Convert to a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# identical on every run of the notebook.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset

BERT Tokenizer 預處理

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the hfl/chinese-roberta-wwm-ext checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="hfl/chinese-roberta-wwm-ext",
)
# Tokenization helper applied to the dataset batch by batch.
def preprocess_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a "text" entry holding the raw input text(s).

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)


# Run the tokenization helper over every split, one batch at a time.
encoded_dataset = dataset.map(function=preprocess_function, batched=True)

# Print a summary of the tokenized dataset.
print(encoded_dataset)


In [None]:
# Inspect the first tokenized training example.
first_train_example = encoded_dataset["train"][0]
print(first_train_example)

微調（Fine-tuning）

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Inverse of label2id, stored in the model config so that predictions are
# reported with the class names instead of generic "LABEL_<n>" strings.
id2label = {idx: name for name, idx in label2id.items()}

# Load the checkpoint with a classification head. The number of classes is
# derived from label2id so the two cannot drift apart when categories change.
# NOTE(review): the hfl/chinese-roberta-wwm-ext model card asks for the BERT
# classes (e.g. BertForSequenceClassification) to load this checkpoint, so the
# RoBERTa class here may not pick up the pretrained weights - confirm. If the
# class is changed, every cell that reloads "output_dir" with
# RobertaForSequenceClassification must be changed at the same time.
model = RobertaForSequenceClassification.from_pretrained(
    "hfl/chinese-roberta-wwm-ext",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir="../results",          # where Trainer writes its checkpoints
    eval_strategy="epoch",           # evaluate at the end of every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # weight decay
    save_steps=1000,                 # save a checkpoint every 1000 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"]
)

# Start fine-tuning.
trainer.train()


In [None]:
# 儲存模型
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Write the model and the tokenizer into the same directory so that both can
# be reloaded from a single path.
save_dir = "output_dir"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)


測試模型

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers import AutoTokenizer
from transformers import pipeline

# Reload the saved model and tokenizer; "output_dir" can be any directory that
# save_pretrained() wrote to. The Auto class reads the architecture from the
# saved config instead of hard-coding it, so this cell keeps working if the
# model class used for training changes.
model = AutoModelForSequenceClassification.from_pretrained("output_dir")
tokenizer = AutoTokenizer.from_pretrained("output_dir")

# Build the inference pipeline.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Classify one question.
result = classifier("your-question")
print(result)


In [None]:
# Display only the label of the first entry in the pipeline output.
first_prediction = result[0]
first_prediction["label"]

重新訓練模型

In [None]:
# Load the labelled examples for the continued-training run from an Excel
# workbook. The code below relies on a 'label' column.
df = pd.read_excel("your-data-path")

# Class name -> integer id.
label2id = {
    "時刻表": 0,
    "票價": 1,
    "乘車時間": 2,
    "其他問題": 3
}

# Series.map() turns every value that is not a key of label2id into NaN, which
# would silently produce float labels. Fail fast and list the offending values.
label_ids = df['label'].map(label2id)
unmapped = df.loc[label_ids.isna(), 'label']
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) have a label that is missing from label2id: "
        f"{sorted(unmapped.astype(str).unique())}"
    )
df['label'] = label_ids.astype('int64')

# Convert to a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# identical on every run of the notebook.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset

使用舊模型的 tokenizer 進行 tokenize

In [None]:
from transformers import AutoTokenizer
# Reload the tokenizer from "output_dir", the directory it was saved to
# together with the previously trained model.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="output_dir",
)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a "text" entry holding the raw input text(s).

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

# Run the tokenization helper over every split, one batch at a time.
encoded_dataset = dataset.map(function=preprocess_function, batched=True)


載入已訓練好的模型

In [None]:
from transformers import AutoModelForSequenceClassification, RobertaForSequenceClassification

# "output_dir" is the directory the model was previously saved to. The Auto
# class reads the architecture from the saved config instead of hard-coding
# it, so this cell keeps working if the model class used for training changes.
model = AutoModelForSequenceClassification.from_pretrained("output_dir")


設定訓練參數並用 Trainer 繼續訓練

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the continued fine-tuning run. Checkpoints and logs go
# to their own directories, separate from the first run's output.
continue_training_options = dict(
    output_dir="./results_continue",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    logging_dir="./logs_continue",
    logging_steps=100,
)
training_args = TrainingArguments(**continue_training_options)

# Wire the reloaded model to the freshly tokenized train/test splits.
trainer = Trainer(
    eval_dataset=encoded_dataset["test"],
    train_dataset=encoded_dataset["train"],
    args=training_args,
    model=model,
)

# Start the additional training run.
trainer.train()


In [None]:
# 儲存模型
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Write the model and the tokenizer into the same directory so that both can
# be reloaded from a single path.
save_dir = "output_dir"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)