合併所有看板，並刪除字數大於510的文章。

In [None]:
import pandas as pd

import os

r_dir = './data/'
w_dir = './ptt_dataset/'
board_list = ["happy", "hate", "sad"]
MAX_LENGTH = 510

# Load the CSV export of every mood board and stack them into one frame.
# ignore_index=True gives the merged frame a clean 0..n-1 index instead of
# repeating each file's own row numbers.
dfs = [pd.read_csv(f"{r_dir}{board}.csv") for board in board_list]
df_train = pd.concat(dfs, ignore_index=True)

# Keep only posts whose text is at most MAX_LENGTH characters long.
# `.str.len()` yields NaN for a missing `content` cell and NaN <= MAX_LENGTH is
# False, so rows without text are dropped here rather than crashing with
# "object of type 'float' has no len()" as a per-row len() call would.
df_train = df_train[df_train["content"].str.len() <= MAX_LENGTH]

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the merged dataset.
os.makedirs(w_dir, exist_ok=True)
df_train.to_csv(f"{w_dir}dataset.csv", index=False)


分割dataset，訓練集、測試集、驗證集的比例為8:1:1。

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

rw_dir = './ptt_dataset/'
random_state = 1410832008

# The full frame is the feature set; the `board` column doubles as the label
# used for stratification.
df = pd.read_csv(f"{rw_dir}dataset.csv")
df_x, df_y = df, df['board']

# train : test : validation = 8 : 1 : 1
# First cut: 80% train versus a 20% hold-out, stratified on the board label.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    stratify=df_y,
    random_state=random_state,
)
# Second cut: halve the hold-out into test and validation, again stratified.
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=random_state,
)

# Write each split to its own .csv file.
_split_files = {
    f"{rw_dir}train.csv": X_train,
    f"{rw_dir}test.csv": X_test,
    f"{rw_dir}validation.csv": X_val,
}
for _path, _frame in _split_files.items():
    _frame.to_csv(_path, index=False)

# Print the class proportions of the full set and of every split so the
# stratification can be checked by eye.
for _labels in (df_y, y_train, y_test, y_val):
    print(_labels.value_counts(normalize=True))


預處理
1. 將資料集轉成Transformers datasets格式
2. 載入bert Tokenizer並斷詞
3. 定義每一種心情的label數值

In [3]:
"""預處理"""

from datasets import load_dataset
from transformers import AutoTokenizer

# Load the three CSV splits as a Hugging Face `datasets` dictionary.
r_dir = './ptt_dataset/'

data_files = {
    split: f"{r_dir}{split}.csv" for split in ("train", "test", "validation")
}
raw_datasets = load_dataset("csv", data_files=data_files, sep=",")

# Load the tokenizer that belongs to the BERT checkpoint being fine-tuned.
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenization function
def tokenize_function(example):
    """Tokenize the ``content`` field of ``example`` with truncation enabled.

    Relies on the module-level ``tokenizer``; returns whatever the tokenizer
    call produces for the given text(s).
    """
    texts = example["content"]
    return tokenizer(texts, truncation=True)

# Build the model-ready dataset in one pipeline:
#   1. tokenize every split in batches,
#   2. drop the raw text column, which is no longer needed once tokenized,
#   3. rename the board column to `labels`,
#   4. encode the mood names in `labels` as class labels.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(["content"])
    .rename_column("board", "labels")
    .class_encode_column("labels")
)

Using custom data configuration default-ed1cb8157bbbb21b
Reusing dataset csv (C:\Users\DevilHYT\.cache\huggingface\datasets\csv\default-ed1cb8157bbbb21b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\DevilHYT\.cache\huggingface\datasets\csv\default-ed1cb8157bbbb21b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-54a7a3cf882b552d.arrow
Loading cached processed dataset at C:\Users\DevilHYT\.cache\huggingface\datasets\csv\default-ed1cb8157bbbb21b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-267e4a2abd944cf2.arrow
Loading cached processed dataset at C:\Users\DevilHYT\.cache\huggingface\datasets\csv\default-ed1cb8157bbbb21b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-da341eebc47c18c6.arrow
Loading cached processed dataset at C:\Users\DevilHYT\.cache\huggingface\datasets\csv\default-ed1cb8157bbbb21b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-9af0cff25888864e.arrow
Loading cached processed dataset at C:\Users\DevilHYT\.cache\huggingface\datasets\csv\default-ed1cb8157bbbb21b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2be

載入用於Padding的Data Collator

In [4]:
from transformers import DataCollatorWithPadding

# Collator used for padding; it is built around the same tokenizer that
# produced the inputs and is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

載入bert分類模型，調整Hyper-parameter，將其移到GPU。

In [5]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-chinese"

# Load the pretrained encoder with a 3-way classification head (one output per
# mood board) and the dropout hyper-parameters chosen for this run.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    classifier_dropout=0.4,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)
# Move the model onto the first CUDA device.
model = model.to("cuda:0")


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

定義計算準確度的Function

In [6]:
import numpy as np
from datasets import load_metric

# Accuracy metric object; compute_metrics calls its .compute() method.
# NOTE(review): newer `datasets` releases appear to deprecate load_metric in
# favour of the separate `evaluate` package - confirm against the pinned version.
metric_accuracy = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit along the last axis, and the
    result is whatever ``metric_accuracy.compute`` returns for those
    predictions against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric_accuracy.compute(
        predictions=predicted_ids,
        references=references,
    )


設定Transformers Trainer的參數

In [7]:
from transformers import TrainingArguments

strategy = "epoch"
steps = 90
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because other cells may read this notebook-level variable.
dir = "bert-base-chinese-20220610-5"

training_args = TrainingArguments(
    output_dir=dir,

    # --- Schedule: evaluate, checkpoint and log on the same cadence ---
    # NOTE(review): with the "epoch" strategy the run log shows one checkpoint
    # per epoch (every 178 optimizer steps), so the three *_steps values below
    # look unused - confirm before relying on them.
    evaluation_strategy=strategy,
    save_strategy=strategy,
    logging_strategy=strategy,
    eval_steps=steps,
    save_steps=steps,
    logging_steps=steps,
    # logging_first_step=True,
    load_best_model_at_end=True,

    # --- Experiment tracking ---
    report_to="wandb",
    run_name="bert-base-chinese",

    # --- Batching: 16 per device x 4 accumulation steps = 64 per update ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    group_by_length=True,

    # --- Optimisation ---
    learning_rate=2e-6,
    num_train_epochs=10,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    # weight_decay=0.01,

    # --- Memory / speed trade-offs ---
    gradient_checkpointing=True,
    fp16=True,
)


建立Trainer的實例，並指定：
1. 使用的模型
2. Trainer參數
3. 資料集
4. padding的Data Collator
5. 斷詞器
6. 計算準確度的方式

In [8]:
from transformers import Trainer

# Wire everything together: the model to fine-tune, the run configuration, the
# train/validation splits, the padding collator, the tokenizer and the
# accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # The validation split is evaluated during training; the test split is
    # held back for the final prediction cell.
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Using amp half precision backend


開始訓練模型

In [9]:
# Launch fine-tuning; resume_from_checkpoint=False requests a fresh run
# instead of continuing from a previously saved checkpoint.
trainer.train(resume_from_checkpoint=False)


***** Running training *****
  Num examples = 11438
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 1780
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdevilhyt[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1780 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 1.2029, 'learning_rate': 6.629213483146066e-07, 'epoch': 1.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-178
Configuration saved in bert-base-chinese-20220610-5\checkpoint-178\config.json


{'eval_loss': 1.0407233238220215, 'eval_accuracy': 0.47692307692307695, 'eval_runtime': 8.3604, 'eval_samples_per_second': 171.045, 'eval_steps_per_second': 10.765, 'epoch': 1.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-178\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-178\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-178\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.9702, 'learning_rate': 1.3220973782771534e-06, 'epoch': 2.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-356
Configuration saved in bert-base-chinese-20220610-5\checkpoint-356\config.json


{'eval_loss': 0.7164854407310486, 'eval_accuracy': 0.7454545454545455, 'eval_runtime': 8.3749, 'eval_samples_per_second': 170.747, 'eval_steps_per_second': 10.746, 'epoch': 2.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-356\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-356\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-356\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.6151, 'learning_rate': 1.98876404494382e-06, 'epoch': 3.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-534
Configuration saved in bert-base-chinese-20220610-5\checkpoint-534\config.json


{'eval_loss': 0.4260358512401581, 'eval_accuracy': 0.8552447552447553, 'eval_runtime': 8.3859, 'eval_samples_per_second': 170.523, 'eval_steps_per_second': 10.732, 'epoch': 3.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-534\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-534\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-534\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.4159, 'learning_rate': 1.7191011235955056e-06, 'epoch': 4.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-712
Configuration saved in bert-base-chinese-20220610-5\checkpoint-712\config.json


{'eval_loss': 0.34175896644592285, 'eval_accuracy': 0.8797202797202798, 'eval_runtime': 8.4234, 'eval_samples_per_second': 169.765, 'eval_steps_per_second': 10.685, 'epoch': 4.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-712\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-712\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-712\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.3398, 'learning_rate': 1.4349919743178168e-06, 'epoch': 5.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-890
Configuration saved in bert-base-chinese-20220610-5\checkpoint-890\config.json


{'eval_loss': 0.3345033526420593, 'eval_accuracy': 0.8867132867132868, 'eval_runtime': 8.4682, 'eval_samples_per_second': 168.866, 'eval_steps_per_second': 10.628, 'epoch': 5.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-890\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-890\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-890\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.3044, 'learning_rate': 1.1492776886035314e-06, 'epoch': 6.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-1068
Configuration saved in bert-base-chinese-20220610-5\checkpoint-1068\config.json


{'eval_loss': 0.33028340339660645, 'eval_accuracy': 0.8867132867132868, 'eval_runtime': 8.3622, 'eval_samples_per_second': 171.008, 'eval_steps_per_second': 10.763, 'epoch': 6.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-1068\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-1068\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-1068\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.2758, 'learning_rate': 8.635634028892456e-07, 'epoch': 7.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-1246
Configuration saved in bert-base-chinese-20220610-5\checkpoint-1246\config.json


{'eval_loss': 0.29813963174819946, 'eval_accuracy': 0.8986013986013986, 'eval_runtime': 8.438, 'eval_samples_per_second': 169.471, 'eval_steps_per_second': 10.666, 'epoch': 7.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-1246\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-1246\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-1246\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.2626, 'learning_rate': 5.778491171749599e-07, 'epoch': 8.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-1424
Configuration saved in bert-base-chinese-20220610-5\checkpoint-1424\config.json


{'eval_loss': 0.2960302233695984, 'eval_accuracy': 0.9041958041958041, 'eval_runtime': 8.3875, 'eval_samples_per_second': 170.492, 'eval_steps_per_second': 10.73, 'epoch': 8.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-1424\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-1424\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-1424\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.2506, 'learning_rate': 2.921348314606741e-07, 'epoch': 9.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-1602
Configuration saved in bert-base-chinese-20220610-5\checkpoint-1602\config.json


{'eval_loss': 0.2880263924598694, 'eval_accuracy': 0.9034965034965035, 'eval_runtime': 8.3638, 'eval_samples_per_second': 170.975, 'eval_steps_per_second': 10.761, 'epoch': 9.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-1602\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-1602\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-1602\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1430
  Batch size = 16


{'loss': 0.2466, 'learning_rate': 6.420545746388443e-09, 'epoch': 10.0}


  0%|          | 0/90 [00:00<?, ?it/s]

Saving model checkpoint to bert-base-chinese-20220610-5\checkpoint-1780
Configuration saved in bert-base-chinese-20220610-5\checkpoint-1780\config.json


{'eval_loss': 0.29309555888175964, 'eval_accuracy': 0.9034965034965035, 'eval_runtime': 8.425, 'eval_samples_per_second': 169.734, 'eval_steps_per_second': 10.683, 'epoch': 10.0}


Model weights saved in bert-base-chinese-20220610-5\checkpoint-1780\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\checkpoint-1780\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\checkpoint-1780\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-base-chinese-20220610-5\checkpoint-1602 (score: 0.2880263924598694).


{'train_runtime': 1107.6024, 'train_samples_per_second': 103.268, 'train_steps_per_second': 1.607, 'train_loss': 0.48838024139404296, 'epoch': 10.0}


TrainOutput(global_step=1780, training_loss=0.48838024139404296, metrics={'train_runtime': 1107.6024, 'train_samples_per_second': 103.268, 'train_steps_per_second': 1.607, 'train_loss': 0.48838024139404296, 'epoch': 10.0})

<img src="./image/accuracy.png" width="30%">
<img src="./image/loss.png" width="30%">

釋放 PyTorch 在 VRAM 中暫存但已不再使用的快取記憶體

In [10]:
import torch
# Ask PyTorch to release the GPU memory it is still caching after training.
# NOTE(review): presumably this only frees cached blocks that are no longer
# in use, not memory held by live tensors - confirm in the PyTorch docs.
torch.cuda.empty_cache()


在測試集上評估模型的預測準確度，結果達到 90%。

In [11]:
# Run the fine-tuned model on the held-out test split. The returned
# PredictionOutput carries the logits, the label ids and the test metrics
# (test_loss, test_accuracy, runtime figures).
trainer.predict(test_dataset=tokenized_datasets["test"])


***** Running Prediction *****
  Num examples = 1430
  Batch size = 16


  0%|          | 0/90 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-1.728 , -1.357 ,  3.428 ],
       [ 3.666 , -1.156 , -1.995 ],
       [-1.134 ,  2.732 , -1.078 ],
       ...,
       [-2.086 ,  3.137 , -0.5977],
       [ 3.66  , -1.509 , -1.566 ],
       [ 3.43  , -1.1045, -1.3   ]], dtype=float16), label_ids=array([2, 0, 1, ..., 1, 0, 0], dtype=int64), metrics={'test_loss': 0.30578866600990295, 'test_accuracy': 0.9, 'test_runtime': 8.6411, 'test_samples_per_second': 165.489, 'test_steps_per_second': 10.415})

儲存模型

In [15]:
# Persist the trainer's state.
# NOTE(review): presumably written into the output directory - confirm.
trainer.save_state()
# Write the model config and weights, plus the tokenizer files, into the
# output directory so the fine-tuned model can be reloaded later.
trainer.save_model()

Saving model checkpoint to bert-base-chinese-20220610-5
Configuration saved in bert-base-chinese-20220610-5\config.json
Model weights saved in bert-base-chinese-20220610-5\pytorch_model.bin
tokenizer config file saved in bert-base-chinese-20220610-5\tokenizer_config.json
Special tokens file saved in bert-base-chinese-20220610-5\special_tokens_map.json
