### 確認 GPU

In [1]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 3060 Ti (UUID: GPU-8b899a89-6fce-5185-fb3f-956158e0eace)


In [2]:
!chcp 65001

Active code page: 65001


### 實作 BERT 情緒分類 



dataset 來源: https://github.com/SophonPlus/ChineseNlpCorpus

#### 外賣資料集格式
label | review |
:--- | :--- |
`1`代表正面<br><br>`0`代表負面 |  評論內容 |

In [3]:
import pandas as pd
# Load the complete review corpus into a DataFrame.
pd_all = pd.read_csv('./waimai_10k_zh_tw.csv')

# Count rows overall and per sentiment label (1 = positive, 0 = negative).
total_reviews = len(pd_all)
positive_reviews = int((pd_all.label == 1).sum())
negative_reviews = int((pd_all.label == 0).sum())

print('評論總數：%d' % total_reviews)
print('正面評論：%d' % positive_reviews)
print('負面評論：%d' % negative_reviews)

評論總數：11987
正面評論：4000
負面評論：7987


In [5]:
# Display 10 randomly drawn rows as a quick look at the raw data.
pd_all.sample(10)

Unnamed: 0,label,review
7264,0,餐品送錯，告知也不理睬，不負責任
3251,1,這次送餐很快
4070,0,不是現做的沒事，至少您也熱一下吧。
770,1,東西有點硬，不及別的地方吃的鬆軟
7113,0,給送的菜都不全，我要兩份給送一份，您家倒是挺省呀？
11959,0,送了快兩個鐘頭才到……
9448,0,送的慢，態度差
4034,0,"沒給吸管,怎麼喝奶茶"
10294,0,肥牛根本不是肥牛，乾巴巴的
6802,0,油豆皮沒送


#### 切 training data 及 testing data

In [4]:
# Store the sentences that will be used for training
def save_sentence(filepath, sent_list):
    """Write every sentence in ``sent_list`` to ``filepath``, one per line.

    Args:
        filepath: Destination path. The file is opened in text mode as UTF-8
            and overwritten if it already exists.
        sent_list: Iterable of strings. A newline is appended to each one.

    Raises:
        TypeError: If an element of ``sent_list`` is not a string.
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even when a write fails part-way.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(sent + '\n' for sent in sent_list)

# Split the corpus into train_data and test_data
def split_data(filename, train_sent_num=3000, test_sent_num=1000,
               train_path='./train_data.txt', test_path='./test_data.txt'):
    """Split a ``label,review`` text file into class-balanced train/test files.

    The first line of ``filename`` is treated as the header and discarded.
    Each following line is classified by its first two characters: ``1,`` is
    a positive review and ``0,`` a negative one. For each class, the first
    ``train_sent_num`` lines go to the training set and the next
    ``test_sent_num`` lines go to the test set; any further lines of that
    class are dropped. Lines are stored unchanged apart from the removed
    newline, so each saved line still starts with its label.

    Lines starting with neither prefix (blank or malformed lines) are skipped
    rather than being counted as negative samples.

    Args:
        filename: Path of the UTF-8 input file.
        train_sent_num: Training lines to keep per class.
        test_sent_num: Test lines to keep per class.
        train_path: Output path for the training lines.
        test_path: Output path for the test lines.
    """
    train_data = []
    test_data = []
    # Number of lines collected so far, keyed by the two-character label prefix.
    train_counts = {'1,': 0, '0,': 0}
    test_counts = {'1,': 0, '0,': 0}

    with open(filename, 'r', encoding='utf-8') as fp:
        fp.readline()  # the first line is the header: label, review

        for line in fp:
            prefix = line[:2]
            if prefix not in train_counts:
                # Neither "1," nor "0,": not a labelled review line.
                continue

            sent = line.replace('\n', '')

            if train_counts[prefix] < train_sent_num:
                train_data.append(sent)
                train_counts[prefix] += 1
            elif test_counts[prefix] < test_sent_num:
                test_data.append(sent)
                test_counts[prefix] += 1
            elif all(train_counts[p] >= train_sent_num
                     and test_counts[p] >= test_sent_num
                     for p in train_counts):
                # Every quota is full, so the rest of the file cannot contribute.
                break

    save_sentence(train_path, train_data)
    save_sentence(test_path, test_data)

In [5]:
# Produce ./train_data.txt and ./test_data.txt from the full CSV.
split_data('waimai_10k_zh_tw.csv')

In [6]:
ls

 Volume in drive C has no label.
 Volume Serial Number is 7299-E1A3

 Directory of c:\Users\Dan\Desktop\aiot_recommand

2023/01/04  上午 01:52    <DIR>          .
2023/01/04  上午 01:52    <DIR>          ..
2023/01/04  下午 05:41           103,778 aiot_sentiment.ipynb
2022/12/30  上午 02:06           286,661 bart_summarization.ipynb
2022/12/31  上午 12:43            74,740 clussum_data2.json
2022/12/29  上午 03:28         1,185,984 comments_cluser.json
2022/12/29  上午 03:28         1,262,081 comments_set.json
2022/12/29  下午 11:05            21,086 crawler.ipynb
2022/12/29  下午 11:08             3,881 kmean_cluster.ipynb
2023/01/03  下午 09:22           181,571 predict.ipynb
2022/12/29  下午 11:01    <DIR>          summarize_model
2023/01/04  下午 05:41           151,752 test_data.txt
2023/01/04  下午 05:41           436,529 train_data.txt
2022/12/29  上午 01:54    <DIR>          trained_model
2022/12/29  上午 01:14           919,380 waimai_10k_zh_tw.csv
2023/01/03  下午 09:20    <DIR>          美食推薦系統
            

#### 載入相關套件

In [8]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [9]:
import torch
from transformers import (AdamW, BertForSequenceClassification, BertTokenizerFast,
                          Trainer, TrainingArguments)

  from .autonotebook import tqdm as notebook_tqdm


#### 讀取資料（整理出資料中的 sentence、label）

In [10]:
def read_waimai(path):
    """Read a ``label,sentence`` text file into parallel lists.

    Each non-empty line is expected to hold a one-character integer label,
    a one-character separator, and then the sentence. The label is taken from
    the first character and the sentence from the third character onwards.

    A line whose first character is not an integer is skipped entirely. The
    label is parsed before anything is appended, so ``texts`` and ``labels``
    always have the same length and stay index-aligned.

    Args:
        path: Path of the UTF-8 file to read.

    Returns:
        A ``(texts, labels)`` tuple: a list of sentences and the list of
        their integer labels, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()

    texts = []
    labels = []
    for LS_pair in data.split("\n"):
        if LS_pair == "":
            continue
        try:
            label = int(LS_pair[:1])
        except ValueError:
            # Unparseable label: drop the whole line, not just the label.
            continue
        texts.append(LS_pair[2:])
        labels.append(label)
    return texts, labels

# Load the sentences and integer labels for the training and test splits.
train_texts, train_labels = read_waimai('./train_data.txt')
test_texts, test_labels = read_waimai('./test_data.txt')

In [13]:
# Sanity check: first training sentence followed by its label.
print(train_texts[0], train_labels[0])

很快，好吃，味道足，量大 1


#### Tokenize（將資料轉換成 token id、type_id 與 attention_mask）
`truncation=True` 和 `padding=True`

確保輸入不超過模型的最大限制以及讓每一筆資料長度都一樣

In [11]:
# Tokenizer for the bert-base-chinese checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

# truncation=True keeps inputs within the model's maximum length;
# padding=True pads the sentences within each call to a common length.
# The two splits are encoded in separate calls.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [12]:
# Inspect the first encoded training sentence: its token ids ...
print(train_encodings[0].ids)
# ... the text obtained by decoding those ids ...
print(tokenizer.decode(train_encodings[0].ids))

# ... and its type ids and attention mask.
print(train_encodings[0].type_ids)
print(train_encodings[0].attention_mask)

[101, 2523, 2571, 8024, 1962, 1391, 8024, 1456, 6887, 6639, 8024, 7030, 1920, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### 定義 Dataset，並轉換成 tensor 格式

In [13]:
class WaimaiDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` must offer ``.items()`` yielding ``(name, values)`` pairs in
    which ``values`` can be indexed by sample position; ``labels`` is a
    sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus the sample's label
        under the key ``'labels'``.
        """
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and labels of each split in a Dataset.
train_dataset = WaimaiDataset(train_encodings, train_labels)
test_dataset = WaimaiDataset(test_encodings, test_labels)

In [17]:
# Sanity check: the first training item as a dict of tensors.
print(train_dataset[0])

{'input_ids': tensor([ 101, 2523, 2571, 8024, 1962, 1391, 8024, 1456, 6887, 6639, 8024, 7030,
        1920,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 

#### Train

In [14]:
# Set the training hyper-parameters
training_args = TrainingArguments(
    output_dir='./trained_model',  # checkpoints are written under this directory
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Start from the bert-base-chinese pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-chinese")

# Hand the model, the arguments and the datasets to the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
%%time
# Fine-tune the model on train_dataset using the arguments configured above.
trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1500
 33%|███▎      | 500/1500 [03:34<06:54,  2.41it/s]Saving model checkpoint to ./trained_model\checkpoint-500
Configuration saved in ./trained_model\checkpoint-500\config.json


{'loss': 0.4226, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.67}


Model weights saved in ./trained_model\checkpoint-500\pytorch_model.bin
 67%|██████▋   | 1000/1500 [07:11<03:34,  2.33it/s]Saving model checkpoint to ./trained_model\checkpoint-1000
Configuration saved in ./trained_model\checkpoint-1000\config.json


{'loss': 0.3357, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.33}


Model weights saved in ./trained_model\checkpoint-1000\pytorch_model.bin
100%|██████████| 1500/1500 [10:46<00:00,  2.29it/s]Saving model checkpoint to ./trained_model\checkpoint-1500
Configuration saved in ./trained_model\checkpoint-1500\config.json


{'loss': 0.2906, 'learning_rate': 0.0, 'epoch': 2.0}


Model weights saved in ./trained_model\checkpoint-1500\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1500/1500 [10:48<00:00,  2.31it/s]

{'train_runtime': 648.2709, 'train_samples_per_second': 18.511, 'train_steps_per_second': 2.314, 'train_loss': 0.3496396484375, 'epoch': 2.0}
CPU times: total: 10min 43s
Wall time: 10min 48s





TrainOutput(global_step=1500, training_loss=0.3496396484375, metrics={'train_runtime': 648.2709, 'train_samples_per_second': 18.511, 'train_steps_per_second': 2.314, 'train_loss': 0.3496396484375, 'epoch': 2.0})

In [20]:
# Save the fine-tuned model
trainer.save_model("./trained_model")

Saving model checkpoint to ./trained_model
Configuration saved in ./trained_model\config.json
Model weights saved in ./trained_model\pytorch_model.bin


#### Inference

In [21]:
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
import torch

# Directory written by trainer.save_model(). Loading from the directory
# (rather than a specific weight file inside it) lets the library find the
# config and weights itself, whatever the weight file is called.
model_dir = "./trained_model"

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
config = BertConfig.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir, config=config)
model.eval()  # inference only: make sure dropout is disabled

sentence = "這間餐廳的東西很好吃。"

# truncation=True keeps over-long inputs within the model's maximum length.
inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

# No gradients are needed for prediction.
with torch.no_grad():
    outputs = model(**inputs)

# First element of the model output: one row of class scores per input sentence.
predicts = outputs[0]
# Index of the highest score for the single input sentence.
predict_label = int(predicts.argmax(dim=-1)[0])

# Label 1 is positive, anything else is negative.
if predict_label == 1:
    print('正面')
else:
    print('負面')

loading file vocab.txt from cache at C:\Users\Dan/.cache\huggingface\hub\models--bert-base-chinese\snapshots\84b432f646e4047ce1b5db001d43a348cd3f6bd0\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\Dan/.cache\huggingface\hub\models--bert-base-chinese\snapshots\84b432f646e4047ce1b5db001d43a348cd3f6bd0\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\Dan/.cache\huggingface\hub\models--bert-base-chinese\snapshots\84b432f646e4047ce1b5db001d43a348cd3f6bd0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,


正面
