## 單純用BERT去預測一個句子的label (label是從aspect polarity來的)，目的是測試純positve跟negative到底準不準；把label轉成onehot的形式，看能不能跑，為之後LSTM、bert一起訓練作準備 ----->失敗

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

### 對處理好的laptop、restaurant的train、test資料作前處理

In [2]:
laptop_train = pd.read_csv('dataset/laptop_train_processed.csv', encoding='utf-8')
restaurant_train = pd.read_csv('dataset/restaurant_train_processed.csv', encoding='utf-8')
laptop_test = pd.read_csv('dataset/laptop_test_processed.csv', encoding='utf-8')
restaurant_test = pd.read_csv('dataset/restaurant_test_processed.csv', encoding='utf-8')

# 把train的資料串在一起且多加一個aspect
train_data = laptop_train.append(restaurant_train)
train_data = train_data.reset_index(drop=True)

# 把test的資料串在一起且多加一個aspect
test_data = laptop_test.append(restaurant_test)
test_data = test_data.reset_index(drop=True)

print('訓練資料數:', len(train_data))
print('筆電測試資料數:', len(laptop_test))
print('餐廳測試資料數:', len(restaurant_test))
print('測試資料數:', len(test_data))
train_data.tail(10)

訓練資料數: 5915
筆電測試資料數: 638
餐廳測試資料數: 1120
測試資料數: 1758


Unnamed: 0,text,aspect,polarity
5905,"From the appetizers we ate, the dim sum and ot...",appetizers,positive
5906,"From the appetizers we ate, the dim sum and ot...",dim sum,positive
5907,"From the appetizers we ate, the dim sum and ot...",foods,positive
5908,"From the appetizers we ate, the dim sum and ot...",food,positive
5909,Each table has a pot of boiling water sunken i...,table,neutral
5910,Each table has a pot of boiling water sunken i...,pot of boiling water,neutral
5911,Each table has a pot of boiling water sunken i...,meats,neutral
5912,Each table has a pot of boiling water sunken i...,vegetables,neutral
5913,Each table has a pot of boiling water sunken i...,rice,neutral
5914,Each table has a pot of boiling water sunken i...,glass noodles,neutral


In [3]:
#把polarity變成數字的label，positive是2，neutral是1，negative是0
train_data.loc[train_data['polarity']=='positive', 'label'] = 2
train_data.loc[train_data['polarity']=='negative', 'label'] = 0
train_data.loc[train_data['polarity']=='neutral', 'label'] = 1
train_data['label'] = train_data['label'].astype(int)

test_data.loc[test_data['polarity']=='positive', 'label'] = 2
test_data.loc[test_data['polarity']=='negative', 'label'] = 0
test_data.loc[test_data['polarity']=='neutral', 'label'] = 1
test_data['label'] = test_data['label'].astype(int)

test_data.head(10)

Unnamed: 0,text,aspect,polarity,label
0,"Boot time is super fast, around anywhere from ...",Boot time,positive,2
1,tech support would not fix the problem unless ...,tech support,negative,0
2,Set up was easy.,Set up,positive,2
3,Did not enjoy the new Windows 8 and touchscree...,Windows 8,negative,0
4,Did not enjoy the new Windows 8 and touchscree...,touchscreen functions,negative,0
5,Other than not being a fan of click pads (indu...,internal speakers,negative,0
6,Other than not being a fan of click pads (indu...,price tag,positive,2
7,Other than not being a fan of click pads (indu...,click pads,negative,0
8,No installation disk (DVD) is included.,installation disk (DVD),neutral,1
9,"It's fast, light, and simple to use.",use,positive,2


### Bert資料前處理

In [4]:
import tensorflow as tf
from transformers import BertTokenizer, BertModel, TFBertForSequenceClassification, TFBertModel

In [5]:
# Load pre-trained model tokenizer, to convert our text into tokens that correspond to BERT’s vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:01<00:00, 188066.56B/s]


#### 找出單句最多token

In [6]:
# 找出最多text add aspect中最多是幾個token，不包含CLS跟SEP
def find_max_token(pd):
    max_token = 0
    for i in range(len(pd)):
        tokens_len = len(tokenizer.tokenize(pd.loc[i, 'text']))
        if tokens_len>max_token:
            max_token = tokens_len
    return max_token

In [7]:
# 找出text add aspect中token最多的是幾個token，不包含CLS跟SEP
train_max_token = find_max_token(train_data)
test_max_token = find_max_token(test_data)
print('訓練資料集token最多是:', train_max_token)
print('測試資料集token最多是:', test_max_token)

訓練資料集token最多是: 89
測試資料集token最多是: 88


### 把資料轉換成token(padding)

#### 把句子轉變成token(padding)的function

In [8]:
# 把維度固定在128維
input_dim = 128
def input_ids_all(text):     
    tokens = tokenizer.tokenize(text) # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens) # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids) # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids) # list 轉 numpy
    if len(input_ids) < input_dim:
        n = input_dim - len(input_ids)
        input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0)) # array右邊append n 個 0  補長度到512
    return input_ids

In [9]:
# 把text轉成token並存進datafram
train_data['input_ids'] = train_data['text'].apply(input_ids_all)
test_data['input_ids'] = test_data['text'].apply(input_ids_all)

In [10]:
test_data.head(3)

Unnamed: 0,text,aspect,polarity,label,input_ids
0,"Boot time is super fast, around anywhere from ...",Boot time,positive,2,"[101, 9573, 2051, 2003, 3565, 3435, 1010, 2105..."
1,tech support would not fix the problem unless ...,tech support,negative,0,"[101, 6627, 2490, 2052, 2025, 8081, 1996, 3291..."
2,Set up was easy.,Set up,positive,2,"[101, 2275, 2039, 2001, 3733, 1012, 102, 0, 0,..."


In [11]:
# 稽查input_ids維度
print(len(train_data['input_ids'][0]))
print(len(test_data['input_ids'][0]))
test_data.loc[2, 'input_ids']

128
128


array([ 101, 2275, 2039, 2001, 3733, 1012,  102,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [12]:
# 把train data的input_ids提出存進list
train_input_ids = list()
for i in range(len(train_data)):
    np_id = train_data.loc[i, 'input_ids']
    train_input_ids.append(np_id)
train_input_ids = np.array(train_input_ids)
print(train_input_ids.shape)
train_input_ids

(5915, 128)


array([[ 101, 1045, 3715, ...,    0,    0,    0],
       [ 101, 1045, 3715, ...,    0,    0,    0],
       [ 101, 1996, 6627, ...,    0,    0,    0],
       ...,
       [ 101, 2169, 2795, ...,    0,    0,    0],
       [ 101, 2169, 2795, ...,    0,    0,    0],
       [ 101, 2169, 2795, ...,    0,    0,    0]])

In [13]:
# 把test data的input_ids提出存進list
test_input_ids = list()
for i in range(len(test_data)):
    np_id = test_data.loc[i, 'input_ids']
    test_input_ids.append(np_id)
test_input_ids = np.array(test_input_ids)
print(test_input_ids.shape)
test_input_ids

(1758, 128)


array([[  101,  9573,  2051, ...,     0,     0,     0],
       [  101,  6627,  2490, ...,     0,     0,     0],
       [  101,  2275,  2039, ...,     0,     0,     0],
       ...,
       [  101, 24519, 10439, ...,     0,     0,     0],
       [  101, 24519, 10439, ...,     0,     0,     0],
       [  101, 24519, 10439, ...,     0,     0,     0]])

In [27]:
# 把train data laebel變成numpy
train_label = train_data['label'].to_numpy()
print(len(train_label))
train_label

5915


array([1, 2, 0, ..., 1, 1, 1])

In [28]:
# 把test data label變成numpy
test_label = test_data['label'].to_numpy()
print(len(test_label))
test_label

1758


array([2, 0, 2, ..., 2, 2, 2])

In [35]:
# 把train label one hot
train_label = pd.get_dummies(train_data['label']).to_numpy() # label轉乘2維矩陣   # keras不吃1維label
print(train_label.shape)
for i in range(10):
    print(train_data.loc[i, 'label'], train_label[i])

(5915, 3)
1 [0 1 0]
2 [0 0 1]
0 [1 0 0]
0 [1 0 0]
1 [0 1 0]
2 [0 0 1]
2 [0 0 1]
2 [0 0 1]
2 [0 0 1]
2 [0 0 1]


In [36]:
# 把test label one hot
test_label = pd.get_dummies(test_data['label']).to_numpy() # label轉乘2維矩陣   # keras不吃1維label
print(test_label.shape)
for i in range(10):
    print(test_data.loc[i, 'label'], test_label[i])

(1758, 3)
2 [0 0 1]
0 [1 0 0]
2 [0 0 1]
0 [1 0 0]
0 [1 0 0]
0 [1 0 0]
2 [0 0 1]
0 [1 0 0]
1 [0 1 0]
2 [0 0 1]


In [29]:
# 檢查polarity跟label有沒有不一樣
print('test_data')
for i in range(15):
    print(test_data.loc[i, 'polarity'], test_data.loc[i, 'label'], test_label[i])

test_data
positive 2 2
negative 0 0
positive 2 2
negative 0 0
negative 0 0
negative 0 0
positive 2 2
negative 0 0
neutral 1 1
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2


### Model

In [30]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, Dropout, Activation, Input, Flatten, InputLayer

In [25]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
# num_labels=3 分3類
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

100%|██████████| 433/433 [00:00<00:00, 559671.38B/s]
100%|██████████| 536063208/536063208 [03:25<00:00, 2607385.09B/s]


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  2307      
Total params: 109,484,547
Trainable params: 109,484,547
Non-trainable params: 0
_________________________________________________________________


In [43]:
# TFBertModel
input_layer = Input(shape = (128,), dtype='int64')
bert = TFBertModel.from_pretrained('bert-base-uncased')(input_layer)
bert = bert[0]
dropout = Dropout(0.1)(bert)
flat = Flatten()(dropout)
classifier = Dense(units=3)(flat)
model = Model(inputs=input_layer, outputs=classifier)
model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
tf_bert_model_4 (TFBertModel ((None, 128, 768), (None, 109482240 
_________________________________________________________________
dropout_227 (Dropout)        (None, 128, 768)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 98304)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 294915    
Total params: 109,777,155
Trainable params: 109,777,155
Non-trainable params: 0
_________________________________________________________________


In [44]:
model_fit = model.fit(train_input_ids, train_label, 
                      batch_size=4, epochs=4, 
                      validation_data=(test_input_ids, test_label))

Train on 5915 samples, validate on 1758 samples
Epoch 1/4
   4/5915 [..............................] - ETA: 1:07:14

ValueError: Can not squeeze dim[1], expected a dimension of 1, got 3 for 'metrics/accuracy/Squeeze' (op: 'Squeeze') with input shapes: [?,3].

### 看confusion matrix

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# test data confusion
predictions_test= model.predict(test_input_ids) # 輸出的是n*5的編碼值array
predictions_test = np.argmax(predictions_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(test_label, predictions_test))
print(confusion_matrix(test_label, predictions_test))
print(classification_report(test_label, predictions_test))

In [None]:
# laptop_test confusion
laptop_test_input_ids = test_input_ids[:638]
laptop_test_label = test_label[:638]
predictions_lap_test = model.predict(laptop_test_input_ids)
predictions_lap_test = np.argmax(predictions_lap_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(laptop_test_label, predictions_lap_test))
print(confusion_matrix(laptop_test_label, predictions_lap_test))
print(classification_report(laptop_test_label, predictions_lap_test))

In [None]:
# laptop_test confusion
restaurant_test_input_ids = test_input_ids[638:]
restaurant_test_label = test_label[638:]
predictions_res_test = model.predict(restaurant_test_input_ids)
predictions_res_test = np.argmax(predictions_res_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(restaurant_test_label, predictions_res_test))
print(confusion_matrix(restaurant_test_label, predictions_res_test))
print(classification_report(restaurant_test_label, predictions_res_test))