## 先跑LSTM抓取feature(把hidden layer抽出來)，再放到bert裡面去做分類，先restaurant再laptop

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

### 對處理好的laptop、restaurant的train、test資料作前處理

In [5]:
#把dataframe裡的text切成text左邊跟右邊並做一些處理的function
def split_text(df):
    df['left_text'] = 'N/A'
    df['right_text'] = 'N/A'
    
    for i in tqdm(range(len(df))):
        text = df.loc[i, 'text']
        aspect = df.loc[i, 'aspect']
        text_split = text.split(aspect) # 根據aspect切割text左右邊
        
        left_text = text_split[0]+aspect
        right_text = aspect+text_split[1]
        left_text = left_text.lower() # 把字串變成小寫
        right_text = right_text.lower()
        left_text = re.sub('-', ' ', left_text)
        right_text = re.sub('-', ' ', right_text)
        left_text = re.sub('[.,!"()#%&/:?~]', '', left_text) # 把字串中的一些符號刪除
        right_text = re.sub('[.,!"()#%&/:?~]', '', right_text)
        
        df.loc[i,'left_text'] = left_text
        df.loc[i,'right_text'] = right_text
        df.loc[i, 'left_right_text'] = left_text +' '+ right_text # 用來文字encoding
        
    return df

In [6]:
laptop_train = pd.read_csv('dataset/laptop_train_processed.csv', encoding='utf-8')
restaurant_train = pd.read_csv('dataset/restaurant_train_processed.csv', encoding='utf-8')
laptop_test = pd.read_csv('dataset/laptop_test_processed.csv', encoding='utf-8')
restaurant_test = pd.read_csv('dataset/restaurant_test_processed.csv', encoding='utf-8')

# 把train的資料串在一起
train_data = restaurant_train.append(laptop_train)
train_data = train_data.reset_index(drop=True)

#把test的資料串在一起
test_data = restaurant_test.append(laptop_test)
test_data = test_data.reset_index(drop=True)

#把train、test資料串在一起
data = train_data.append(test_data)
data = data.reset_index(drop=True)

# data切割text
data = split_text(data)

print('訓練資料集:', len(train_data))
print('測試資料集:', len(test_data))
print('所有資料集:', len(data))
data.head(10)

100%|██████████| 7673/7673 [00:02<00:00, 2889.61it/s]

訓練資料集: 5915
測試資料集: 1758
所有資料集: 7673





Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text
0,But the staff was so horrible to us.,staff,negative,but the staff,staff was so horrible to us,but the staff staff was so horrible to us
1,"To be completely fair, the only redeeming fact...",food,positive,to be completely fair the only redeeming facto...,food which was above average but couldn't make...,to be completely fair the only redeeming facto...
2,"The food is uniformly exceptional, with a very...",food,positive,the food,food is uniformly exceptional with a very capa...,the food food is uniformly exceptional with a ...
3,"The food is uniformly exceptional, with a very...",kitchen,positive,the food is uniformly exceptional with a very ...,kitchen which will proudly whip up whatever yo...,the food is uniformly exceptional with a very ...
4,"The food is uniformly exceptional, with a very...",menu,neutral,the food is uniformly exceptional with a very ...,menu or not,the food is uniformly exceptional with a very ...
5,"Not only was the food outstanding, but the lit...",food,positive,not only was the food,food outstanding but the little 'perks' were g...,not only was the food food outstanding but the...
6,"Not only was the food outstanding, but the lit...",perks,positive,not only was the food outstanding but the litt...,perks' were great,not only was the food outstanding but the litt...
7,Our agreed favorite is the orrechiete with sau...,orrechiete with sausage and chicken,positive,our agreed favorite is the orrechiete with sau...,orrechiete with sausage and chicken usually th...,our agreed favorite is the orrechiete with sau...
8,Our agreed favorite is the orrechiete with sau...,waiters,positive,our agreed favorite is the orrechiete with sau...,waiters are kind enough to split the dish in h...,our agreed favorite is the orrechiete with sau...
9,Our agreed favorite is the orrechiete with sau...,meats,neutral,our agreed favorite is the orrechiete with sau...,meats,our agreed favorite is the orrechiete with sau...


In [7]:
# print一個出來看看
n = 3
print(data.loc[n, 'text'])
print()
print(data.loc[n, 'left_text'])
print()
print(data.loc[n, 'right_text'])
print()
print(data.loc[n, 'left_right_text'])

The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.

the food is uniformly exceptional with a very capable kitchen

kitchen which will proudly whip up whatever you feel like eating whether it's on the menu or not

the food is uniformly exceptional with a very capable kitchen kitchen which will proudly whip up whatever you feel like eating whether it's on the menu or not


In [8]:
# 把文字Label變成數字label
data.loc[data['polarity'] == 'positive', 'label'] = 2
data.loc[data['polarity'] == 'neutral', 'label'] = 1
data.loc[data['polarity'] == 'negative', 'label'] = 0
data['label'] = data['label'].astype(int)

data.head(10)

Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text,label
0,But the staff was so horrible to us.,staff,negative,but the staff,staff was so horrible to us,but the staff staff was so horrible to us,0
1,"To be completely fair, the only redeeming fact...",food,positive,to be completely fair the only redeeming facto...,food which was above average but couldn't make...,to be completely fair the only redeeming facto...,2
2,"The food is uniformly exceptional, with a very...",food,positive,the food,food is uniformly exceptional with a very capa...,the food food is uniformly exceptional with a ...,2
3,"The food is uniformly exceptional, with a very...",kitchen,positive,the food is uniformly exceptional with a very ...,kitchen which will proudly whip up whatever yo...,the food is uniformly exceptional with a very ...,2
4,"The food is uniformly exceptional, with a very...",menu,neutral,the food is uniformly exceptional with a very ...,menu or not,the food is uniformly exceptional with a very ...,1
5,"Not only was the food outstanding, but the lit...",food,positive,not only was the food,food outstanding but the little 'perks' were g...,not only was the food food outstanding but the...,2
6,"Not only was the food outstanding, but the lit...",perks,positive,not only was the food outstanding but the litt...,perks' were great,not only was the food outstanding but the litt...,2
7,Our agreed favorite is the orrechiete with sau...,orrechiete with sausage and chicken,positive,our agreed favorite is the orrechiete with sau...,orrechiete with sausage and chicken usually th...,our agreed favorite is the orrechiete with sau...,2
8,Our agreed favorite is the orrechiete with sau...,waiters,positive,our agreed favorite is the orrechiete with sau...,waiters are kind enough to split the dish in h...,our agreed favorite is the orrechiete with sau...,2
9,Our agreed favorite is the orrechiete with sau...,meats,neutral,our agreed favorite is the orrechiete with sau...,meats,our agreed favorite is the orrechiete with sau...,1


In [9]:
#找出left_text跟right_text裡面最多是多少字
max_count = 0
for i in range(len(data)):
    left_text_word_count = len(data.loc[i,'left_text'].split())
    right_text_word_count = len(data.loc[i,'right_text'].split())
    big_count = max(left_text_word_count, right_text_word_count)
    if big_count>max_count:
        max_count = big_count
print('left_text與right_text最多的字數:', max_count)

left_text與right_text最多的字數: 72


### 對文字做encoding

In [10]:
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [11]:
max_words = 7000 # 最大的字數
max_seq_length = 80 # 句子最長長度
embedding_dim = 300 # 每個字維度

In [12]:
# 把字變成token
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(data['left_right_text'].to_numpy())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# word_index就是根據left_right_text內容彙整出來的切字跟代表那個字的token number (每個字的dict)

Found 6557 unique tokens.


In [13]:
# 檢查word_index(dictionary)裡面的東西，前面是字，後面是token
for x in list(word_index)[0:10]:
    print (x, ':', word_index[x])

the : 1
and : 2
a : 3
to : 4
is : 5
i : 6
of : 7
for : 8
food : 9
it : 10


In [14]:
# 檢查其中一項字串的token
n = 9 # index number
left_text = data['left_text'].to_numpy() # 轉成向量
right_text = data['right_text'].to_numpy()
left_text_seq = tokenizer.texts_to_sequences(left_text)
right_text_seq = tokenizer.texts_to_sequences(right_text)
print(data.loc[n, 'left_text'])
print(data.loc[n, 'right_text'])
print(left_text_seq[n])
print(right_text_seq[n])
print(type(right_text_seq))
# 把右邊的字串token倒過來，因為要從後面讀到前面
print('right text 倒過來')
for i in range(len(right_text_seq)):
    right_text_seq[i] = right_text_seq[i][::-1]
print(left_text_seq[n])
print(right_text_seq[n])
print(type(right_text_seq))

our agreed favorite is the orrechiete with sausage and chicken usually the waiters are kind enough to split the dish in half so you get to sample both meats
meats
[79, 1973, 545, 5, 1, 2531, 12, 1779, 2, 89, 448, 1, 367, 19, 492, 253, 4, 1278, 1, 151, 13, 420, 32, 16, 58, 4, 1974, 222, 1048]
[1048]
<class 'list'>
right text 倒過來
[79, 1973, 545, 5, 1, 2531, 12, 1779, 2, 89, 448, 1, 367, 19, 492, 253, 4, 1278, 1, 151, 13, 420, 32, 16, 58, 4, 1974, 222, 1048]
[1048]
<class 'list'>


In [15]:
# token sequence 後面補0的方法
def text_seq_padding(text_seq):
    if len(text_seq) < max_seq_length:
        n = max_seq_length - len(text_seq)
        text_seq = np.pad(text_seq, (0, n), mode ='constant', constant_values=(0)) # array右邊append n 個 0
    return text_seq
# 把每個left_text_seq，right_text_seq padding到同樣的長度 (後面補0)
left_text_seq = [text_seq_padding(i) for i in left_text_seq] # 必須要 [ ] 輸出是list
left_text_seq = np.array(left_text_seq)

right_text_seq = [text_seq_padding(i) for i in right_text_seq]
right_text_seq = np.array(right_text_seq)

# n = 0 # index number
print(left_text_seq[n])
print(right_text_seq[n])

[  79 1973  545    5    1 2531   12 1779    2   89  448    1  367   19
  492  253    4 1278    1  151   13  420   32   16   58    4 1974  222
 1048    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]
[1048    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


### 使用預先處理的詞向量 (crawl 300 dim)
#### https://fasttext.cc/docs/en/english-vectors.html

In [16]:
# 把embedding_matrix load 近來
embedding_matrix = np.load('dataset/embedding_matrix.npy')
print(type(embedding_matrix))
print(embedding_matrix.shape)
print(embedding_matrix)

<class 'numpy.ndarray'>
(6558, 300)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.0231      0.017       0.0157     ...  0.0744     -0.1118
   0.0963    ]
 [-0.1081      0.0191      0.0354     ...  0.1104      0.0475
  -0.0599    ]
 ...
 [ 0.16580001 -0.0169     -0.4138     ...  0.0933     -0.1168
  -0.1777    ]
 [-0.1179      0.0726     -0.005      ...  0.2079      0.0322
  -0.26879999]
 [ 0.24439999  0.1206      0.1123     ... -0.147      -0.0186
  -0.3204    ]]


### 確認資料、並切割成train、test

In [18]:
# 稽查dataframe、token sequence裡面laptop_test、restaurant_test資料是否一致
# laptop_test第一筆在5915；restaurant_test第一筆在6553
print(data.loc[5915, 'left_text'])
print(data.loc[5915, 'right_text'])
print(left_text_seq[5915])
print(right_text_seq[5915])
print()
print(data.loc[7035, 'left_text'])
print(data.loc[7035, 'right_text'])
print(left_text_seq[7035])
print(right_text_seq[7035])

the bread
bread is top notch as well
[  1 305   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[  71   30 1074  358    5  305    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]

boot time
boot time is super fast around anywhere from 35 seconds to 1 minute
[506  98   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0 

In [19]:
# 把label轉成numpy
Y = data['label'].to_numpy()
print('Shape of Y:', Y.shape)
for i in range(10):
    print(data.loc[i, 'label'], Y[i])
#[1 0 0] = negative
#[0 1 0] = neutral
#[0 0 1] = positve

Shape of Y: (7673,)
0 0
2 2
2 2
2 2
1 1
2 2
2 2
2 2
2 2
1 1


In [20]:
#把資料切割成train、test
X_left_train = left_text_seq[:5915]
X_right_train = right_text_seq[:5915]
Y_train = Y[:5915]
X_left_test = left_text_seq[5915:]
X_right_test = right_text_seq[5915:]
Y_test = Y[5915:]
print(len(X_left_train), len(X_right_train), len(Y_train))
print(len(X_left_test), len(X_right_test), len(Y_test))

5915 5915 5915
1758 1758 1758


In [25]:
# 檢查polarity跟label有沒有不一樣
print('restaurant_test', '   ','laptop_test')
for i in range(20):
    print(laptop_test.loc[i, 'polarity'], data.loc[5915+i, 'label'], Y_test[i], '      ', restaurant_test.loc[i, 'polarity'], data.loc[7035+i, 'label'], Y_test[1120+i])

restaurant_test     laptop_test
positive 2 2        positive 2 2
negative 2 2        positive 0 0
positive 2 2        positive 2 2
negative 2 2        positive 0 0
negative 2 2        positive 0 0
negative 2 2        positive 0 0
positive 2 2        positive 2 2
negative 2 2        positive 0 0
neutral 2 2        positive 1 1
positive 2 2        positive 2 2
positive 1 1        neutral 2 2
positive 2 2        positive 2 2
positive 2 2        positive 2 2
positive 2 2        positive 2 2
positive 0 0        negative 2 2
positive 2 2        positive 2 2
negative 1 1        neutral 0 0
negative 1 1        neutral 0 0
positive 2 2        positive 2 2
positive 2 2        positive 2 2


## Model

In [26]:
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Flatten, InputLayer, Bidirectional, concatenate, add, average, Reshape
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model

### 把兩邊input concate起來，有加上dropout的模型

In [39]:
# first input model 1
input_layer_1 = Input(shape = (max_seq_length,), dtype='int64')
embedding_1 = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], mask_zero=True, trainable=True)(input_layer_1)
lstm_hidden_1 = LSTM(512, return_sequences=False, dropout=0.3)(embedding_1) 
# lstm_hidden_1 = Bidirectional(LSTM(512, return_sequences=False, dropout=0.4))(embedding_1) 

#second input model 2
input_layer_2 = Input(shape = (max_seq_length,), dtype='int64')
embedding_2 = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], mask_zero=True, trainable=True)(input_layer_2)
lstm_hidden_2 = LSTM(512, return_sequences=False, dropout=0.3)(embedding_2)
# lstm_hidden_2 = Bidirectional(LSTM(512, return_sequences=False, dropout=0.4))(embedding_2)

#merge input model
averaged = concatenate([lstm_hidden_1, lstm_hidden_2])
hidden_1 = Dense(128, activation='relu')(averaged)
dropout_1 = Dropout(0.3)(hidden_1)
hidden_2 = Dense(64, activation='relu')(dropout_1)
dropout_2 = Dropout(0.3)(hidden_2)
output = Dense(3, activation='softmax')(dropout_2)
model1 = Model(inputs=[input_layer_1, input_layer_2], outputs=output)
print(model1.summary())
adam = Adam(lr=1e-2)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model1.compile(loss=loss, optimizer=optimizer, metrics=[metric])

early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=1, restore_best_weights=True)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 80, 300)      1967400     input_7[0][0]                    
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 80, 300)      1967400     input_8[0][0]                    
____________________________________________________________________________________________

In [40]:
model1_fit = model1.fit([X_left_train, X_right_train],Y_train, batch_size=64,epochs=30,
                      validation_data=([X_left_test, X_right_test],Y_test), callbacks=[early_stopping])

Train on 5915 samples, validate on 1758 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 00020: early stopping


### 看confusion matrix

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [42]:
# 取所有label中的test label
Y_label = data['label'].to_numpy()[5915:]

# test data confusion matrix
predictions= model1.predict([X_left_test, X_right_test]) # 輸出的是n*5的編碼值array
print('三元分類還沒argmax output')
print(predictions)
print()
predictions = np.argmax(predictions, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(Y_label, predictions))
print(confusion_matrix(Y_label, predictions))
print(classification_report(Y_label, predictions))


三元分類還沒argmax output
[[5.3300154e-13 2.7310196e-11 1.0000000e+00]
 [9.9999702e-01 2.6899255e-07 2.7781507e-06]
 [3.6737475e-15 9.1933655e-13 1.0000000e+00]
 ...
 [1.0000000e+00 2.8581860e-13 2.1993028e-13]
 [3.3382308e-03 9.9622923e-01 4.3247713e-04]
 [3.3532366e-02 9.9948526e-04 9.6546817e-01]]

0.7047781569965871
[[185  32 107]
 [ 94 118 153]
 [ 66  67 936]]
              precision    recall  f1-score   support

           0       0.54      0.57      0.55       324
           1       0.54      0.32      0.41       365
           2       0.78      0.88      0.83      1069

    accuracy                           0.70      1758
   macro avg       0.62      0.59      0.60      1758
weighted avg       0.69      0.70      0.69      1758



In [43]:
# 取所有label中的laptop test lable
laptop_label = data['label'].to_numpy()[5915:7035]

# laptop test data confusion matrix
predictions= model1.predict([X_left_test[:1120], X_right_test[:1120]]) # 輸出的是n*5的編碼值array
predictions = np.argmax(predictions, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(laptop_label, predictions))
print(confusion_matrix(laptop_label, predictions))
print(classification_report(laptop_label, predictions))

0.7383928571428572
[[107  17  72]
 [ 39  61  96]
 [ 35  34 659]]
              precision    recall  f1-score   support

           0       0.59      0.55      0.57       196
           1       0.54      0.31      0.40       196
           2       0.80      0.91      0.85       728

    accuracy                           0.74      1120
   macro avg       0.64      0.59      0.60      1120
weighted avg       0.72      0.74      0.72      1120



In [44]:
# 取所有label中的restaurant test lable
restaurant_label = data['label'].to_numpy()[7035:]

# restaurant test data confusion matrix
predictions= model1.predict([X_left_test[1120:], X_right_test[1120:]]) # 輸出的是n*5的編碼值array
predictions = np.argmax(predictions, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(restaurant_label, predictions))
print(confusion_matrix(restaurant_label, predictions))
print(classification_report(restaurant_label, predictions))

0.64576802507837
[[ 78  15  35]
 [ 55  57  57]
 [ 31  33 277]]
              precision    recall  f1-score   support

           0       0.48      0.61      0.53       128
           1       0.54      0.34      0.42       169
           2       0.75      0.81      0.78       341

    accuracy                           0.65       638
   macro avg       0.59      0.59      0.58       638
weighted avg       0.64      0.65      0.63       638



### get train、test LSTM 64 dimension output

In [45]:
layer_output=model1.get_layer('dense_9').output
intermediate_model = Model(inputs=[input_layer_1, input_layer_2],outputs=layer_output)
intermediate_prediction=intermediate_model.predict([left_text_seq, right_text_seq])

In [46]:
print(type(intermediate_prediction))
print(intermediate_prediction.shape)
print(intermediate_prediction)

<class 'numpy.ndarray'>
(7673, 128)
[[0.         0.         0.         ... 0.         1.409985   2.460079  ]
 [0.42525083 0.17294298 1.302797   ... 0.         0.         0.        ]
 [0.4482868  0.         1.1084592  ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         1.712281   2.9153383 ]
 [0.6232571  0.8368558  0.         ... 0.22482945 0.         0.2144836 ]
 [0.         0.         0.0696213  ... 0.         0.6006517  0.07825752]]


### 把LSTM dimension 放進dataframe

In [47]:
# 把LSTM predict出來的東西放進dataframe
data['lstm_predict'] = 'N/A'
for i in range(len(data)):
    data['lstm_predict'][i] = intermediate_prediction[i]
data.tail(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text,label,lstm_predict
7663,also...- excellent operating system- size and ...,weight,positive,also excellent operating system size and weight,weight for optimal mobility excellent durabil...,also excellent operating system size and wei...,2,"[1.442924, 0.118619904, 1.0683073, 1.8290948, ..."
7664,also...- excellent operating system- size and ...,mobility,positive,also excellent operating system size and wei...,mobility excellent durability of the battery ...,also excellent operating system size and wei...,2,"[0.64442265, 0.4648288, 0.8578217, 1.8430195, ..."
7665,also...- excellent operating system- size and ...,durability of the battery,positive,also excellent operating system size and wei...,durability of the battery the functions provi...,also excellent operating system size and wei...,2,"[1.1156328, 0.0, 1.1949103, 1.7904588, 0.56774..."
7666,also...- excellent operating system- size and ...,functions provided by the trackpad,positive,also excellent operating system size and wei...,functions provided by the trackpad is unmatche...,also excellent operating system size and wei...,2,"[1.6156456, 0.0, 1.5932965, 2.4519243, 0.50335..."
7667,This hardware seems to be better than the iMac...,hardware,positive,this hardware,hardware seems to be better than the imac in t...,this hardware hardware seems to be better than...,2,"[0.26618752, 0.0, 0.0, 0.0, 0.0, 0.0, 0.803694..."
7668,I've had it for about 2 months now and found n...,software,neutral,i've had it for about 2 months now and found n...,software or updates,i've had it for about 2 months now and found n...,1,"[0.0, 0.515463, 0.0, 0.0, 0.0, 0.0, 0.7722197,..."
7669,I've had it for about 2 months now and found n...,updates,neutral,i've had it for about 2 months now and found n...,updates,i've had it for about 2 months now and found n...,1,"[0.0, 0.11405362, 0.0, 0.0, 0.0, 0.0, 0.975277..."
7670,the latest version does not have a disc drive.,disc drive,neutral,the latest version does not have a disc drive,disc drive,the latest version does not have a disc drive ...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.2103522, 0.0,..."
7671,Screen - although some people might complain a...,Screen,positive,screen,screen although some people might complain a...,screen screen although some people might com...,2,"[0.6232571, 0.8368558, 0.0, 0.0, 0.29151317, 0..."
7672,Screen - although some people might complain a...,res,positive,screen although some people might complain a...,res which i think is ridiculous,screen although some people might complain a...,2,"[0.0, 0.0, 0.0696213, 0.177733, 0.0, 0.6158762..."


In [48]:
# 稽查
n = 504
print(data.loc[n, 'lstm_predict'].shape)
print(len(data.loc[n, 'lstm_predict']))
print(data.loc[n, 'lstm_predict'])

(128,)
128
[0.44748157 0.         0.8662199  1.6753371  0.         1.5549994
 0.         1.3884728  0.         0.         0.         0.8880838
 0.         0.9054352  0.         0.10345642 0.         0.18495762
 0.24724263 0.16130812 0.         2.325821   1.718151   1.7964698
 0.         0.2915889  0.         1.7786036  0.525636   0.
 0.8316277  1.1500238  0.0913147  0.         0.         0.
 0.         0.         0.         1.3722969  0.81445724 0.
 0.         0.         0.         0.6655453  1.3237424  0.
 0.         1.766793   0.00810153 0.         0.         0.
 0.0910603  0.         0.         0.         0.         1.6202874
 0.         0.         0.         1.0358601  0.         0.2098353
 0.         0.         0.         0.         0.07523081 0.
 0.         0.27667302 0.         0.         1.3327559  1.1369873
 0.         0.         1.4422199  0.         0.         0.9922794
 1.2358521  0.         0.         1.7379907  0.6183897  0.46170944
 1.5591501  0.         0.         0.   

### Bert前處理

In [49]:
from transformers import BertTokenizer, BertModel, TFBertForSequenceClassification, TFBertModel

In [50]:
# Load pre-trained model tokenizer, to convert our text into tokens that correspond to BERT’s vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### 找出單句最多token

In [51]:
# 找出最多text add aspect中最多是幾個token，不包含CLS跟SEP
def find_max_token(pd):
    max_token = 0
    index = 0
    for i in range(len(pd)):
        text = pd.loc[i, 'text']
        aspect = pd.loc[i, 'aspect']
        text_aspect = text + ' ' +aspect
        tokens_len = len(tokenizer.tokenize(text_aspect))
        if tokens_len>max_token:
            max_token = tokens_len
            index = i
    return [max_token, index]

In [52]:
# 找出text add aspect中token最多的是幾個token，不包含CLS跟SEP
max_token = find_max_token(data)
print('資料集token最多與index是:', max_token)

資料集token最多與index是: [99, 7419]


### 正式把資料轉換成token(padding)

#### 把句子轉變成token(CLS+text+SEP+asepct)+(padding)的function

In [53]:
# 把維度固定在128維
input_dim = 128
def input_ids_all(pd):
    pd['input_ids'] = 'N/A'
    for i in range(len(pd)):
        text = pd.loc[i, 'text']
        aspect = pd.loc[i, 'aspect']
        text_tokens = tokenizer.tokenize(text) # 把text轉成token
        aspect_tokens = tokenizer.tokenize(aspect) # 把aspect轉成token
        
        text_input_ids = tokenizer.convert_tokens_to_ids(text_tokens) # 把text token轉成text token id
        aspect_input_ids = tokenizer.convert_tokens_to_ids(aspect_tokens) # 把aspect token轉成aspect token id
        
        text_input_ids_cls = tokenizer.build_inputs_with_special_tokens(text_input_ids) # aspect token id加上CLS、SEP token id
        input_ids = text_input_ids_cls + aspect_input_ids # 把aspect token id接在text token id 後面 (CLS+text+SEP+aspect)
        input_ids = np.array(input_ids)
        
        if len(input_ids) < input_dim:
            n = input_dim - len(input_ids)
            input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0)) # array右邊append n 個 0  補長度到512
        
        pd['input_ids'][i] = input_ids
    return pd

In [54]:
# 將text轉成token，後面加上aspect token存進dataframe
data = input_ids_all(data)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text,label,lstm_predict,input_ids
0,But the staff was so horrible to us.,staff,negative,but the staff,staff was so horrible to us,but the staff staff was so horrible to us,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.3246815, 0.0,...","[101, 2021, 1996, 3095, 2001, 2061, 9202, 2000..."
1,"To be completely fair, the only redeeming fact...",food,positive,to be completely fair the only redeeming facto...,food which was above average but couldn't make...,to be completely fair the only redeeming facto...,2,"[0.42525083, 0.17294298, 1.302797, 2.6707382, ...","[101, 2000, 2022, 3294, 4189, 1010, 1996, 2069..."
2,"The food is uniformly exceptional, with a very...",food,positive,the food,food is uniformly exceptional with a very capa...,the food food is uniformly exceptional with a ...,2,"[0.4482868, 0.0, 1.1084592, 2.1217976, 0.08224...","[101, 1996, 2833, 2003, 27423, 11813, 1010, 20..."
3,"The food is uniformly exceptional, with a very...",kitchen,positive,the food is uniformly exceptional with a very ...,kitchen which will proudly whip up whatever yo...,the food is uniformly exceptional with a very ...,2,"[0.654872, 0.0, 1.642791, 2.8826728, 0.0226560...","[101, 1996, 2833, 2003, 27423, 11813, 1010, 20..."
4,"The food is uniformly exceptional, with a very...",menu,neutral,the food is uniformly exceptional with a very ...,menu or not,the food is uniformly exceptional with a very ...,1,"[1.0776782, 1.8292367, 0.0, 0.0, 1.2995703, 0....","[101, 1996, 2833, 2003, 27423, 11813, 1010, 20..."


In [55]:
# 稽查
n = 6299
print(data.loc[n, 'text'])
print(data.loc[n, 'aspect'])
print(data.loc[n, 'input_ids'])

The skillfully chosen Portuguese cheese cart paired with quality port provides the perfect Iberian ending.
port
[  101  1996  8066  7699  4217  5077  8808 11122 12739  2007  3737  3417
  3640  1996  3819 21988  4566  1012   102  3417     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [56]:
# 把data的input_ids提出存進list
input_ids = list()
for i in range(len(data)):
    np_id = data.loc[i, 'input_ids']
    input_ids.append(np_id)
input_ids = np.array(input_ids)
print(input_ids.shape)
input_ids

(7673, 128)


array([[ 101, 2021, 1996, ...,    0,    0,    0],
       [ 101, 2000, 2022, ...,    0,    0,    0],
       [ 101, 1996, 2833, ...,    0,    0,    0],
       ...,
       [ 101, 1996, 6745, ...,    0,    0,    0],
       [ 101, 3898, 1011, ...,    0,    0,    0],
       [ 101, 3898, 1011, ...,    0,    0,    0]])

In [57]:
# 把data的lstm_predcit提出存進list
lstm_predict = list()
for i in range(len(data)):
    np_lstm = data.loc[i, 'lstm_predict']
    lstm_predict.append(np_lstm)
lstm_predict = np.array(lstm_predict)
print(type(lstm_predict))
print(lstm_predict.shape)
print(lstm_predict)

<class 'numpy.ndarray'>
(7673, 128)
[[0.         0.         0.         ... 0.         1.409985   2.460079  ]
 [0.42525083 0.17294298 1.302797   ... 0.         0.         0.        ]
 [0.4482868  0.         1.1084592  ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         1.712281   2.9153383 ]
 [0.6232571  0.8368558  0.         ... 0.22482945 0.         0.2144836 ]
 [0.         0.         0.0696213  ... 0.         0.6006517  0.07825752]]


In [58]:
# 把data label變成numpy
label = data['label'].to_numpy()
print(len(label))
label

7673


array([0, 2, 2, ..., 1, 2, 2])

### 切train、test資料

In [59]:
# X
train_input_ids = input_ids[:5915]
test_input_ids = input_ids[5915:]
print(train_input_ids.shape)
print(train_input_ids)
print()
print(test_input_ids.shape)
print(test_input_ids)

(5915, 128)
[[ 101 2021 1996 ...    0    0    0]
 [ 101 2000 2022 ...    0    0    0]
 [ 101 1996 2833 ...    0    0    0]
 ...
 [ 101 2057 2036 ...    0    0    0]
 [ 101 2129 2000 ...    0    0    0]
 [ 101 1045 2052 ...    0    0    0]]

(1758, 128)
[[ 101 1996 7852 ...    0    0    0]
 [ 101 1045 2031 ...    0    0    0]
 [ 101 2833 2003 ...    0    0    0]
 ...
 [ 101 1996 6745 ...    0    0    0]
 [ 101 3898 1011 ...    0    0    0]
 [ 101 3898 1011 ...    0    0    0]]


In [60]:
# lstm predict
train_lstm_predict = lstm_predict[:5915]
test_lstm_predict = lstm_predict[5915:]
print(train_lstm_predict.shape)
print(test_lstm_predict.shape)

(5915, 128)
(1758, 128)


In [61]:
# Y
train_label = label[:5915]
test_label = label[5915:]
print(train_label.shape)
print(train_label)
print()
print(test_label.shape)
print(test_label)

(5915,)
[0 2 2 ... 1 2 1]

(1758,)
[2 2 2 ... 1 2 2]


In [62]:
# 檢查polarity跟label有沒有不一樣
print('test_data')
for i in range(15):
    print(test_data.loc[i, 'polarity'], data.loc[5915+i, 'label'], test_label[i])

test_data
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
neutral 1 1
positive 2 2
positive 2 2
positive 2 2
negative 0 0


### Model

In [63]:
input_layer= Input(shape = (128,), dtype='int64')
lstm_input_layer = Input(shape = (128,), dtype='float32')
print(type(input_layer))
print(type(lstm_input_layer))
bert = TFBertModel.from_pretrained('bert-base-uncased')(input_layer)
bert = bert[0]
dropout = Dropout(0.1)(bert)
flat = Flatten()(dropout)
dense_1 = Dense(units=512)(flat)
print(type(dense_1))

merge = concatenate([dense_1, lstm_input_layer])
classifier = Dense(units=3)(merge) # 分3類
model = Model(inputs=[input_layer, lstm_input_layer], outputs=classifier)
model.summary()


optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 109482240   input_9[0][0]                    
__________________________________________________________________________________________________
dropout_45 (Dropout)            (None, 128, 768)     0           tf_bert_model[0][0]              
__________________________________________________________________________________________________
flatten (Flatten)               (None, 98304

In [64]:
# # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
# # num_labels=3 分3類
# model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# model.summary()

# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [65]:
model_fit = model.fit([train_input_ids,train_lstm_predict], train_label, 
                      batch_size=4, epochs=5, 
                      validation_data=([test_input_ids, test_lstm_predict], test_label), callbacks=[early_stopping])

Train on 5915 samples, validate on 1758 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [66]:
# test data confusion
predictions_test= model.predict([test_input_ids,test_lstm_predict]) # 輸出的是n*5的編碼值array
predictions_test = np.argmax(predictions_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(test_label, predictions_test))
print(confusion_matrix(test_label, predictions_test))
print(classification_report(test_label, predictions_test))

0.8054607508532423
[[208  81  35]
 [ 47 257  61]
 [ 27  91 951]]
              precision    recall  f1-score   support

           0       0.74      0.64      0.69       324
           1       0.60      0.70      0.65       365
           2       0.91      0.89      0.90      1069

    accuracy                           0.81      1758
   macro avg       0.75      0.75      0.74      1758
weighted avg       0.81      0.81      0.81      1758



In [67]:
# restaurant_test confusion
laptop_test_input_ids = test_input_ids[:1120]
laptop_test_lstm_predict = test_lstm_predict[:1120]
laptop_test_label = test_label[:1120]
predictions_lap_test = model.predict([laptop_test_input_ids, laptop_test_lstm_predict])
predictions_lap_test = np.argmax(predictions_lap_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(laptop_test_label, predictions_lap_test))
print(confusion_matrix(laptop_test_label, predictions_lap_test))
print(classification_report(laptop_test_label, predictions_lap_test))

0.8196428571428571
[[121  51  24]
 [ 19 137  40]
 [ 14  54 660]]
              precision    recall  f1-score   support

           0       0.79      0.62      0.69       196
           1       0.57      0.70      0.63       196
           2       0.91      0.91      0.91       728

    accuracy                           0.82      1120
   macro avg       0.75      0.74      0.74      1120
weighted avg       0.83      0.82      0.82      1120



In [68]:
# laptop_test confusion
restaurant_test_input_ids = test_input_ids[1120:]
restaurant_test_label = test_label[1120:]
restaurant_test_lstm_predict = test_lstm_predict[1120:]
predictions_res_test = model.predict([restaurant_test_input_ids, restaurant_test_lstm_predict])
predictions_res_test = np.argmax(predictions_res_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(restaurant_test_label, predictions_res_test))
print(confusion_matrix(restaurant_test_label, predictions_res_test))
print(classification_report(restaurant_test_label, predictions_res_test))

0.780564263322884
[[ 87  30  11]
 [ 28 120  21]
 [ 13  37 291]]
              precision    recall  f1-score   support

           0       0.68      0.68      0.68       128
           1       0.64      0.71      0.67       169
           2       0.90      0.85      0.88       341

    accuracy                           0.78       638
   macro avg       0.74      0.75      0.74       638
weighted avg       0.79      0.78      0.78       638

