## 先跑LSTM抓取feature(把hidden layer抽出來)，再放到bert裡面去做分類

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

### 對處理好的laptop、restaurant的train、test資料作前處理

In [2]:
#把dataframe裡的text切成text左邊跟右邊並做一些處理的function
def split_text(df):
    df['left_text'] = 'N/A'
    df['right_text'] = 'N/A'
    
    for i in tqdm(range(len(df))):
        text = df.loc[i, 'text']
        aspect = df.loc[i, 'aspect']
        text_split = text.split(aspect) # 根據aspect切割text左右邊
        
        left_text = text_split[0]+aspect
        right_text = aspect+text_split[1]
        left_text = left_text.lower() # 把字串變成小寫
        right_text = right_text.lower()
        left_text = re.sub('-', ' ', left_text)
        right_text = re.sub('-', ' ', right_text)
        left_text = re.sub('[.,!"()#%&/:?~]', '', left_text) # 把字串中的一些符號刪除
        right_text = re.sub('[.,!"()#%&/:?~]', '', right_text)
        
        df.loc[i,'left_text'] = left_text
        df.loc[i,'right_text'] = right_text
        df.loc[i, 'left_right_text'] = left_text +' '+ right_text # 用來文字encoding
        
    return df

In [3]:
laptop_train = pd.read_csv('dataset/laptop_train_processed.csv', encoding='utf-8')
restaurant_train = pd.read_csv('dataset/restaurant_train_processed.csv', encoding='utf-8')
laptop_test = pd.read_csv('dataset/laptop_test_processed.csv', encoding='utf-8')
restaurant_test = pd.read_csv('dataset/restaurant_test_processed.csv', encoding='utf-8')

# 把train的資料串在一起
train_data = laptop_train.append(restaurant_train)
train_data = train_data.reset_index(drop=True)

#把test的資料串在一起
test_data = laptop_test.append(restaurant_test)
test_data = test_data.reset_index(drop=True)

#把train、test資料串在一起
data = train_data.append(test_data)
data = data.reset_index(drop=True)

# data切割text
data = split_text(data)

print('訓練資料集:', len(train_data))
print('測試資料集:', len(test_data))
print('所有資料集:', len(data))
data.head(10)

100%|██████████| 7673/7673 [00:02<00:00, 2903.25it/s]

訓練資料集: 5915
測試資料集: 1758
所有資料集: 7673





Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text
0,I charge it at night and skip taking the cord ...,cord,neutral,i charge it at night and skip taking the cord,cord with me because of the good battery life,i charge it at night and skip taking the cord ...
1,I charge it at night and skip taking the cord ...,battery life,positive,i charge it at night and skip taking the cord ...,battery life,i charge it at night and skip taking the cord ...
2,The tech guy then said the service center does...,service center,negative,the tech guy then said the service center,service center does not do 1 to 1 exchange and...,the tech guy then said the service center serv...
3,The tech guy then said the service center does...,"""sales"" team",negative,the tech guy then said the service center does...,sales team which is the retail shop which i bo...,the tech guy then said the service center does...
4,The tech guy then said the service center does...,tech guy,neutral,the tech guy,tech guy then said the service center does not...,the tech guy tech guy then said the service ce...
5,"it is of high quality, has a killer GUI, is ex...",quality,positive,it is of high quality,quality has a killer gui is extremely stable i...,it is of high quality quality has a killer gui...
6,"it is of high quality, has a killer GUI, is ex...",GUI,positive,it is of high quality has a killer gui,gui is extremely stable is highly expandable i...,it is of high quality has a killer gui gui is ...
7,"it is of high quality, has a killer GUI, is ex...",applications,positive,it is of high quality has a killer gui is extr...,applications is easy to use and is absolutely ...,it is of high quality has a killer gui is extr...
8,"it is of high quality, has a killer GUI, is ex...",use,positive,it is of high quality has a killer gui is extr...,use and is absolutely gorgeous,it is of high quality has a killer gui is extr...
9,Easy to start up and does not overheat as much...,start up,positive,easy to start up,start up and does not overheat as much as othe...,easy to start up start up and does not overhea...


In [4]:
# print一個出來看看
n = 3
print(data.loc[n, 'text'])
print()
print(data.loc[n, 'left_text'])
print()
print(data.loc[n, 'right_text'])
print()
print(data.loc[n, 'left_right_text'])

The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my netbook from.

the tech guy then said the service center does not do 1 to 1 exchange and i have to direct my concern to the sales team

sales team which is the retail shop which i bought my netbook from

the tech guy then said the service center does not do 1 to 1 exchange and i have to direct my concern to the sales team sales team which is the retail shop which i bought my netbook from


In [5]:
# 把文字Label變成數字label
data.loc[data['polarity'] == 'positive', 'label'] = 2
data.loc[data['polarity'] == 'neutral', 'label'] = 1
data.loc[data['polarity'] == 'negative', 'label'] = 0
data['label'] = data['label'].astype(int)

data.head(10)

Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text,label
0,I charge it at night and skip taking the cord ...,cord,neutral,i charge it at night and skip taking the cord,cord with me because of the good battery life,i charge it at night and skip taking the cord ...,1
1,I charge it at night and skip taking the cord ...,battery life,positive,i charge it at night and skip taking the cord ...,battery life,i charge it at night and skip taking the cord ...,2
2,The tech guy then said the service center does...,service center,negative,the tech guy then said the service center,service center does not do 1 to 1 exchange and...,the tech guy then said the service center serv...,0
3,The tech guy then said the service center does...,"""sales"" team",negative,the tech guy then said the service center does...,sales team which is the retail shop which i bo...,the tech guy then said the service center does...,0
4,The tech guy then said the service center does...,tech guy,neutral,the tech guy,tech guy then said the service center does not...,the tech guy tech guy then said the service ce...,1
5,"it is of high quality, has a killer GUI, is ex...",quality,positive,it is of high quality,quality has a killer gui is extremely stable i...,it is of high quality quality has a killer gui...,2
6,"it is of high quality, has a killer GUI, is ex...",GUI,positive,it is of high quality has a killer gui,gui is extremely stable is highly expandable i...,it is of high quality has a killer gui gui is ...,2
7,"it is of high quality, has a killer GUI, is ex...",applications,positive,it is of high quality has a killer gui is extr...,applications is easy to use and is absolutely ...,it is of high quality has a killer gui is extr...,2
8,"it is of high quality, has a killer GUI, is ex...",use,positive,it is of high quality has a killer gui is extr...,use and is absolutely gorgeous,it is of high quality has a killer gui is extr...,2
9,Easy to start up and does not overheat as much...,start up,positive,easy to start up,start up and does not overheat as much as othe...,easy to start up start up and does not overhea...,2


In [6]:
#找出left_text跟right_text裡面最多是多少字
max_count = 0
for i in range(len(data)):
    left_text_word_count = len(data.loc[i,'left_text'].split())
    right_text_word_count = len(data.loc[i,'right_text'].split())
    big_count = max(left_text_word_count, right_text_word_count)
    if big_count>max_count:
        max_count = big_count
print('left_text與right_text最多的字數:', max_count)

left_text與right_text最多的字數: 72


### 對文字做encoding

In [7]:
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [8]:
max_words = 7000 # 最大的字數
max_seq_length = 80 # 句子最長長度
embedding_dim = 300 # 每個字維度

In [9]:
# 把字變成token
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(data['left_right_text'].to_numpy())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# word_index就是根據left_right_text內容彙整出來的切字跟代表那個字的token number (每個字的dict)

Found 6557 unique tokens.


In [10]:
# 檢查word_index(dictionary)裡面的東西，前面是字，後面是token
for x in list(word_index)[0:10]:
    print (x, ':', word_index[x])

the : 1
and : 2
a : 3
to : 4
is : 5
i : 6
of : 7
for : 8
food : 9
it : 10


In [11]:
# 檢查其中一項字串的token
n = 9 # index number
left_text = data['left_text'].to_numpy() # 轉成向量
right_text = data['right_text'].to_numpy()
left_text_seq = tokenizer.texts_to_sequences(left_text)
right_text_seq = tokenizer.texts_to_sequences(right_text)
print(data.loc[n, 'left_text'])
print(data.loc[n, 'right_text'])
print(left_text_seq[n])
print(right_text_seq[n])
print(type(right_text_seq))
# 把右邊的字串token倒過來，因為要從後面讀到前面
print('right text 倒過來')
for i in range(len(right_text_seq)):
    right_text_seq[i] = right_text_seq[i][::-1]
print(left_text_seq[n])
print(right_text_seq[n])
print(type(right_text_seq))

easy to start up
start up and does not overheat as much as other laptops
[119, 4, 588, 52]
[588, 52, 2, 213, 22, 5291, 30, 125, 30, 86, 509]
<class 'list'>
right text 倒過來
[119, 4, 588, 52]
[509, 86, 30, 125, 30, 5291, 22, 213, 2, 52, 588]
<class 'list'>


In [12]:
# token sequence 後面補0的方法
def text_seq_padding(text_seq):
    if len(text_seq) < max_seq_length:
        n = max_seq_length - len(text_seq)
        text_seq = np.pad(text_seq, (0, n), mode ='constant', constant_values=(0)) # array右邊append n 個 0
    return text_seq
# 把每個left_text_seq，right_text_seq padding到同樣的長度 (後面補0)
left_text_seq = [text_seq_padding(i) for i in left_text_seq] # 必須要 [ ] 輸出是list
left_text_seq = np.array(left_text_seq)

right_text_seq = [text_seq_padding(i) for i in right_text_seq]
right_text_seq = np.array(right_text_seq)

# n = 0 # index number
print(left_text_seq[n])
print(right_text_seq[n])

[119   4 588  52   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[ 509   86   30  125   30 5291   22  213    2   52  588    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


### 使用預先處理的詞向量 (crawl 300 dim)
#### https://fasttext.cc/docs/en/english-vectors.html

In [13]:
# 把embedding_matrix load 近來
embedding_matrix = np.load('dataset/embedding_matrix.npy')
print(type(embedding_matrix))
print(embedding_matrix.shape)
print(embedding_matrix)

<class 'numpy.ndarray'>
(6558, 300)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.0231      0.017       0.0157     ...  0.0744     -0.1118
   0.0963    ]
 [-0.1081      0.0191      0.0354     ...  0.1104      0.0475
  -0.0599    ]
 ...
 [ 0.16580001 -0.0169     -0.4138     ...  0.0933     -0.1168
  -0.1777    ]
 [-0.1179      0.0726     -0.005      ...  0.2079      0.0322
  -0.26879999]
 [ 0.24439999  0.1206      0.1123     ... -0.147      -0.0186
  -0.3204    ]]


### 確認資料、並切割成train、test

In [14]:
# 稽查dataframe、token sequence裡面laptop_test、restaurant_test資料是否一致
# laptop_test第一筆在5915；restaurant_test第一筆在6553
print(data.loc[5915, 'left_text'])
print(data.loc[5915, 'right_text'])
print(left_text_seq[5915])
print(right_text_seq[5915])
print()
print(data.loc[6553, 'left_text'])
print(data.loc[6553, 'right_text'])
print(left_text_seq[6553])
print(right_text_seq[6553])

boot time
boot time is super fast around anywhere from 35 seconds to 1 minute
[500  98   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[1318  434    4 1017 2018   44  844  261  139  532    5   98  500    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]

the bread
bread is top notch as well
[  1 309   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0 

In [15]:
# 把label轉成numpy
Y = data['label'].to_numpy()
print('Shape of Y:', Y.shape)
for i in range(10):
    print(data.loc[i, 'label'], Y[i])
#[1 0 0] = negative
#[0 1 0] = neutral
#[0 0 1] = positve

Shape of Y: (7673,)
1 1
2 2
0 0
0 0
1 1
2 2
2 2
2 2
2 2
2 2


In [16]:
#把資料切割成train、test
X_left_train = left_text_seq[:5915]
X_right_train = right_text_seq[:5915]
Y_train = Y[:5915]
X_left_test = left_text_seq[5915:]
X_right_test = right_text_seq[5915:]
Y_test = Y[5915:]
print(len(X_left_train), len(X_right_train), len(Y_train))
print(len(X_left_test), len(X_right_test), len(Y_test))

5915 5915 5915
1758 1758 1758


In [17]:
# 檢查polarity跟label有沒有不一樣
print('laptop_test', '   ','restaurant_test')
for i in range(20):
    print(laptop_test.loc[i, 'polarity'], data.loc[5915+i, 'label'], Y_test[i], '  ', restaurant_test.loc[i, 'polarity'], data.loc[6553+i, 'label'], Y_test[638+i])

laptop_test     restaurant_test
positive 2 2    positive 2 2
negative 0 0    positive 2 2
positive 2 2    positive 2 2
negative 0 0    positive 2 2
negative 0 0    positive 2 2
negative 0 0    positive 2 2
positive 2 2    positive 2 2
negative 0 0    positive 2 2
neutral 1 1    positive 2 2
positive 2 2    positive 2 2
positive 2 2    neutral 1 1
positive 2 2    positive 2 2
positive 2 2    positive 2 2
positive 2 2    positive 2 2
positive 2 2    negative 0 0
positive 2 2    positive 2 2
negative 0 0    neutral 1 1
negative 0 0    neutral 1 1
positive 2 2    positive 2 2
positive 2 2    positive 2 2


## Model

In [18]:
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Flatten, InputLayer, Bidirectional, concatenate, add, average, Reshape
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model

### 把兩邊input concate起來，有加上dropout的模型

In [19]:
# first input model 1
input_layer_1 = Input(shape = (max_seq_length,), dtype='int64')
embedding_1 = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], mask_zero=True, trainable=True)(input_layer_1)
lstm_hidden_1 = LSTM(512, return_sequences=False, dropout=0.3)(embedding_1) 
# lstm_hidden_1 = Bidirectional(LSTM(512, return_sequences=False, dropout=0.4))(embedding_1) 

#second input model 2
input_layer_2 = Input(shape = (max_seq_length,), dtype='int64')
embedding_2 = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], mask_zero=True, trainable=True)(input_layer_2)
lstm_hidden_2 = LSTM(512, return_sequences=False, dropout=0.3)(embedding_2)
# lstm_hidden_2 = Bidirectional(LSTM(512, return_sequences=False, dropout=0.4))(embedding_2)

#merge input model
averaged = concatenate([lstm_hidden_1, lstm_hidden_2])
hidden_1 = Dense(128, activation='relu')(averaged)
dropout_1 = Dropout(0.3)(hidden_1)
hidden_2 = Dense(64, activation='relu')(dropout_1)
dropout_2 = Dropout(0.3)(hidden_2)
output = Dense(3, activation='softmax')(dropout_2)
model1 = Model(inputs=[input_layer_1, input_layer_2], outputs=output)
print(model1.summary())
adam = Adam(lr=1e-2)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model1.compile(loss=loss, optimizer=optimizer, metrics=[metric])

early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=1, restore_best_weights=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 80, 300)      1967400     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 80, 300)      1967400     input_2[0][0]                    
______________________________________________________________________________________________

In [20]:
model1_fit = model1.fit([X_left_train, X_right_train],Y_train, batch_size=64,epochs=30,
                      validation_data=([X_left_test, X_right_test],Y_test), callbacks=[early_stopping])

Train on 5915 samples, validate on 1758 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 00018: early stopping


### 看confusion matrix

In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [22]:
# 取所有label中的test label
Y_label = data['label'].to_numpy()[5915:]

# test data confusion matrix
predictions= model1.predict([X_left_test, X_right_test]) # 輸出的是n*5的編碼值array
print('三元分類還沒argmax output')
print(predictions)
print()
predictions = np.argmax(predictions, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(Y_label, predictions))
print(confusion_matrix(Y_label, predictions))
print(classification_report(Y_label, predictions))


三元分類還沒argmax output
[[9.3566136e-08 9.4068973e-08 9.9999976e-01]
 [9.9720287e-01 2.1214650e-03 6.7564804e-04]
 [2.1758662e-09 4.6127449e-09 1.0000000e+00]
 ...
 [8.1276925e-20 8.8947807e-19 1.0000000e+00]
 [1.3840235e-19 1.8584126e-18 1.0000000e+00]
 [1.4807888e-14 1.3286274e-13 1.0000000e+00]]

0.7440273037542662
[[191  59  74]
 [ 65 163 137]
 [ 53  62 954]]
              precision    recall  f1-score   support

           0       0.62      0.59      0.60       324
           1       0.57      0.45      0.50       365
           2       0.82      0.89      0.85      1069

    accuracy                           0.74      1758
   macro avg       0.67      0.64      0.65      1758
weighted avg       0.73      0.74      0.73      1758



In [23]:
# 取所有label中的laptop test lable
laptop_label = data['label'].to_numpy()[5915:6553]

# laptop test data confusion matrix
predictions= model1.predict([X_left_test[:638], X_right_test[:638]]) # 輸出的是n*5的編碼值array
predictions = np.argmax(predictions, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(laptop_label, predictions))
print(confusion_matrix(laptop_label, predictions))
print(classification_report(laptop_label, predictions))

0.6865203761755486
[[ 68  34  26]
 [ 38  86  45]
 [ 21  36 284]]
              precision    recall  f1-score   support

           0       0.54      0.53      0.53       128
           1       0.55      0.51      0.53       169
           2       0.80      0.83      0.82       341

    accuracy                           0.69       638
   macro avg       0.63      0.62      0.63       638
weighted avg       0.68      0.69      0.68       638



In [24]:
# 取所有label中的restaurant test lable
restaurant_label = data['label'].to_numpy()[6553:]

# restaurant test data confusion matrix
predictions= model1.predict([X_left_test[638:], X_right_test[638:]]) # 輸出的是n*5的編碼值array
predictions = np.argmax(predictions, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(restaurant_label, predictions))
print(confusion_matrix(restaurant_label, predictions))
print(classification_report(restaurant_label, predictions))

0.7767857142857143
[[123  25  48]
 [ 27  77  92]
 [ 32  26 670]]
              precision    recall  f1-score   support

           0       0.68      0.63      0.65       196
           1       0.60      0.39      0.48       196
           2       0.83      0.92      0.87       728

    accuracy                           0.78      1120
   macro avg       0.70      0.65      0.67      1120
weighted avg       0.76      0.78      0.76      1120



### get train、test LSTM 64 dimension output

In [26]:
layer_output=model1.get_layer('dense').output
intermediate_model = Model(inputs=[input_layer_1, input_layer_2],outputs=layer_output)
intermediate_prediction=intermediate_model.predict([left_text_seq, right_text_seq])

In [27]:
print(type(intermediate_prediction))
print(intermediate_prediction.shape)
print(intermediate_prediction)

<class 'numpy.ndarray'>
(7673, 128)
[[0.6153577  0.         0.         ... 0.5140676  0.         0.0568014 ]
 [0.         0.         0.04505857 ... 0.         0.10509332 0.02125632]
 [0.41109017 0.         0.52117074 ... 0.2768108  0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         2.6969218  1.293426  ]
 [0.         0.         0.         ... 0.         2.6723366  1.2036746 ]
 [0.         0.         0.         ... 0.         1.9964995  0.74411166]]


### 把LSTM dimension 放進dataframe

In [28]:
# 把LSTM predict出來的東西放進dataframe
data['lstm_predict'] = 'N/A'
for i in range(len(data)):
    data['lstm_predict'][i] = intermediate_prediction[i]
data.tail(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text,label,lstm_predict
7663,"Anyway, the owner was fake.",owner,negative,anyway the owner,owner was fake,anyway the owner owner was fake,0,"[0.29783994, 0.0, 0.026724296, 0.053328127, 0...."
7664,Owner is pleasant and entertaining.,Owner,positive,owner,owner is pleasant and entertaining,owner owner is pleasant and entertaining,2,"[0.0, 0.0, 0.0, 2.14256, 0.3379526, 0.0, 2.005..."
7665,"I have never in my life sent back food before,...",food,negative,i have never in my life sent back food,food before but i simply had to and the waiter...,i have never in my life sent back food food be...,0,"[0.9518227, 0.0, 0.0, 0.4208995, 1.1578953, 0...."
7666,"I have never in my life sent back food before,...",waiter,negative,i have never in my life sent back food before ...,waiter argued with me over this,i have never in my life sent back food before ...,0,"[0.0, 0.0, 0.0, 0.8968659, 0.0, 0.19021267, 0...."
7667,"Although the restaurant itself is nice, I pref...",food,negative,although the restaurant itself is nice i prefe...,food,although the restaurant itself is nice i prefe...,0,"[0.0, 0.0, 0.0, 0.6106661, 0.40160874, 0.0, 0...."
7668,"Creamy appetizers--taramasalata, eggplant sala...",Creamy appetizers,positive,creamy appetizers,creamy appetizers taramasalata eggplant salad...,creamy appetizers creamy appetizers taramasal...,2,"[0.0, 0.0, 0.0, 2.5973647, 0.55685735, 0.0, 2...."
7669,"Creamy appetizers--taramasalata, eggplant sala...",warm pitas,neutral,creamy appetizers taramasalata eggplant salad...,warm pitas,creamy appetizers taramasalata eggplant salad...,1,"[0.0, 0.0, 0.0, 2.095603, 1.0954943, 0.0, 0.96..."
7670,"Creamy appetizers--taramasalata, eggplant sala...",taramasalata,positive,creamy appetizers taramasalata,taramasalata eggplant salad and greek yogurt w...,creamy appetizers taramasalata taramasalata e...,2,"[0.0, 0.0, 0.0, 3.1061869, 0.5915099, 0.0, 2.3..."
7671,"Creamy appetizers--taramasalata, eggplant sala...",eggplant salad,positive,creamy appetizers taramasalata eggplant salad,eggplant salad and greek yogurt with cuccumber...,creamy appetizers taramasalata eggplant salad...,2,"[0.0, 0.0, 0.0, 3.0199537, 0.6858778, 0.0, 2.4..."
7672,"Creamy appetizers--taramasalata, eggplant sala...","Greek yogurt (with cuccumber, dill, and garlic)",positive,creamy appetizers taramasalata eggplant salad...,greek yogurt with cuccumber dill and garlic ta...,creamy appetizers taramasalata eggplant salad...,2,"[0.0, 0.0, 0.0, 2.3910232, 0.80148387, 0.0, 1...."


In [29]:
# 稽查
n = 504
print(data.loc[n, 'lstm_predict'].shape)
print(len(data.loc[n, 'lstm_predict']))
print(data.loc[n, 'lstm_predict'])

(128,)
128
[0.         0.         0.         2.719978   1.029768   0.
 1.9912829  0.1033069  0.         0.         2.8324864  0.
 0.         2.8872955  0.         0.         4.0292883  0.
 1.6299099  3.2112064  0.15822892 3.1007688  0.         2.0963035
 0.         0.         0.         3.7101977  3.2595015  0.
 1.8803552  0.         2.1338663  0.         3.766756   0.95988923
 0.14947261 0.         0.         0.         0.         2.5026848
 4.1936526  0.         1.1535277  0.         3.0017245  1.0984769
 3.2968705  0.         1.8785983  0.         4.584175   0.
 4.1292644  0.7932159  4.082077   2.6583815  0.         0.
 0.         0.         0.         0.         0.         2.8812826
 0.         1.3314581  0.         3.7014701  0.         0.
 0.         4.067122   3.3511631  0.24262889 0.         0.68987805
 0.         0.         3.6477823  0.85097045 0.         0.
 2.9952211  0.05694206 3.7964575  3.4318027  0.         1.8393604
 3.1541011  0.         0.         0.         0.      

### Bert前處理

In [30]:
from transformers import BertTokenizer, BertModel, TFBertForSequenceClassification, TFBertModel

In [31]:
# Load pre-trained model tokenizer, to convert our text into tokens that correspond to BERT’s vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### 找出單句最多token

In [32]:
# 找出最多text add aspect中最多是幾個token，不包含CLS跟SEP
def find_max_token(pd):
    max_token = 0
    index = 0
    for i in range(len(pd)):
        text = pd.loc[i, 'text']
        aspect = pd.loc[i, 'aspect']
        text_aspect = text + ' ' +aspect
        tokens_len = len(tokenizer.tokenize(text_aspect))
        if tokens_len>max_token:
            max_token = tokens_len
            index = i
    return [max_token, index]

In [33]:
# 找出text add aspect中token最多的是幾個token，不包含CLS跟SEP
max_token = find_max_token(data)
print('資料集token最多與index是:', max_token)

資料集token最多與index是: [99, 6299]


### 正式把資料轉換成token(padding)

#### 把句子轉變成token(CLS+text+SEP+asepct)+(padding)的function

In [34]:
# 把維度固定在128維
input_dim = 128
def input_ids_all(pd):
    pd['input_ids'] = 'N/A'
    for i in range(len(pd)):
        text = pd.loc[i, 'text']
        aspect = pd.loc[i, 'aspect']
        text_tokens = tokenizer.tokenize(text) # 把text轉成token
        aspect_tokens = tokenizer.tokenize(aspect) # 把aspect轉成token
        
        text_input_ids = tokenizer.convert_tokens_to_ids(text_tokens) # 把text token轉成text token id
        aspect_input_ids = tokenizer.convert_tokens_to_ids(aspect_tokens) # 把aspect token轉成aspect token id
        
        text_input_ids_cls = tokenizer.build_inputs_with_special_tokens(text_input_ids) # aspect token id加上CLS、SEP token id
        input_ids = text_input_ids_cls + aspect_input_ids # 把aspect token id接在text token id 後面 (CLS+text+SEP+aspect)
        input_ids = np.array(input_ids)
        
        if len(input_ids) < input_dim:
            n = input_dim - len(input_ids)
            input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0)) # array右邊append n 個 0  補長度到512
        
        pd['input_ids'][i] = input_ids
    return pd

In [35]:
# 將text轉成token，後面加上aspect token存進dataframe
data = input_ids_all(data)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,text,aspect,polarity,left_text,right_text,left_right_text,label,lstm_predict,input_ids
0,I charge it at night and skip taking the cord ...,cord,neutral,i charge it at night and skip taking the cord,cord with me because of the good battery life,i charge it at night and skip taking the cord ...,1,"[0.6153577, 0.0, 0.0, 0.18314853, 1.0045241, 0...","[101, 1045, 3715, 2009, 2012, 2305, 1998, 1355..."
1,I charge it at night and skip taking the cord ...,battery life,positive,i charge it at night and skip taking the cord ...,battery life,i charge it at night and skip taking the cord ...,2,"[0.0, 0.0, 0.04505857, 0.12736586, 0.0, 0.4572...","[101, 1045, 3715, 2009, 2012, 2305, 1998, 1355..."
2,The tech guy then said the service center does...,service center,negative,the tech guy then said the service center,service center does not do 1 to 1 exchange and...,the tech guy then said the service center serv...,0,"[0.41109017, 0.0, 0.52117074, 0.0, 0.03945322,...","[101, 1996, 6627, 3124, 2059, 2056, 1996, 2326..."
3,The tech guy then said the service center does...,"""sales"" team",negative,the tech guy then said the service center does...,sales team which is the retail shop which i bo...,the tech guy then said the service center does...,0,"[0.0, 0.0, 0.7332473, 0.010174245, 0.0, 0.9998...","[101, 1996, 6627, 3124, 2059, 2056, 1996, 2326..."
4,The tech guy then said the service center does...,tech guy,neutral,the tech guy,tech guy then said the service center does not...,the tech guy tech guy then said the service ce...,1,"[0.5331934, 0.014703929, 0.0, 0.2424311, 1.294...","[101, 1996, 6627, 3124, 2059, 2056, 1996, 2326..."


In [36]:
# 稽查
n = 6299
print(data.loc[n, 'text'])
print(data.loc[n, 'aspect'])
print(data.loc[n, 'input_ids'])

The Like New condition of the iMac MC309LL/A on Amazon is at $900+ level only, and it is a Quad-Core 2.5 GHz CPU (similar to the $799 Mini), with Radeon HD 6750M 512MB graphic card (this mini is integrated Intel 4000 card), and it even comes with wireless Apple Keyboard and Mouse, all put together in neat and nice package.
Radeon HD 6750M 512MB graphic card
[  101  1996  2066  2047  4650  1997  1996 10047  6305 11338 14142  2683
  3363  1013  1037  2006  9733  2003  2012  1002  7706  1009  2504  2069
  1010  1998  2009  2003  1037 17718  1011  4563  1016  1012  1019 29066
 17368  1006  2714  2000  1996  1002  6535  2683  7163  1007  1010  2007
 10958  3207  2239 10751  6163 12376  2213 24406 14905  8425  4003  1006
  2023  7163  2003  6377 13420 20143  4003  1007  1010  1998  2009  2130
  3310  2007  9949  6207  9019  1998  8000  1010  2035  2404  2362  1999
 15708  1998  3835  7427  1012   102 10958  3207  2239 10751  6163 12376
  2213 24406 14905  8425  4003     0     0     0     0  

In [37]:
# 把data的input_ids提出存進list
input_ids = list()
for i in range(len(data)):
    np_id = data.loc[i, 'input_ids']
    input_ids.append(np_id)
input_ids = np.array(input_ids)
print(input_ids.shape)
input_ids

(7673, 128)


array([[  101,  1045,  3715, ...,     0,     0,     0],
       [  101,  1045,  3715, ...,     0,     0,     0],
       [  101,  1996,  6627, ...,     0,     0,     0],
       ...,
       [  101, 24519, 10439, ...,     0,     0,     0],
       [  101, 24519, 10439, ...,     0,     0,     0],
       [  101, 24519, 10439, ...,     0,     0,     0]])

In [38]:
# 把data的lstm_predcit提出存進list
lstm_predict = list()
for i in range(len(data)):
    np_lstm = data.loc[i, 'lstm_predict']
    lstm_predict.append(np_lstm)
lstm_predict = np.array(lstm_predict)
print(type(lstm_predict))
print(lstm_predict.shape)
print(lstm_predict)

<class 'numpy.ndarray'>
(7673, 128)
[[0.6153577  0.         0.         ... 0.5140676  0.         0.0568014 ]
 [0.         0.         0.04505857 ... 0.         0.10509332 0.02125632]
 [0.41109017 0.         0.52117074 ... 0.2768108  0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         2.6969218  1.293426  ]
 [0.         0.         0.         ... 0.         2.6723366  1.2036746 ]
 [0.         0.         0.         ... 0.         1.9964995  0.74411166]]


In [39]:
# 把data label變成numpy
label = data['label'].to_numpy()
print(len(label))
label

7673


array([1, 2, 0, ..., 2, 2, 2])

### 切train、test資料

In [40]:
# X
train_input_ids = input_ids[:5915]
test_input_ids = input_ids[5915:]
print(train_input_ids.shape)
print(train_input_ids)
print()
print(test_input_ids.shape)
print(test_input_ids)

(5915, 128)
[[ 101 1045 3715 ...    0    0    0]
 [ 101 1045 3715 ...    0    0    0]
 [ 101 1996 6627 ...    0    0    0]
 ...
 [ 101 2169 2795 ...    0    0    0]
 [ 101 2169 2795 ...    0    0    0]
 [ 101 2169 2795 ...    0    0    0]]

(1758, 128)
[[  101  9573  2051 ...     0     0     0]
 [  101  6627  2490 ...     0     0     0]
 [  101  2275  2039 ...     0     0     0]
 ...
 [  101 24519 10439 ...     0     0     0]
 [  101 24519 10439 ...     0     0     0]
 [  101 24519 10439 ...     0     0     0]]


In [41]:
# lstm predict
train_lstm_predict = lstm_predict[:5915]
test_lstm_predict = lstm_predict[5915:]
print(train_lstm_predict.shape)
print(test_lstm_predict.shape)

(5915, 128)
(1758, 128)


In [42]:
# Y
train_label = label[:5915]
test_label = label[5915:]
print(train_label.shape)
print(train_label)
print()
print(test_label.shape)
print(test_label)

(5915,)
[1 2 0 ... 1 1 1]

(1758,)
[2 0 2 ... 2 2 2]


In [43]:
# 檢查polarity跟label有沒有不一樣
print('test_data')
for i in range(15):
    print(test_data.loc[i, 'polarity'], data.loc[5915+i, 'label'], test_label[i])

test_data
positive 2 2
negative 0 0
positive 2 2
negative 0 0
negative 0 0
negative 0 0
positive 2 2
negative 0 0
neutral 1 1
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2
positive 2 2


### Model

In [92]:
input_layer= Input(shape = (128,), dtype='int64')
# lstm_input_layer = Input(shape = (128,), dtype='float32')
# print(type(input_layer))
# print(type(lstm_input_layer))
bert = TFBertModel.from_pretrained('bert-base-uncased')(input_layer)
bert = bert[0]
dropout = Dropout(0.1)(bert)
flat = Flatten()(dropout)
dense_1 = Dense(units=128)(flat)
# print(type(dense_1))

# merge = concatenate([dense_1, lstm_input_layer])
classifier = Dense(units=3)(dense_1) # 分3類
model = Model(inputs=input_layer, outputs=classifier)
model.summary()


optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

Model: "model_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        [(None, 128)]             0         
_________________________________________________________________
tf_bert_model_5 (TFBertModel ((None, 128, 768), (None, 109482240 
_________________________________________________________________
dropout_229 (Dropout)        (None, 128, 768)          0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 98304)             0         
_________________________________________________________________
dense_19 (Dense)             (None, 128)               12583040  
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 387       
Total params: 122,065,667
Trainable params: 122,065,667
Non-trainable params: 0
____________________________________________

In [51]:
# # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
# # num_labels=3 分3類
# model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# model.summary()

# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [93]:
model_fit = model.fit(train_input_ids, train_label, 
                      batch_size=4, epochs=5, 
                      validation_data=(test_input_ids, test_label), callbacks=[early_stopping])

Train on 5915 samples, validate on 1758 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### get train、test BERT 128 dimension output

In [95]:
layer_output=model.get_layer('dense_19').output
intermediate_model = Model(inputs=input_layer,outputs=layer_output)
intermediate_prediction=intermediate_model.predict(input_ids)

In [96]:
bert_feature = intermediate_prediction
print(type(bert_feature))
print(bert_feature.shape)
print(bert_feature)

<class 'numpy.ndarray'>
(7673, 128)
[[ 0.68260163 -0.4627373   0.16000712 ... -1.1491781   0.05248769
  -0.263877  ]
 [-0.46024865  0.53016216 -0.36681974 ...  0.2633916   1.4116355
   0.66534805]
 [ 0.46934295 -0.55190027  0.18329841 ... -0.6966846   0.54983187
   0.01628807]
 ...
 [-0.28497407  0.33559287 -0.6235323  ...  1.285512    1.2011584
   0.42756274]
 [-0.223069   -0.2866009  -0.2205159  ...  1.6681943   0.9824842
   0.05190899]
 [-0.44749096  0.9202354  -0.51188177 ...  1.4626744   1.6955111
   0.9330041 ]]


In [97]:
train_bert_predict = bert_feature[:5915]
test_bert_predict = bert_feature[5915:]
print(train_bert_predict.shape)
print(test_bert_predict.shape)

(5915, 128)
(1758, 128)


In [98]:
print(train_lstm_predict.shape)
print(test_lstm_predict.shape)

(5915, 128)
(1758, 128)


### 把兩個訓練好的模型輸出丟到dense訊練

In [100]:
lstm_input_layer = Input(shape = (128,), dtype='float32')
bert_input_layer = Input(shape = (128,), dtype='float32')
merge = concatenate([lstm_input_layer, bert_input_layer])
classifier = Dense(units=3)(merge) # 分3類
modelfix = Model(inputs=[lstm_input_layer, bert_input_layer], outputs=classifier)
print(modelfix.summary())
# adam = Adam(lr=1e-2)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
modelfix.compile(loss=loss, optimizer=optimizer, metrics=[metric])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, verbose=1, restore_best_weights=True)

Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           [(None, 128)]        0                                            
__________________________________________________________________________________________________
concatenate_8 (Concatenate)     (None, 256)          0           input_23[0][0]                   
                                                                 input_24[0][0]                   
__________________________________________________________________________________________________
dense_22 (Dense)                (None, 3)            771         concatenate_8[0][0]       

In [101]:
modelfix_fit = modelfix.fit([train_lstm_predict, train_bert_predict],Y_train, batch_size=64,epochs=30,
                      validation_data=([test_lstm_predict, test_bert_predict],Y_test), callbacks=[early_stopping])

Train on 5915 samples, validate on 1758 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 00012: early stopping


In [102]:
# test data confusion
predictions_test= modelfix.predict([test_lstm_predict,test_bert_predict]) # 輸出的是n*5的編碼值array
predictions_test = np.argmax(predictions_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(test_label, predictions_test))
print(confusion_matrix(test_label, predictions_test))
print(classification_report(test_label, predictions_test))

0.8191126279863481
[[265  26  33]
 [ 86 183  96]
 [ 42  35 992]]
              precision    recall  f1-score   support

           0       0.67      0.82      0.74       324
           1       0.75      0.50      0.60       365
           2       0.88      0.93      0.91      1069

    accuracy                           0.82      1758
   macro avg       0.77      0.75      0.75      1758
weighted avg       0.82      0.82      0.81      1758



In [103]:
# laptop_test confusion
laptop_test_lstm_predict = test_lstm_predict[:638]
laptop_test_bert_predict = test_bert_predict[:638]
laptop_test_label = test_label[:638]
predictions_lap_test = modelfix.predict([laptop_test_lstm_predict, laptop_test_bert_predict])
predictions_lap_test = np.argmax(predictions_lap_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(laptop_test_label, predictions_lap_test))
print(confusion_matrix(laptop_test_label, predictions_lap_test))
print(classification_report(laptop_test_label, predictions_lap_test))

0.7664576802507836
[[106  13   9]
 [ 52  81  36]
 [ 19  20 302]]
              precision    recall  f1-score   support

           0       0.60      0.83      0.70       128
           1       0.71      0.48      0.57       169
           2       0.87      0.89      0.88       341

    accuracy                           0.77       638
   macro avg       0.73      0.73      0.72       638
weighted avg       0.77      0.77      0.76       638



In [104]:
# laptop_test confusion
restaurant_test_lstm_predict = test_lstm_predict[638:]
restaurant_test_bert_predict = test_bert_predict[638:]
restaurant_test_label = test_label[638:]
predictions_res_test = modelfix.predict([restaurant_test_lstm_predict, restaurant_test_bert_predict])
predictions_res_test = np.argmax(predictions_res_test, axis=1) # axis = 1是取行的最大值的索引，0是列的最大值的索引
print(accuracy_score(restaurant_test_label, predictions_res_test))
print(confusion_matrix(restaurant_test_label, predictions_res_test))
print(classification_report(restaurant_test_label, predictions_res_test))

0.8491071428571428
[[159  13  24]
 [ 34 102  60]
 [ 23  15 690]]
              precision    recall  f1-score   support

           0       0.74      0.81      0.77       196
           1       0.78      0.52      0.63       196
           2       0.89      0.95      0.92       728

    accuracy                           0.85      1120
   macro avg       0.80      0.76      0.77      1120
weighted avg       0.85      0.85      0.84      1120

