# 影评资料集(IMDB movie review)情绪分析 

In [None]:
# 载入相关套件
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyperparameters and run settings

# Vocabulary size (number of distinct words kept)
num_distinct_words = 5000
# Maximum number of words per review
max_sequence_length = 300
# Output dimension of the embedding layer
embedding_output_dims = 15

# Mini-batch size
batch_size = 128
# Number of training epochs
number_of_epochs = 5
# Fraction of the training data used for validation
validation_split = 0.20
# Verbosity level of the training log
verbosity_mode = 1

In [None]:
# Load the IMDB movie-review dataset; TensorFlow provides it already
# converted to sequences of word indices. Only num_distinct_words words
# are kept in the vocabulary.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=num_distinct_words)
print(x_train.shape)
print(x_test.shape)

# Pad reviews shorter than max_sequence_length with 0. The padding and
# truncating sides are left at the pad_sequences defaults.
padded_inputs = pad_sequences(x_train, maxlen=max_sequence_length, value=0)
padded_inputs_test = pad_sequences(x_test, maxlen=max_sequence_length,
                                   value=0)

# Build the model: embedding -> LSTM -> one sigmoid unit (binary sentiment)
model = Sequential()
model.add(Embedding(num_distinct_words, embedding_output_dims,
                    input_length=max_sequence_length))
model.add(LSTM(10))
model.add(Dense(1, activation='sigmoid'))

# Set the optimizer and loss function. The loss is passed as an instance,
# BinaryCrossentropy(), not as the bare class: Keras expects a loss name,
# a callable taking (y_true, y_pred), or a Loss instance.
model.compile(optimizer=Adam(), loss=BinaryCrossentropy(),
              metrics=['accuracy'])

# Print the model summary
model.summary()

In [26]:
# Display the test-set labels
y_test

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [1]:
# Train the model; part of the training data is held out for validation
fit_options = {
    'batch_size': batch_size,
    'epochs': number_of_epochs,
    'verbose': verbosity_mode,
    'validation_split': validation_split,
}
history = model.fit(padded_inputs, y_train, **fit_options)

# Evaluate the model on the padded test data; the result is printed as
# loss (index 0) and accuracy in percent (index 1)
test_results = model.evaluate(padded_inputs_test, y_test, verbose=False)
print(f'Loss: {test_results[0]}, Accuracy: {100*test_results[1]}%')

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


(25000,)
(25000,)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 15)           75000     
_________________________________________________________________
lstm (LSTM)                  (None, 10)                1040      
_________________________________________________________________
dense (Dense)                (None, 1)                 11        
Total params: 76,051
Trainable params: 76,051
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test results - Loss: 0.33703479669570924 - Accuracy: 86.62800192832947%


In [2]:
# Save the model to the file 'LSTM_IMDB.h5'
model.save('LSTM_IMDB.h5')

In [7]:
# Fetch the dictionary that maps each word to its index
imdb_dict = imdb.get_word_index()
# Preview the first ten words (iterating a dict yields its keys)
list(imdb_dict)[:10]

['fawn',
 'tsukino',
 'nunnery',
 'sonja',
 'vani',
 'woods',
 'spiders',
 'hanging',
 'woody',
 'trawling']

In [28]:
# Invert the dictionary so that it maps index -> word
imdb_dict_reversed = {index: word for word, index in imdb_dict.items()}

In [92]:
# Turn the first two padded test reviews back into comma-separated text.
# Every non-zero index becomes "<word>,"; every padding 0 becomes " ,".
# NOTE(review): the indices are looked up directly in the reversed word
# index; Keras documents an index_from offset (default 3) for
# imdb.load_data, so the decoded words may not be the original review
# text -- confirm. The re-encoding cell below uses the same direct mapping.
text = [
    ''.join(
        (imdb_dict_reversed[index] + ',') if index != 0 else ' ,'
        for index in row
    )
    for row in padded_inputs_test[:2]
]
text

[" , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,the,wonder,own,as,by,is,sequence,i,i,and,and,to,of,hollywood,br,of,down,and,getting,boring,of,ever,it,sadly,sadly,sadly,i,i,was,then,does,don't,close,and,after,one,carry,as,by,are,be,and,all,family,turn,in,does,as,three,part,in,another,some,to,be,probably,with,world,and,her,an,have,and,beginning,own,as,is,sequence,",
 " , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,the,as,you,world's,is,quite,br,and,most,that,quest,are,chase,to,being,quickly,of,little,it,time,hell,to,plot,br,of,something,long,put

In [93]:
# Look up the word stored under index 14 in the reversed dictionary
imdb_dict_reversed[14]

'as'

In [94]:
# Test the model with the sentences decoded above: tokenize them and convert
# the tokens back to index values.
import nltk
import numpy as np

# Tokenize every sentence and strip surrounding whitespace from each token
X_tokens = [
    [token.strip() for token in nltk.word_tokenize(line)]
    for line in text
]

# Convert tokens to index values. Each row has max_sequence_length slots,
# filled from the left; tokens beyond that length are dropped.
X_index = np.zeros((len(text), max_sequence_length))
for i, line in enumerate(X_tokens):
    for j, word in enumerate(line[:max_sequence_length]):
        # Tokens missing from the dictionary keep the value 0
        index = imdb_dict.get(word, 0)
        # The embedding layer only has num_distinct_words rows, so a larger
        # dictionary index would be out of range; leave it at 0 as well
        if index < num_distinct_words:
            X_index[i, j] = index


In [95]:
# Pad with 0 up to max_sequence_length. The padding/truncating sides are
# left at the pad_sequences defaults, the same call form used for the
# training and test inputs.
padded_inputs = pad_sequences(X_index, maxlen=max_sequence_length, value=0)

# Predict. The model ends in a single sigmoid unit, so the class is obtained
# by thresholding the probability at 0.5; argmax over a single output column
# would always return 0.
(model.predict(padded_inputs) > 0.5).astype('int32')

array([[0],
       [1]])

In [96]:
# Predict on the original encoded test data to confirm the answers match.
# The single sigmoid output is thresholded at 0.5; argmax over a single
# output column would always return 0.
(model.predict(padded_inputs_test[:2]) > 0.5).astype('int32')

array([[0],
       [1]])