In [1]:
# 导入数据以后运行模型，选一个运行，测试准确率。若要画roc曲线，先算概率

# 数据导入(官网方案)

In [1]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
import numpy as np

Using TensorFlow backend.


In [2]:
# Vocabulary size: only the max_features most frequent words are kept when the
# dataset is loaded.
max_features = 3800
# Every review is padded/truncated to this many word indices.
maxlen = 38
batch_size = 100
# Dimensionality of the word-embedding vectors.
embedding_dims = 32
# Convolution settings (used by the CNN model only).
filters = 250
kernel_size = 3
# Number of units in the CNN's fully connected hidden layer.
hidden_dims = 250
epochs = 10

In [3]:
print('Loading data...')
# Load the IMDB reviews as sequences of word indices, restricted to the
# max_features most frequent words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
# Pad/truncate both splits to exactly maxlen indices per review.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 38)
x_test shape: (25000, 38)


## 建立模型（MLP）

In [26]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation,Flatten
from keras.layers.embeddings import Embedding

In [27]:
model = Sequential() # Start an empty linear stack of layers for the MLP model

In [28]:
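# Add the embedding layer.
# output_dim is the output dimensionality: each word index is mapped to a 32-dimensional vector.
# input_dim is the input dimensionality, i.e. the vocabulary size fixed during preprocessing (max_features).
# input_length is the number of word indices in each sequence (maxlen).
# The Dropout layer reduces overfitting by randomly dropping 20% of the units during training.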

In [29]:
# Embedding layer: turns each of the maxlen word indices into an
# embedding_dims-sized vector.
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dims,
        input_length=maxlen,
    )
)
# Drop 20% of the embedding activations during training.
model.add(Dropout(0.2))

In [30]:
# 平坦层模型
# 平坦层的神经元数量==数字列表每项数目maxlen * 每个数字维度output_dim

In [31]:
# Flatten the (maxlen, embedding_dims) embedding output into a single vector per
# sample so it can feed the dense layers.
model.add(Flatten())

In [32]:
# 隐藏层
# 有units个神经元
# 激活函数为activation，如ReLU

In [33]:
# Fully connected hidden layer: 256 units with ReLU activation.
model.add(Dense(activation='relu', units=256))
# Drop 20% of the hidden activations during training.
model.add(Dropout(0.2))

In [34]:
# 输出层
# 一个神经元，1代表正面评价，0代表负面评价
# 也要定义激活函数，如sigmoid

In [35]:
# Output layer: a single sigmoid unit (1 = positive review, 0 = negative review).
model.add(Dense(activation='sigmoid', units=1))

In [36]:
model.summary() # Print the layer-by-layer summary of the model

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 38, 32)            121600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 38, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1216)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               311552    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 433,409
Trainable params: 433,409
Non-trainable params: 0
________________________________________________

## 建立模型（RNN）

In [20]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

In [21]:
# Start an empty linear stack of layers for the RNN model.
model = Sequential()

In [22]:
# Embedding layer: turns each of the maxlen word indices into an
# embedding_dims-sized vector.
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dims,
        input_length=maxlen,
    )
)
# Drop 35% of the embedding activations during training.
model.add(Dropout(0.35))

In [23]:
# 建立16个神经元的RNN层

In [24]:
# Recurrent layer with 16 units; it consumes the embedded sequence.
model.add(SimpleRNN(units=16))

In [25]:
# Fully connected hidden layer: 256 units with ReLU activation.
model.add(
    Dense(
        units=256,
        activation='relu',
    )
)
# Drop 35% of the hidden activations during training.
model.add(Dropout(0.35))

In [26]:
# Output layer: a single sigmoid unit producing the review's probability score.
model.add(
    Dense(
        units=1,
        activation='sigmoid',
    )
)

In [27]:
# Print the layer-by-layer summary of the RNN model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 38, 32)            121600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 38, 32)            0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 126,993
Trainable params: 126,993
Non-trainable params: 0
________________________________________________

## 建立模型（LSTM）

In [4]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation,Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

In [5]:
# Start an empty linear stack of layers for the LSTM model.
model = Sequential()

In [6]:
# Embedding layer: turns each of the maxlen word indices into an
# embedding_dims-sized vector.
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dims,
        input_length=maxlen,
    )
)
# Drop 20% of the embedding activations during training.
model.add(Dropout(0.2))

In [7]:
# LSTM layer with 32 units; it consumes the embedded sequence.
model.add(LSTM(units=32))

In [8]:
# Fully connected hidden layer: 256 units with ReLU activation.
model.add(Dense(activation='relu', units=256))
# Drop 20% of the hidden activations during training.
model.add(Dropout(0.2))

In [9]:
# Output layer: a single sigmoid unit producing the review's probability score.
model.add(Dense(activation='sigmoid', units=1))

In [10]:
# Print the layer-by-layer summary of the LSTM model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 38, 32)            121600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 38, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 138,625
Trainable params: 138,625
Non-trainable params: 0
________________________________________________

## 建立模型（CNN）

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation
from keras.layers import Embedding 
from keras.layers import Conv1D,GlobalMaxPooling1D

In [28]:
# Use a Sequential (linear stack) model for the CNN variant.
model=Sequential()

In [29]:
# Word-embedding layer; an Embedding layer can only be the first layer of the network.
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dims,
        input_length=maxlen,
    )
)
# Drop 20% of the embedding activations during training.
model.add(Dropout(0.2))
# 1-D convolution over the token axis: `filters` feature maps, window of
# `kernel_size` tokens, stride 1, no padding.
model.add(
    Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        strides=1,
        padding="valid",
        activation='relu',
    )
)
# Global max pooling keeps the strongest response of every filter.
model.add(GlobalMaxPooling1D())

NameError: name 'Conv1D' is not defined

In [None]:
# Fully connected hidden layer with hidden_dims units, followed by 20% dropout
# and a separate ReLU activation layer.
model.add(Dense(units=hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# Output layer: one unit followed by a sigmoid activation layer.
model.add(Dense(units=1))
model.add(Activation("sigmoid"))

In [30]:
# Print the layer-by-layer summary of the CNN model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 38, 32)            121600    
_________________________________________________________________
dropout_5 (Dropout)          (None, 38, 32)            0         
Total params: 121,600
Trainable params: 121,600
Non-trainable params: 0
_________________________________________________________________


## 训练模型

In [11]:
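# Configure the model for training.
# loss: the loss function; binary cross-entropy is the standard choice for a two-class sigmoid output
# optimizer: Adam, an adaptive-learning-rate optimizer that usually converges quickly
# metrics: how the model is evaluated, e.g. accuracy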

In [12]:
# Adam: keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# lr: float >= 0, the learning rate
# beta_1/beta_2: floats with 0 < beta < 1, usually very close to 1
# epsilon: small float >= 0, guards against division by zero

In [13]:
# Binary cross-entropy loss, Adam optimizer, accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# 开始训练
# epoch：训练周期
# batch_size：每一批次的数据
# verbose显示训练过程

In [15]:
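# Because validation_split=0.2 is passed to fit(), Keras holds out the last 20% of the training samples for validation before training starts: 20k samples are used for training and 5k for validation.
# After each epoch Keras computes that epoch's loss and accuracy on both sets and appends them to train_history, so there are two pairs of metrics: loss/accuracy (training set) and val_loss/val_accuracy (validation set).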

In [16]:
# validation_split：（0~1之间的浮点数）指定训练集的一定比例数据作为验证集。验证集不参与训练，并在每个epoch结束后测试模型的指标。上条已说。
# validation_data：形式为（x,y），是指定的验证集。此参数将覆盖validation_split

In [17]:
# Train for `epochs` epochs in mini-batches of `batch_size`, holding out 20% of
# the training data for validation; verbose=2 logs one line per epoch.
# NOTE(review): in the recorded run val_loss rises after epoch 2 while the
# training loss keeps falling, so fewer epochs or early stopping may be worth trying.
train_history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    validation_split=0.2,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 5s - loss: 0.5319 - accuracy: 0.7168 - val_loss: 0.4584 - val_accuracy: 0.7756
Epoch 2/10
 - 4s - loss: 0.3987 - accuracy: 0.8224 - val_loss: 0.4486 - val_accuracy: 0.7794
Epoch 3/10
 - 4s - loss: 0.3602 - accuracy: 0.8420 - val_loss: 0.4585 - val_accuracy: 0.7862
Epoch 4/10
 - 4s - loss: 0.3240 - accuracy: 0.8629 - val_loss: 0.4690 - val_accuracy: 0.7834
Epoch 5/10
 - 4s - loss: 0.2943 - accuracy: 0.8749 - val_loss: 0.5039 - val_accuracy: 0.7748
Epoch 6/10
 - 4s - loss: 0.2685 - accuracy: 0.8849 - val_loss: 0.5471 - val_accuracy: 0.7700
Epoch 7/10
 - 4s - loss: 0.2429 - accuracy: 0.8967 - val_loss: 0.6272 - val_accuracy: 0.7706
Epoch 8/10
 - 4s - loss: 0.2189 - accuracy: 0.9059 - val_loss: 0.6002 - val_accuracy: 0.7662
Epoch 9/10
 - 4s - loss: 0.2023 - accuracy: 0.9140 - val_loss: 0.6383 - val_accuracy: 0.7624
Epoch 10/10
 - 4s - loss: 0.1769 - accuracy: 0.9241 - val_loss: 0.8672 - val_accuracy: 0.7642


In [18]:
# 评估模型准确率

In [19]:
# Evaluate on the held-out test set; `scores` is [loss, accuracy] because the
# model was compiled with metrics=['accuracy'].
scores = model.evaluate(x=x_test, y=y_test, verbose=1)
# Display the test accuracy.
scores[1]



0.7692000269889832

## 进行预测

In [None]:
# `Sequential.predict_classes` is deprecated and has been removed from recent
# Keras/TensorFlow releases. For a model with a single sigmoid output unit it is
# equivalent to thresholding the predicted probability at 0.5, which works on
# both old and new versions. The result is a 2-D int32 array of 0/1 labels.
predict = (model.predict(x_test) > 0.5).astype('int32')

In [None]:
# 以上结果是二维数组，reshape可以将其转化为一维数组predict_classes

In [None]:
# Collapse the 2-D prediction array into a 1-D vector of class labels.
predict_classes = predict.reshape((-1,))
# Show the first ten predicted labels.
predict_classes[:10]

## 预测概率(画ROC曲线时有用)

In [None]:
# Predicted scores for the test set (the sigmoid output of the model); these
# feed the ROC computation below.
# NOTE(review): the name is a misspelling of "probability"; it is kept because
# the ROC cell reads it.
probility=model.predict(x_test)

## 画ROC曲线

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random
import numpy as np

In [None]:
# Ground-truth test labels as a NumPy array for the ROC computation.
y = np.array(y_test)

In [None]:
# ROC curve points (one per decision threshold) from the true labels and the
# predicted scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_true=y,
    y_score=probility,
)
# Area under the ROC curve.
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)

In [None]:
# Draw the ROC curve in blue, with the AUC shown in the legend.
plt.title('Receiver Operating Characteristic')
plt.plot(
    false_positive_rate,
    true_positive_rate,
    'b',
    label='AUC = %0.2f' % roc_auc,
)
plt.legend(loc='lower right')
# Dashed red diagonal: the chance-level reference line.
plt.plot([0, 1], [0, 1], 'r--')
# Label the axes and clamp both to the unit interval.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()