In [1]:
import preprocessing as ps
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

Using TensorFlow backend.


In [2]:
# Read the (titles, labels) pair of each split, in train / test / val order.
(titles_train, labels_train), (titles_test, labels_test), (titles_val, labels_val) = (
    ps.read_data(split_path)
    for split_path in ('../train.txt', '../test.txt', '../val.txt')
)

In [3]:
# Word vocabulary (and both lookup directions) built over the titles of all
# three splits.
words, word_to_id, id_to_word = ps.get_words(titles_train + titles_test + titles_val)

# Class set (and both lookup directions) derived from the TRAINING labels.
# The model is fitted on the training split, so every class occurring there
# must have an id; deriving the set from the validation labels alone would
# leave out any class that happens to be absent from that smaller split.
# NOTE(review): presumably ps.get_classes only assigns ids to the labels it is
# given -- confirm against the preprocessing module.
class_set, cls_to_id, id_to_cls = ps.get_classes(labels_train)

In [4]:
# Encode every split with the shared word / class lookups. The class count
# (len(class_set)) is handed to ps.tokenize for each split.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = (
    ps.tokenize(split_titles, split_labels, word_to_id, cls_to_id, len(class_set))
    for split_titles, split_labels in (
        (titles_train, labels_train),
        (titles_test, labels_test),
        (titles_val, labels_val),
    )
)

In [5]:
# --- Input / feature settings -------------------------------------------
max_features = len(words)  # initial feature count: size of the word vocabulary
ngram_range = 2            # n-gram order handed to the ps n-gram helpers
maxlen = 40                # sequence length used for padding and by the Embedding layer
embedding_dims = 50        # size of each embedding vector

# --- Training settings --------------------------------------------------
epochs = 5
batch_size = 32

In [6]:
# Build the n-gram token index from the training sequences only; the helper
# also returns the updated feature count, which replaces max_features.
# NOTE(review): this cell rebinds max_features and X_train / X_test / X_val,
# so re-running it without re-running the earlier cells feeds already
# processed values back into the helpers.
token_indice, max_features = ps.build_ngram_tokens(X_train, max_features, ngram_range)

# Apply the same n-gram index and padding length to every split.
X_train, X_test, X_val = (
    ps.pad_ngram_data(sequences, token_indice, maxlen, ngram_range)
    for sequences in (X_train, X_test, X_val)
)

Adding 2-gram features


In [7]:
# Convert the label containers of all three splits to NumPy arrays.
y_train, y_test, y_val = map(np.array, (y_train, y_test, y_val))

In [8]:
# Build the model: embedding -> average over the sequence -> softmax classifier.
print('Build model...')
model = Sequential()

# Start with an embedding layer that maps each of the max_features
# vocabulary / n-gram indices to an embedding_dims-dimensional vector.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# GlobalAveragePooling1D averages the embeddings over the sequence axis,
# leaving a single embedding_dims-sized vector per input sequence.
model.add(GlobalAveragePooling1D())

# Project onto one output unit per class and normalise with softmax.
# The width comes from class_set -- the same count that was passed to
# ps.tokenize when the labels were encoded -- rather than a hard-coded 10,
# so the output layer cannot drift out of sync with the label encoding.
model.add(Dense(len(class_set), activation='softmax'))


# Multi-class setup: softmax output paired with categorical cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()  # print the layer-by-layer overview

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            11429500  
_________________________________________________________________
global_average_pooling1d_1 ( (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 11,430,010.0
Trainable params: 11,430,010.0
Non-trainable params: 0.0
_________________________________________________________________


In [9]:
# Train on the training split and report metrics on the validation split
# after every epoch (verbose=2 prints one line per epoch).
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

Train on 50000 samples, validate on 5000 samples
Epoch 1/5
29s - loss: 1.3643 - acc: 0.6955 - val_loss: 0.7633 - val_acc: 0.7882
Epoch 2/5
27s - loss: 0.4638 - acc: 0.8795 - val_loss: 0.5636 - val_acc: 0.8354
Epoch 3/5
27s - loss: 0.2436 - acc: 0.9380 - val_loss: 0.5054 - val_acc: 0.8462
Epoch 4/5
28s - loss: 0.1325 - acc: 0.9688 - val_loss: 0.4991 - val_acc: 0.8514
Epoch 5/5
27s - loss: 0.0717 - acc: 0.9849 - val_loss: 0.5058 - val_acc: 0.8530


<keras.callbacks.History at 0x23a2d2d6fd0>

In [11]:
# Final check on the held-out test split. With the compile() settings used
# for this model, evaluate() yields the loss followed by the accuracy metric.
score, acc = model.evaluate(
    X_test,
    y_test,
    verbose=2,
    batch_size=batch_size,
)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.512693502425
Test accuracy: 0.8461
