In [1]:
import preprocessing as ps
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

Using TensorFlow backend.


In [2]:
# Load the train / test / validation splits, in that order.
# Each ps.read_data call is unpacked into a (titles, labels) pair.
(titles_train, labels_train), (titles_test, labels_test), (titles_val, labels_val) = (
    ps.read_data(split_path)
    for split_path in ('../train.txt', '../test.txt', '../val.txt')
)

In [3]:
# Build the word <-> id lookup tables from the titles of all three splits.
# NOTE(review): this puts test/validation titles into the vocabulary -- confirm
# that is intended.
words, word_to_id, id_to_word = ps.get_words(titles_train + titles_test + titles_val)

# Build the class <-> id lookup tables from the *training* labels rather than
# the validation labels. The model is fit on the training split, so deriving
# the mapping from it means a class that happens to be absent from the smaller
# validation split is no longer silently dropped from cls_to_id / class_set.
class_set, cls_to_id, id_to_cls = ps.get_classes(labels_train)

In [4]:
# Run every split through ps.tokenize with the shared word / class lookups.
# The class count handed to the helper is len(class_set) for all three splits.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = (
    ps.tokenize(split_titles, split_labels, word_to_id, cls_to_id, len(class_set))
    for split_titles, split_labels in (
        (titles_train, labels_train),
        (titles_test, labels_test),
        (titles_val, labels_val),
    )
)

In [6]:
# --- Hyper-parameters -------------------------------------------------------

# Input representation
maxlen = 40                 # passed to ps.pad_ngram_data and used as the Embedding input_length
ngram_range = 2             # passed to the n-gram helpers in ps
max_features = len(words)   # starts as the word count; reassigned by ps.build_ngram_tokens

# Model / optimisation
embedding_dims = 50         # output dimension of the Embedding layer
epochs = 10                 # number of passes requested from model.fit
batch_size = 32             # used for both model.fit and model.evaluate

In [7]:
# Build the n-gram token index from the training split only; the helper also
# returns the updated feature count, which replaces max_features.
token_indice, max_features = ps.build_ngram_tokens(X_train, max_features, ngram_range)

# Apply the same n-gram index and padding length to every split.
# NOTE: this cell overwrites X_train / X_test / X_val and max_features with
# values derived from themselves, so it is not idempotent -- re-run the
# tokenize and hyper-parameter cells before executing it a second time.
X_train, X_test, X_val = (
    ps.pad_ngram_data(split, token_indice, maxlen, ngram_range)
    for split in (X_train, X_test, X_val)
)

Adding 2-gram features


In [8]:
# Convert the label containers of all three splits into NumPy arrays.
y_train, y_test, y_val = map(np.array, (y_train, y_test, y_val))

In [9]:
# Build the model
print('Build model...')
model = Sequential()

# Start with an embedding layer that maps each of the max_features
# vocabulary indices to an embedding_dims-dimensional vector.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# GlobalAveragePooling1D averages the embeddings of all tokens in the
# sequence, leaving one embedding_dims-sized vector per input.
model.add(GlobalAveragePooling1D())

# Project onto one output unit per class and normalise with softmax.
# The width comes from class_set (the same count that is passed to
# ps.tokenize) instead of a hard-coded 10, so the output layer keeps
# matching the labels if the class set changes.
model.add(Dense(len(class_set), activation='softmax'))

# Multi-class setup: softmax output paired with categorical cross-entropy.
# NOTE(review): assumes the y_* arrays are one-hot rows of width
# len(class_set) -- confirm against ps.tokenize.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()  # print the layer / parameter overview

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            11429500  
_________________________________________________________________
global_average_pooling1d_1 ( (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 11,430,010.0
Trainable params: 11,430,010.0
Non-trainable params: 0.0
_________________________________________________________________


In [10]:
# 训练与验证
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_val, y_val), verbose=2)

Train on 50000 samples, validate on 5000 samples
Epoch 1/10
29s - loss: 1.3605 - acc: 0.6837 - val_loss: 0.7641 - val_acc: 0.7914
Epoch 2/10
28s - loss: 0.4595 - acc: 0.8816 - val_loss: 0.5645 - val_acc: 0.8336
Epoch 3/10
27s - loss: 0.2401 - acc: 0.9387 - val_loss: 0.5085 - val_acc: 0.8482
Epoch 4/10
28s - loss: 0.1303 - acc: 0.9699 - val_loss: 0.4962 - val_acc: 0.8506
Epoch 5/10
28s - loss: 0.0701 - acc: 0.9853 - val_loss: 0.5108 - val_acc: 0.8518
Epoch 6/10
28s - loss: 0.0390 - acc: 0.9923 - val_loss: 0.5329 - val_acc: 0.8530
Epoch 7/10
28s - loss: 0.0234 - acc: 0.9950 - val_loss: 0.5558 - val_acc: 0.8530
Epoch 8/10
28s - loss: 0.0155 - acc: 0.9961 - val_loss: 0.5842 - val_acc: 0.8492
Epoch 9/10
27s - loss: 0.0117 - acc: 0.9966 - val_loss: 0.6154 - val_acc: 0.8492
Epoch 10/10
28s - loss: 0.0097 - acc: 0.9967 - val_loss: 0.6437 - val_acc: 0.8474


<keras.callbacks.History at 0x1a11dae6e48>

In [11]:
# Evaluate on the held-out test split. With metrics=['accuracy'] configured at
# compile time, evaluate() returns two values: the loss and the accuracy.
score, acc = model.evaluate(
    X_test,
    y_test,
    verbose=2,
    batch_size=batch_size,
)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.642158113983
Test accuracy: 0.8408
