In [1]:
import numpy as np
from collections import Counter

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

ModuleNotFoundError: No module named 'keras'

In [2]:
def read_titles(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        labels = []
        titles = []
        for line in f:
            labels.append(line.split('\t')[0])
            titles.append(line.split('\t')[1])
    return titles, labels

In [3]:
titles_train, labels_train= read_titles('../train.txt')
titles_val, labels_val= read_titles('../val.txt')
titles_test, labels_test= read_titles('../test.txt')

In [4]:
text = ""
for line in titles_train + titles_val + titles_test:
    text += line
text = text.replace('\u3000', '').replace('\t', '').replace('\n', '')

counter = Counter(list(text))
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

In [5]:
vocab = words
word_to_index = dict((c, i) for i, c in enumerate(vocab))
index_to_word = dict((i, c) for i, c in enumerate(vocab))

In [6]:
labels_set = set(labels_val)
label_to_index = dict((c, i) for i, c in enumerate(labels_set))
index_to_label = dict((i, c) for i, c in enumerate(labels_set))

In [7]:
def vectorization(titles, labels):
    X = []
    Y = np.zeros((len(titles), len(labels_set)))
    for i in range(len(titles)):
        X.append([word_to_index[x] for x in titles[i] if x in word_to_index])
        Y[i][label_to_index[labels[i]]] = 1
    return X, Y

In [8]:
X_train, Y_train = vectorization(titles_train, labels_train)
X_test, Y_test = vectorization(titles_test, labels_test)
X_val, Y_val = vectorization(titles_val, labels_val)
maxlen = max(map(len, X_train + X_test + X_val))

In [9]:
# 构建 ngram 数据集
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    从一个整数列表中提取  n-gram 集合。
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.
    增广输入列表中的每个序列，添加 n-gram 值
    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

In [10]:
ngram_range = 2
max_features = len(vocab)
batch_size = 32
embedding_dims = 50
nb_epoch = 5

In [11]:
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in X_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer. 将 ngram token 映射到独立整数的词典
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    # 整数大小比 max_features 要大，按顺序排列，以避免与已存在的特征冲突
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    # max_features 是可以在数据集中找到的最大的整数
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting X_train and X_test with n-grams features
    # 使用 n-gram 特征增广 X_train 和 X_test
    X_train = add_ngram(X_train, token_indice, ngram_range)
    X_test = add_ngram(X_test, token_indice, ngram_range)
    X_val = add_ngram(X_val, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, X_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, X_test)), dtype=int)))
    print('Average val sequence length: {}'.format(
        np.mean(list(map(len, X_val)), dtype=int)))

Adding 2-gram features
Average train sequence length: 35
Average test sequence length: 34
Average val sequence length: 34


In [12]:
# 填充序列至固定长度
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
X_val = sequence.pad_sequences(X_val, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('X_val shape:', X_val.shape)

Pad sequences (samples x time)
X_train shape: (50000, 40)
X_test shape: (10000, 40)
X_val shape: (5000, 40)


In [13]:
# 构建模型
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
# 先从一个高效的嵌入层开始，它将词汇表索引映射到 embedding_dim 维度的向量上
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
# 添加一个 GlobalAveragePooling1D 层，它将平均整个序列的词嵌入
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
# 投影到一个单神经元输出层，然后使用 sigmoid 挤压。
model.add(Dense(10, activation='softmax'))

model.summary()  # 概述

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            11429500  
_________________________________________________________________
global_average_pooling1d_1 ( (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 11,430,010.0
Trainable params: 11,430,010.0
Non-trainable params: 0.0
_________________________________________________________________


In [14]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# 训练与验证
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=nb_epoch,
          validation_data=(X_val, Y_val), verbose=2)

Train on 50000 samples, validate on 5000 samples
Epoch 1/5
28s - loss: 1.3702 - acc: 0.6802 - val_loss: 0.7653 - val_acc: 0.7856
Epoch 2/5
26s - loss: 0.4647 - acc: 0.8801 - val_loss: 0.5593 - val_acc: 0.8342
Epoch 3/5
25s - loss: 0.2443 - acc: 0.9373 - val_loss: 0.5113 - val_acc: 0.8484
Epoch 4/5
24s - loss: 0.1334 - acc: 0.9688 - val_loss: 0.4906 - val_acc: 0.8516
Epoch 5/5
24s - loss: 0.0724 - acc: 0.9844 - val_loss: 0.5023 - val_acc: 0.8520


<keras.callbacks.History at 0x25d34250eb8>

In [16]:
score, acc = model.evaluate(X_test, Y_test, batch_size=64, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.513821732652
Test accuracy: 0.8472
