In [17]:
from collections import Counter
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [18]:
def read_titles(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        labels = []
        titles = []
        for line in f:
            labels.append(line.split('\t')[0])
            titles.append(line.split('\t')[1])
    return titles, labels

In [19]:
titles_train, labels_train= read_titles('../train.txt')
titles_val, labels_val= read_titles('../val.txt')
titles_test, labels_test= read_titles('../test.txt')

In [20]:
print(len(titles_train), len(labels_train))
print(len(titles_val), len(labels_val))
print(len(titles_test), len(labels_test))

50000 50000
5000 5000
10000 10000


In [21]:
text = ""
for line in titles_train + titles_val + titles_test:
    text += line
text = text.replace('\u3000', '').replace('\t', '').replace('\n', '')

counter = Counter(list(text))
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

In [22]:
vocab = words
word_to_index = dict((c, i) for i, c in enumerate(vocab))
index_to_word = dict((i, c) for i, c in enumerate(vocab))

In [23]:
labels_set = set(labels_val)
label_to_index = dict((c, i) for i, c in enumerate(labels_set))
index_to_label = dict((i, c) for i, c in enumerate(labels_set))

In [24]:
def vectorization(titles, labels):
    X = []
    Y = np.zeros((len(titles), len(labels_set)))
    for i in range(len(titles)):
        X.append([word_to_index[x] for x in titles[i] if x in word_to_index])
        Y[i][label_to_index[labels[i]]] = 1
    return X, Y

In [25]:
X_train, Y_train = vectorization(titles_train, labels_train)
X_test, Y_test = vectorization(titles_test, labels_test)
X_val, Y_val = vectorization(titles_val, labels_val)
maxlen = max(map(len, X_train + X_test + X_val))

X_train = sequence.pad_sequences(X_train, maxlen);
X_test = sequence.pad_sequences(X_test, maxlen);
X_val = sequence.pad_sequences(X_val, maxlen);

In [26]:
def data_shuffle(X, Y):
    arr = np.arange(len(X))
    np.random.shuffle(arr)
    X = X[arr]
    Y = Y[arr]
    return X, Y

X_train, Y_train = data_shuffle(X_train, Y_train)
X_test, Y_test = data_shuffle(X_test, Y_test)
X_val, Y_val = data_shuffle(X_val, Y_val)

In [32]:
model = Sequential()
model.add(Embedding(len(vocab), 128))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(labels_set), activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 128)         568192    
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 128)         131584    
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
Total params: 832,650.0
Trainable params: 832,650
Non-trainable params: 0.0
_________________________________________________________________


In [33]:
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [34]:
model.fit(X_train, Y_train, batch_size=128, epochs=20, validation_data=(X_val, Y_val), verbose=2)

Train on 50000 samples, validate on 5000 samples
Epoch 1/20
38s - loss: 1.1160 - acc: 0.6285 - val_loss: 0.8474 - val_acc: 0.7290
Epoch 2/20
37s - loss: 0.6677 - acc: 0.7919 - val_loss: 0.7424 - val_acc: 0.7628
Epoch 3/20
37s - loss: 0.5310 - acc: 0.8340 - val_loss: 0.7139 - val_acc: 0.7694
Epoch 4/20
37s - loss: 0.4575 - acc: 0.8564 - val_loss: 0.7326 - val_acc: 0.7804
Epoch 5/20
37s - loss: 0.3993 - acc: 0.8746 - val_loss: 0.7005 - val_acc: 0.7854
Epoch 6/20
37s - loss: 0.3544 - acc: 0.8883 - val_loss: 0.7662 - val_acc: 0.7806
Epoch 7/20
37s - loss: 0.3167 - acc: 0.8995 - val_loss: 0.7432 - val_acc: 0.7892
Epoch 8/20
37s - loss: 0.2805 - acc: 0.9133 - val_loss: 0.7642 - val_acc: 0.7902
Epoch 9/20
37s - loss: 0.2515 - acc: 0.9213 - val_loss: 0.8155 - val_acc: 0.7910
Epoch 10/20
37s - loss: 0.2247 - acc: 0.9306 - val_loss: 0.8351 - val_acc: 0.7976
Epoch 11/20
37s - loss: 0.2014 - acc: 0.9373 - val_loss: 0.8351 - val_acc: 0.7974
Epoch 12/20
38s - loss: 0.1800 - acc: 0.9447 - val_loss: 0

<keras.callbacks.History at 0x2b3fa975fd0>

In [30]:
score, acc = model.evaluate(X_test, Y_test, batch_size=64, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 1.24192916412
Test accuracy: 0.7879
