In [21]:
from collections import Counter
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [2]:
def read_titles(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        labels = []
        titles = []
        for line in f:
            labels.append(line.split('\t')[0])
            titles.append(line.split('\t')[1])
    return titles, labels

In [3]:
titles_train, labels_train= read_titles('../train.txt')
titles_val, labels_val= read_titles('../val.txt')
titles_test, labels_test= read_titles('../test.txt')

In [4]:
print(len(titles_train), len(labels_train))
print(len(titles_val), len(labels_val))
print(len(titles_test), len(labels_test))

50000 50000
5000 5000
10000 10000


In [5]:
text = ""
for line in titles_train + titles_val + titles_test:
    text += line
text = text.replace('\u3000', '').replace('\t', '').replace('\n', '')

counter = Counter(list(text))
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

In [6]:
vocab = words
word_to_index = dict((c, i) for i, c in enumerate(vocab))
index_to_word = dict((i, c) for i, c in enumerate(vocab))

In [7]:
labels_set = set(labels_val)
label_to_index = dict((c, i) for i, c in enumerate(labels_set))
index_to_label = dict((i, c) for i, c in enumerate(labels_set))

In [28]:
def vectorization(titles, labels):
    X = []
    Y = np.zeros((len(titles), len(labels_set)))
    for i in range(len(titles)):
        X.append([word_to_index[x] for x in titles[i] if x in word_to_index])
        Y[i][label_to_index[labels[i]]] = 1
    return X, Y

In [29]:
X_train, Y_train = vectorization(titles_train, labels_train)
X_test, Y_test = vectorization(titles_test, labels_test)
X_val, Y_val = vectorization(titles_val, labels_val)
maxlen = max(map(len, X_train + X_test + X_val))

X_train = sequence.pad_sequences(X_train, maxlen);
X_test = sequence.pad_sequences(X_test, maxlen);
X_val = sequence.pad_sequences(X_val, maxlen);

In [30]:
model = Sequential()
model.add(Embedding(len(vocab), 128))
model.add(LSTM(128))
model.add(Dense(len(labels_set), activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         568192    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
Total params: 701,066.0
Trainable params: 701,066
Non-trainable params: 0.0
_________________________________________________________________


In [31]:
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, batch_size=64, epochs=20, validation_data=(X_test, Y_test), verbose=2)

Train on 50000 samples, validate on 10000 samples
Epoch 1/20