In [1]:
from keras.layers import Input, Dense, LSTM, Embedding, Bidirectional
import numpy as np
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

In [2]:
def _read_raw_split(path):
    """Read one raw 20news split file into a list of 3-tuples.

    Every line of the file is split on the '<fff>' separator and its first
    three fields are kept as a tuple. Later cells unpack these tuples as
    (label, doc_id, text).

    Args:
        path: Path of the raw text file to read.

    Returns:
        A list with one (field0, field1, field2) tuple of strings per line.

    Raises:
        IndexError: If a line contains fewer than three '<fff>'-separated fields.
    """
    records = []
    with open(path) as f:
        for line in f.read().splitlines():
            # Split once per line instead of once per extracted field.
            fields = line.split('<fff>')
            records.append((fields[0], fields[1], fields[2]))
    return records


contents_train = _read_raw_split('./datasets/20news_train_raw.txt')
contents_test = _read_raw_split('./datasets/20news_test_raw.txt')

# The vocabulary file holds one word per line; line order defines the word ids.
with open('./datasets/vocab_raw.txt') as f:
    vocab = f.read().splitlines()

In [3]:
# Word -> integer id, numbered from 1 in vocabulary order (id 0 is never assigned here).
word2id = {word: position for position, word in enumerate(vocab, start=1)}
# Reverse lookup: integer id -> word.
id2word = dict((index, word) for word, index in word2id.items())

In [4]:
def get_pad_data(contents, sent_length=None, unknown_id=1):
    """Encode documents as fixed-length sequences of word ids.

    Each document's text is split on whitespace, truncated to ``sent_length``
    tokens, mapped to ids through the module-level ``word2id`` dictionary and
    then padded at the end (``padding='post'``) to exactly ``sent_length``
    entries with ``pad_sequences``.

    Args:
        contents: Iterable of (label, doc_id, text) tuples of strings.
        sent_length: Number of tokens kept per document. Defaults to the
            module-level ``SENT_LENGTH`` when omitted.
        unknown_id: Id used for tokens that are missing from ``word2id``.
            NOTE(review): the default of 1 equals the id ``word2id`` gives to
            the first vocabulary entry; presumably that entry is an
            unknown-word placeholder -- confirm against the vocab file.

    Returns:
        A ``(pad_data, data_labels)`` pair: the padded id matrix produced by
        ``pad_sequences`` and a NumPy array with ``int(label)`` per document.
    """
    if sent_length is None:
        # Resolved at call time so the constant may be defined after this function.
        sent_length = SENT_LENGTH

    encoded_data = []
    labels = []
    for label, _doc_id, text in contents:
        # Tokenise once and truncate before the dictionary lookup.
        tokens = text.split()[:sent_length]
        encoded_data.append([word2id.get(token, unknown_id) for token in tokens])
        labels.append(int(label))

    pad_data = pad_sequences(encoded_data, maxlen=sent_length, padding='post')
    data_labels = np.array(labels)
    return pad_data, data_labels

# Number of tokens kept per document; also used as the model's input length.
SENT_LENGTH = 500

# Encode and pad both splits with the same settings.
train_padded_data, train_data_labels = get_pad_data(contents_train)
test_padded_data, test_data_labels = get_pad_data(contents_test)

In [5]:
def _parse_encoded_lines(lines):
    """Turn pre-encoded dataset lines into model-ready NumPy arrays.

    Each line is split on '<fff>'. Field 0 is parsed as the integer label and
    field 3 as a whitespace-separated list of integer word ids.
    NOTE(review): fields 1 and 2 are ignored here; presumably they hold the
    document id and the sentence length -- confirm against the file format.

    Args:
        lines: Iterable of text lines from an encoded split file.

    Returns:
        A ``(padded_data, labels)`` pair of NumPy arrays. ``padded_data`` is
        two-dimensional only if every line carries the same number of ids.

    Raises:
        IndexError: If a line has fewer than four '<fff>'-separated fields.
        ValueError: If a label or word id is not an integer.
    """
    sequences = []
    labels = []
    for line in lines:
        # Split once per line and reuse the fields for both outputs.
        fields = line.split('<fff>')
        sequences.append([int(ind) for ind in fields[3].split()])
        labels.append(int(fields[0]))
    # Keras expects array inputs; nested Python lists can be read as a
    # multi-input structure instead of a single batch.
    return np.array(sequences), np.array(labels)


with open('./datasets/vocab_raw.txt') as f:
    vocab = f.read().splitlines()

with open('./datasets/20news_train_encoded.txt') as f:
    train_data = f.read().splitlines()
train_padded_data, train_data_labels = _parse_encoded_lines(train_data)

with open('./datasets/20news_test_encoded.txt') as f:
    test_data = f.read().splitlines()
test_padded_data, test_data_labels = _parse_encoded_lines(test_data)

In [6]:
# Hyperparameters of the bidirectional LSTM classifier.
vocab_size = len(vocab)
embedding_size = 300
LSTM_size = 50
batch_size = 32
NUM_CLASSES = 20

model = Sequential()
# Each sample is a sequence of SENT_LENGTH integer word ids.
model.add(Input(shape=(SENT_LENGTH,)))
# vocab_size + 1 rows: one per vocabulary word plus one extra id.
# NOTE(review): presumably the extra row is for id 0 used as padding -- confirm.
# `input_length` is omitted: the Input layer already fixes the sequence length,
# and the argument is deprecated in current Keras.
model.add(Embedding(vocab_size + 1, embedding_size))
model.add(Bidirectional(LSTM(LSTM_size)))
model.add(Dense(NUM_CLASSES, activation='softmax'))
# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# np.asarray is a no-op for arrays and converts nested lists, so fit() always
# receives one array per argument.
# NOTE(review): the test split is used as validation data, so the reported
# validation metrics are not an independent held-out estimate.
model.fit(np.asarray(train_padded_data), np.asarray(train_data_labels),
          epochs=10, batch_size=batch_size,
          validation_data=(np.asarray(test_padded_data), np.asarray(test_data_labels)),
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a1dd9b4820>