# LSTM

## Imports

In [4]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from gensim.models import KeyedVectors
import numpy as np

from loader import load_preprocessed_data
from word_embeddings import DocToIntSequenceConverter
from lstm_common import calculate_max_word_length
from lstm_word import LstmPredictor
from lookup_tables import int_to_topic_code, topic_code_to_topic_dict, topic_code_to_int
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

## Load the data

In [5]:
# Read the pre-processed articles and labels, materialising each as a NumPy array
# so that slicing and boolean masking work in the cells below.
x, y = (np.array(column) for column in load_preprocessed_data('data/rcv1_kb.csv'))

# Contiguous 60% / 20% / 20% split into train / validation / test.
# The boundaries are positions in the loaded order; no shuffling happens here.
n_examples = len(y)
train_end = int(n_examples * 0.6)
val_end = int(n_examples * 0.8)

train_x, val_x, test_x = x[:train_end], x[train_end:val_end], x[val_end:]
train_y, val_y, test_y = y[:train_end], y[train_end:val_end], y[val_end:]

print('Number of training examples: {}'.format(len(train_x)))

Number of training examples: 54366


## Pre-process the data

In [7]:
# Derive the sequence-length limit from the training articles only; the helper
# also prints the word-count statistics shown in this cell's output.
# NOTE(review): presumably the returned value is a word-count cap chosen from
# those statistics — confirm in lstm_common.calculate_max_word_length.
max_sequence_length = calculate_max_word_length(train_x)

Minimum length of article in words: 3
Maximum length of article in words: 2449
Mean length of article in words: 109.2608
St dev of length of article in words: 76.6736
Percentage of articles exceeding max sequence length limit: 0.5776%


In [8]:
# Turn every article into a sequence of integer word ids.
# NOTE(review): the converter is constructed from the full corpus `x`, not just
# `train_x` — presumably so the word index covers validation/test vocabulary too;
# confirm this is intended.
article_to_int_seq_converter = DocToIntSequenceConverter(x, max_sequence_length)

# Apply the same converter to each split, in train / validation / test order.
train_x_seq, val_x_seq, test_x_seq = (
    article_to_int_seq_converter.convert_to_integer_sequences(articles)
    for articles in (train_x, val_x, test_x)
)

## Create the LSTM and train it.

In [10]:
# Embedding width handed to the LSTM; the file name below indicates 300-dimensional vectors.
word_embedding_dim = 300

# Load the pre-trained word2vec vectors stored in binary word2vec format.
word2vec_model = KeyedVectors.load_word2vec_format(
    'embeddings/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [11]:
# Build the word-level LSTM classifier: word index, embedding width, sequence
# length limit, pre-trained vectors and the number of output classes.
lstm = LstmPredictor(article_to_int_seq_converter.get_word_index(),
                     word_embedding_dim,
                     max_sequence_length,
                     word2vec_model,
                     len(int_to_topic_code))

# 'balanced' weights counteract class imbalance in the training labels.
# `classes` and `y` are passed by keyword: recent scikit-learn releases make them
# keyword-only, and older releases accept the keywords as well.
class_weights = compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)

# Map class index -> weight. The i-th weight belongs to the i-th sorted unique
# label, so this assumes the training labels are exactly 0..n_classes-1.
class_weights_dict = dict(enumerate(class_weights))

# Pass the mapping, not the raw array: the dict was previously built but never
# used. NOTE(review): assumes LstmPredictor.train forwards `class_weight` to a
# Keras-style fit(), which expects a {class_index: weight} dict — confirm.
lstm.train(train_x_seq, train_y, val_x_seq, val_y, class_weight=class_weights_dict)

Train on 54366 samples, validate on 18122 samples
Epoch 1/100
 2528/54366 [>.............................] - ETA: 24:53 - loss: 0.9057 - acc: 0.6919

KeyboardInterrupt: 

## Make predictions and report classification accuracy.

In [None]:
# Re-initialise the LSTM, will use weights from the previous training run.
lstm = LstmPredictor(article_to_int_seq_converter.get_word_index(),
                     word_embedding_dim,
                     max_sequence_length,
                     word2vec_model,
                     len(int_to_topic_code),
                     use_saved_weights=True)
test_y_predict = lstm.predict(test_x_seq)

# Build the report rows explicitly in integer-label order so each topic name is
# attached to the right class; relying on the iteration order of
# topic_code_to_topic_dict.values() could silently mislabel rows.
report_labels = sorted(int_to_topic_code)
report_names = [topic_code_to_topic_dict[int_to_topic_code[label]] for label in report_labels]
print(classification_report(test_y, test_y_predict,
                            labels=report_labels,
                            target_names=report_names,
                            digits=6))

## Find examples where predictions went wrong

In [None]:
# For each topic, print a few test articles of that topic that the model put in
# a different class, together with the class it chose.
for topic_code, topic_index in topic_code_to_int.items():
    # Predictions for the test articles whose true label is this topic.
    is_topic = test_y == topic_index
    topic_predictions = test_y_predict[is_topic]

    # Keep only the misclassified ones; the same mask selects the matching documents.
    is_wrong = topic_predictions != topic_index
    wrong_predictions = topic_predictions[is_wrong]
    wrong_documents = test_x[is_topic][is_wrong]

    print('------ 5 random erroneous predictions for {} ------'.format(topic_code_to_topic_dict[topic_code]))
    print('')

    # np.random.choice raises on an empty population, so handle topics without
    # any errors explicitly instead of crashing the whole loop.
    n_wrong = len(wrong_predictions)
    if n_wrong == 0:
        print('No erroneous predictions for this topic.')
        print('')
        continue

    # Sample without replacement so the same article is never shown twice; when
    # fewer than 5 errors exist, show all of them.
    sample_indices = np.random.choice(n_wrong, min(5, n_wrong), replace=False)
    for sample_index in sample_indices:
        print(wrong_documents[sample_index])
        print('')
        print('Above classified as {}'.format(topic_code_to_topic_dict[int_to_topic_code[wrong_predictions[sample_index]]]))
        print('')
    print('')