# LSTM Bag of Words Investigation

## Load imports.

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from gensim.models import KeyedVectors
import numpy as np

from loader import load_preprocessed_data
from word_embeddings import DocToIntSequenceConverter
from lstm import LstmPredictor
from lookup_tables import int_to_topic_code, topic_code_to_topic_dict, topic_code_to_int
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.


## Load the data

In [2]:
# Load the preprocessed articles (x) and their topic labels (y) as numpy
# arrays so they can be sliced and boolean-indexed later on.
x, y = load_preprocessed_data('data/rcv1_lemmatized.csv')
x = np.array(x)
y = np.array(y)

# Split boundaries: the first 60% of the data is the training pool, the next
# 20% is the validation set and the final 20% is the test set.
total_examples = len(y)

split_point_1 = int(total_examples * 0.6)
split_point_2 = int(total_examples * 0.8)

# Only a fixed-size prefix of the training pool is actually used for training
# (presumably to keep training on bag-of-words vectors tractable -- TODO confirm).
# The cap is clamped to split_point_1 so that, on a small dataset, the training
# slice can never run into the validation slice and leak validation examples.
max_train_examples = 6000
train_size = min(max_train_examples, split_point_1)

train_x = x[:train_size]
train_y = y[:train_size]

val_x = x[split_point_1:split_point_2]
val_y = y[split_point_1:split_point_2]

test_x = x[split_point_2:]
test_y = y[split_point_2:]

print('Number of training examples: {}'.format(len(train_x)))
print('Number of validation examples: {}'.format(len(val_x)))
print('Number of test examples: {}'.format(len(test_x)))

Number of training examples: 6000
Number of validation examples: 18122
Number of test examples: 18122


## Pre-process the data

In [3]:
# Word count (whitespace-separated tokens) of every training article
article_lengths = np.array([len(doc.split()) for doc in train_x])

length_mean = np.mean(article_lengths)
length_std = np.std(article_lengths)

print('Minimum length of article in words: {}'.format(np.min(article_lengths)))
print('Maximum length of article in words: {}'.format(np.max(article_lengths)))
print('Mean length of article in words: {:.4f}'.format(length_mean))
print('St dev of length of article in words: {:.4f}'.format(length_std))

# Cap the sequence length at mean + 3 standard deviations. That would cover
# ~99.7% of articles if lengths were normally distributed; they need not be,
# so the actual share of over-long articles is measured and printed below.
max_sequence_length = int(length_mean + length_std * 3)

# Confirm not many articles exceed this limit (reuses the lengths computed above)
articles_exceeding_limit = [
    doc
    for doc, n_words in zip(train_x, article_lengths)
    if n_words > max_sequence_length
]
percentage_articles_exceeding_limit = (len(articles_exceeding_limit) / len(train_x)) * 100
print('Percentage of articles exceeding max sequence length limit: {:.4f}%'.format(
    percentage_articles_exceeding_limit))

Minimum length of article in words: 6
Maximum length of article in words: 1697
Mean length of article in words: 165.0182
St dev of length of article in words: 112.3258
Percentage of articles exceeding max sequence length limit: 0.6167%


In [4]:
# Build the converter from the training articles only, then use that same
# converter to turn every split into integer sequences representing the words.
article_to_int_seq_converter = DocToIntSequenceConverter(train_x, max_sequence_length)

train_x_seq, val_x_seq, test_x_seq = (
    article_to_int_seq_converter.convert_to_integer_sequences(docs)
    for docs in (train_x, val_x, test_x)
)

## Create the LSTM and train it.

In [None]:
# Create the word embedding model
class EmbeddingModel:
    """Bag-of-words stand-in for a word-embedding model.

    Wraps a binary, unigram, case-preserving ``CountVectorizer`` fitted on the
    training documents, and exposes ``get_vector`` / ``get_embedding_dim`` so
    it can be handed to ``LstmPredictor`` as the embedding model.
    """

    def __init__(self, train_x):
        """Fit the vectoriser's vocabulary on the training documents ``train_x``."""
        bag_of_words = CountVectorizer(lowercase=False, binary=True, ngram_range=(1, 1))
        bag_of_words.fit(train_x)
        self.vectoriser = bag_of_words

    def get_vector(self, word):
        """Return the dense 1-D bag-of-words vector for ``word``."""
        encoded = self.vectoriser.transform([word])
        # transform() yields a 1 x vocabulary-size sparse matrix; take its only row.
        return encoded.toarray()[0]

    def get_embedding_dim(self):
        """Return the length of the vectors produced by ``get_vector``."""
        # Vectorising an empty document is enough to read off the column count.
        _, n_features = self.vectoriser.transform(['']).shape
        return n_features


embedding_model = EmbeddingModel(train_x)
word_embedding_dim = embedding_model.get_embedding_dim()

In [None]:
# One output class per known topic code
num_topics = len(int_to_topic_code)

# Build the LSTM from the word index, the bag-of-words embedding settings and
# the number of topics, then fit it using the validation split for monitoring.
lstm = LstmPredictor(
    article_to_int_seq_converter.get_word_index(),
    word_embedding_dim,
    max_sequence_length,
    embedding_model,
    num_topics,
)
lstm.train(train_x_seq, train_y, val_x_seq, val_y)

Train on 6000 samples, validate on 18122 samples
Epoch 1/20

## Make predictions and report classification accuracy.

In [None]:
# Re-initialise the LSTM, will use weights from the previous training run.
lstm = LstmPredictor(article_to_int_seq_converter.get_word_index(),
                     word_embedding_dim,
                     max_sequence_length,
                     embedding_model,
                     len(int_to_topic_code),
                     use_saved_weights=True)
test_y_predict = lstm.predict(test_x_seq)

# Build the label ids and their display names explicitly, in the same order.
# This avoids relying on topic_code_to_topic_dict happening to iterate in
# label-index order, passes a real list rather than a dict view, and keeps the
# report well-formed even if some topic never appears in the test data.
label_ids = sorted(int_to_topic_code)
label_names = [topic_code_to_topic_dict[int_to_topic_code[label_id]] for label_id in label_ids]

print(classification_report(test_y,
                            test_y_predict,
                            labels=label_ids,
                            target_names=label_names,
                            digits=6))

## Find examples where predictions went wrong

In [None]:
# For every topic, print a few randomly chosen test articles of that topic
# that the model assigned to a different topic.
for topic_code, topic_index in topic_code_to_int.items():
    # Predictions and documents whose true label is this topic
    is_topic = test_y == topic_index
    topic_subset = test_y_predict[is_topic]

    # Of those, keep only the ones that were predicted as some other topic
    is_wrong = topic_subset != topic_index
    topic_subset_incorrect = topic_subset[is_wrong]
    document_subset = test_x[is_topic][is_wrong]

    print('------ 5 random erroneous predictions for {} ------'.format(topic_code_to_topic_dict[topic_code]))
    print('')

    num_incorrect = len(topic_subset_incorrect)
    if num_incorrect == 0:
        # np.random.choice raises on an empty population, so report and move on.
        print('No erroneous predictions for this topic.')
        print('')
    else:
        # Sample without replacement so the same article is never shown twice,
        # and never ask for more examples than there are errors.
        random_indices = np.random.choice(num_incorrect, min(5, num_incorrect), replace=False)
        for sample_index in random_indices:
            print(document_subset[sample_index])
            print('')
            predicted_topic = topic_code_to_topic_dict[int_to_topic_code[topic_subset_incorrect[sample_index]]]
            print('Above classified as {}'.format(predicted_topic))
            print('')
    print('')