# LSTM

## Imports

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from gensim.models import KeyedVectors
import numpy as np

from loader import load_preprocessed_data
from word_embeddings import DocToIntSequenceConverter
from lstm import LstmPredictor
from lookup_tables import int_to_topic_code, topic_code_to_topic_dict
from sklearn.metrics import classification_report

Using TensorFlow backend.


## Load the data

In [None]:
# Load the pre-processed articles and their labels, converting both to NumPy arrays
x, y = (np.array(column) for column in load_preprocessed_data('data/rcv1_no_stopwords.csv'))

# Positional split (no shuffling): first 60% train, next 20% validation, final 20% test
total_examples = len(y)
split_point_1, split_point_2 = (int(total_examples * fraction) for fraction in (0.6, 0.8))

train_x, val_x, test_x = x[:split_point_1], x[split_point_1:split_point_2], x[split_point_2:]
train_y, val_y, test_y = y[:split_point_1], y[split_point_1:split_point_2], y[split_point_2:]

print('Number of training examples: {}'.format(len(train_x)))

Number of training examples: 54366


## Pre-process the data

In [None]:
# Length of every training article in words (whitespace-separated tokens)
article_lengths = np.array([len(article.split()) for article in train_x])

print('Minimum length of article in words: {}'.format(np.min(article_lengths)))
print('Maximum length of article in words: {}'.format(np.max(article_lengths)))
print('Mean length of article in words: {:.4f}'.format(np.mean(article_lengths)))
print('St dev of length of article in words: {:.4f}'.format(np.std(article_lengths)))

# Set the max sequence length to the mean plus 3 standard deviations.
# NOTE: the "99.7%" rule of thumb only holds for normally distributed lengths,
# so the share of articles actually exceeding the limit is measured below
# instead of being assumed.
max_sequence_length = int(np.mean(article_lengths) + np.std(article_lengths)*3)

# Confirm not many articles exceed this limit. The word counts computed above
# are reused rather than splitting every article a second time.
articles_exceeding_limit = [article
                            for article, length in zip(train_x, article_lengths)
                            if length > max_sequence_length]
percentage_articles_exceeding_limit = (len(articles_exceeding_limit)/len(train_x))*100
print('Percentage of articles exceeding max sequence length limit: {:.4f}%'.format(percentage_articles_exceeding_limit))

Minimum length of article in words: 3
Maximum length of article in words: 2392
Mean length of article in words: 109.1119
St dev of length of article in words: 76.5720
Percentage of articles exceeding max sequence length limit: 0.5831%


In [None]:
# Build the converter from the training articles and the chosen maximum
# sequence length, then turn each split into integer sequences with it.
article_to_int_seq_converter = DocToIntSequenceConverter(train_x, max_sequence_length)
train_x_seq, val_x_seq, test_x_seq = (
    article_to_int_seq_converter.convert_to_integer_sequences(articles)
    for articles in (train_x, val_x, test_x)
)

## Create the LSTM and train it.

In [None]:
# Location of the pre-trained word2vec vectors (stored under the sibling
# tweet-classification project)
word2vec_path = '../tweet-classification/embeddings/GoogleNews-vectors-negative300.bin.gz'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Embedding size; presumably matches the 300 in the vector file's name -- confirm
word_embedding_dim = 300

In [None]:
# Gather the constructor inputs: the converter's word index and the number of
# entries in the int -> topic code lookup table.
word_index = article_to_int_seq_converter.get_word_index()
num_topics = len(int_to_topic_code.values())

# Build the predictor and train it, passing the validation split alongside
# the training split.
lstm = LstmPredictor(word_index, word_embedding_dim, max_sequence_length, word2vec_model, num_topics)
lstm.train(train_x_seq, train_y, val_x_seq, val_y)

Train on 54366 samples, validate on 18122 samples
Epoch 1/20
Epoch 2/20

## Make predictions and report classification accuracy.

In [None]:
# Build a fresh predictor with use_saved_weights=True so that it uses the
# weights from the previous training run instead of training again.
word_index = article_to_int_seq_converter.get_word_index()
num_topics = len(int_to_topic_code.values())
lstm = LstmPredictor(word_index,
                     word_embedding_dim,
                     max_sequence_length,
                     word2vec_model,
                     num_topics,
                     use_saved_weights=True)

# Predict on the held-out test split and print the per-topic metrics
test_y_predict = lstm.predict(test_x_seq)
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, test_y_predict, digits=6, target_names=topic_names)
print(report)