# LSTM

## Load imports.

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from gensim.models import KeyedVectors
import numpy as np

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize, lowercase_all_capital_words
from function_executor import apply_fn_to_list_items_in_dict
from conversion import convert_dictionary_to_array
from word_embeddings import DocToIntSequenceConverter
from lstm import LstmPredictor
from sklearn.metrics import classification_report

Using TensorFlow backend.


## Useful lookup tables.

In [2]:
# Integer class label for every topic code; labels are handed out in the
# order the topics are listed below (0 for the first, 1 for the next, ...).
topic_code_to_int = {
    topic_code: class_label
    for class_label, topic_code in enumerate((
        'brexit',
        'gaza',
        'fake_news',
        'hurricane_harvey',
        'winter_olympics',
        'climate_change',
    ))
}

## Load the data.

In [3]:
# Load the tweets and the topic -> hashtags lookup.
# NOTE(review): 1000 is presumably a per-topic tweet limit -- confirm in load_data.
tweets_keyed_by_topic = load_data(1000, 'data/')
topic_hashtags_dict = create_topic_hashtags_dict('hashtag_list/')

# Cleaning pipeline: the result of each stage is rebound to the same name.
tweets_keyed_by_topic_cleaned = cleanup_tweets(
    tweets_keyed_by_topic,
    topic_hashtags_dict,
)

# Stop-word stage, called with lowercasing and lemmatisation switched off.
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    remove_stop_words_and_lemmatize,
    lowercase=False,
    lemmatize=False,
)

# Final stage: apply lowercase_all_capital_words to every tweet.
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    lowercase_all_capital_words,
)

In [4]:
# Fix NumPy's global seed so NumPy-based randomness below is repeatable.
# NOTE(review): the slices further down are contiguous, so a random split
# relies on convert_dictionary_to_array shuffling the examples -- confirm.
np.random.seed(42)

# Examples (x) and their integer topic labels (y).
x, y = convert_dictionary_to_array(tweets_keyed_by_topic_cleaned, topic_code_to_int)
total_examples = len(y)

# Boundaries of a 60% / 20% / 20% train / validation / test partition.
split_point_1 = int(total_examples * 0.6)
split_point_2 = int(total_examples * 0.8)

train_x, train_y = x[:split_point_1], y[:split_point_1]
val_x, val_y = x[split_point_1:split_point_2], y[split_point_1:split_point_2]
test_x, test_y = x[split_point_2:], y[split_point_2:]

Convert each tweet to a sequence of integers, where each integer is a compact identifier for a word. These sequences are later used for the word embedding lookup.

In [5]:
# Number of whitespace-separated words in every training tweet
tweet_lengths = np.array([len(tweet.split()) for tweet in train_x])

print('Minimum length of tweet in words: {}'.format(tweet_lengths.min()))
print('Maximum length of tweet in words: {}'.format(tweet_lengths.max()))
print('Mean length of tweet in words: {:.4f}'.format(tweet_lengths.mean()))
print('St dev of length of tweet in words: {:.4f}'.format(tweet_lengths.std()))

# Cap sequences at mean + 3 standard deviations. That bound would cover about
# 99.7% of tweets if lengths were normally distributed; the check below reports
# the share that actually exceeds it.
max_sequence_length = int(tweet_lengths.mean() + 3 * tweet_lengths.std())

# Confirm that only a small share of tweets is longer than the cap
tweets_exceeding_limit = [
    tweet
    for tweet, word_count in zip(train_x, tweet_lengths)
    if word_count > max_sequence_length
]
percentage_tweets_exceeding_limit = (len(tweets_exceeding_limit) / len(train_x)) * 100
print('Percentage of tweets exceeding max sequence length limit: {:.4f}%'.format(percentage_tweets_exceeding_limit))

Minimum length of tweet in words: 0
Maximum length of tweet in words: 159
Mean length of tweet in words: 11.0844
St dev of length of tweet in words: 8.6897
Percentage of tweets exceeding max sequence length limit: 0.4722%


In [6]:
# The converter is constructed from the training tweets only; the same
# converter then turns all three splits into integer sequences.
tweet_to_int_seq_converter = DocToIntSequenceConverter(train_x, max_sequence_length)
train_x_seq, val_x_seq, test_x_seq = (
    tweet_to_int_seq_converter.convert_to_integer_sequences(tweets)
    for tweets in (train_x, val_x, test_x)
)

## Create the LSTM and train it.

First load word embeddings.

In [7]:
# Embedding size handed to the LSTM below; the file name suggests 300-dimensional vectors.
word_embedding_dim = 300

# Load the word2vec vectors from the gzipped binary file.
word2vec_model = KeyedVectors.load_word2vec_format(
    'embeddings/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

Now initialise the LSTM and train it.

In [8]:
# Build the predictor from the word index, the embedding settings and the
# number of topic classes, then fit it using the validation split.
lstm = LstmPredictor(
    tweet_to_int_seq_converter.get_word_index(),
    word_embedding_dim,
    max_sequence_length,
    word2vec_model,
    len(topic_code_to_int),  # one output class per topic
)
lstm.train(train_x_seq, train_y, val_x_seq, val_y)

Train on 3600 samples, validate on 1200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


## Make predictions and report classification accuracy.

In [12]:
# Re-initialise the LSTM; use_saved_weights=True is passed so the weights
# from the previous training run are used instead of retraining.
lstm = LstmPredictor(tweet_to_int_seq_converter.get_word_index(),
                     word_embedding_dim,
                     max_sequence_length,
                     word2vec_model,
                     len(topic_code_to_int),
                     use_saved_weights=True)
test_y_predict = lstm.predict(test_x_seq)

# Tie every report row to its topic explicitly. Names are ordered by their
# integer code and the same codes are passed as `labels`, so the pairing no
# longer depends on dict iteration order (arbitrary before Python 3.6) or on
# every class being present in the test split.
topic_names_by_label = sorted(topic_code_to_int, key=topic_code_to_int.get)
topic_labels = [topic_code_to_int[topic_name] for topic_name in topic_names_by_label]
print(classification_report(test_y, test_y_predict,
                            labels=topic_labels,
                            target_names=topic_names_by_label,
                            digits=6))

                  precision    recall  f1-score   support

          brexit   0.756477  0.789189  0.772487       185
            gaza   0.888268  0.854839  0.871233       186
       fake_news   0.687225  0.753623  0.718894       207
hurricane_harvey   0.853659  0.845411  0.849515       207
 winter_olympics   0.680000  0.837438  0.750552       203
  climate_change   0.849315  0.584906  0.692737       212

       micro avg   0.775000  0.775000  0.775000      1200
       macro avg   0.785824  0.777568  0.775903      1200
    weighted avg   0.785186  0.775000  0.774035      1200

