# LSTM

## Load imports.

In [8]:
# Make common scripts visible
import sys
sys.path.append('../common/')

import numpy as np
from keras.utils import to_categorical

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize, lowercase_all_capital_words
from function_executor import apply_fn_to_list_items_in_dict
from conversion import convert_dictionary_to_array
from word_embeddings import DocToIntSequenceConverter

## Useful lookup tables.

In [9]:
# Map each topic code to its integer class label.
# The label is the topic's position in the ordered tuple below.
topic_code_to_int = {
    topic_code: label
    for label, topic_code in enumerate((
        'brexit',
        'gaza',
        'fake_news',
        'hurricane_harvey',
        'winter_olympics',
        'climate_change',
    ))
}

## Load the data.

In [3]:
# Load the raw tweets, grouped by topic, from the 'data/' directory.
# NOTE(review): 10000 is presumably a cap on how many tweets are loaded — confirm against load_data.
tweets_keyed_by_topic = load_data(10000, 'data/')

# Build the per-topic hashtag lookup from the 'hashtag_list/' directory.
topic_hashtags_dict = create_topic_hashtags_dict('hashtag_list/')

# Cleaning pipeline; each stage rebinds tweets_keyed_by_topic_cleaned to its result.
# Stage 1: clean the raw tweets using the topic hashtag lookup.
tweets_keyed_by_topic_cleaned = cleanup_tweets(tweets_keyed_by_topic, topic_hashtags_dict)

# Stage 2: apply remove_stop_words_and_lemmatize to every tweet,
# with the lowercase and lemmatize options switched off.
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    remove_stop_words_and_lemmatize,
    lowercase=False,
    lemmatize=False,
)

# Stage 3: apply lowercase_all_capital_words to every tweet.
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    lowercase_all_capital_words,
)

In [4]:
# Seed NumPy's global RNG before the arrays are built so the run is repeatable.
# NOTE(review): presumably convert_dictionary_to_array shuffles using this RNG — confirm.
np.random.seed(42)

# Turn the per-topic dictionary into x (tweets) and y (integer topic labels).
x, y = convert_dictionary_to_array(tweets_keyed_by_topic_cleaned, topic_code_to_int)
total_examples = len(y)

# Boundaries for a 60% train / 20% validation / 20% test split.
split_point_1 = int(total_examples * 0.6)
split_point_2 = int(total_examples * 0.8)

# Slice features and labels with the same boundaries so they stay aligned.
train_x, train_y = x[:split_point_1], y[:split_point_1]
val_x, val_y = x[split_point_1:split_point_2], y[split_point_1:split_point_2]
test_x, test_y = x[split_point_2:], y[split_point_2:]

Convert each tweet to a sequence of integers, where each integer is a succinct representation of one word. These sequences will later be used for word embedding lookup.

In [5]:
# Number of whitespace-separated words in each training tweet
tweet_lengths = np.array([len(text.split()) for text in train_x])

print('Minimum length of tweet in words: {}'.format(tweet_lengths.min()))
print('Maximum length of tweet in words: {}'.format(tweet_lengths.max()))
print('Mean length of tweet in words: {:.4f}'.format(tweet_lengths.mean()))
print('St dev of length of tweet in words: {:.4f}'.format(tweet_lengths.std()))

# Cap sequences at the mean plus 3 standard deviations, truncated to an integer.
# (This would cover ~99.7% of tweets only if lengths were normally distributed;
# the check below measures the actual share that exceeds the cap.)
max_sequence_length = int(tweet_lengths.mean() + 3 * tweet_lengths.std())

# Confirm not many tweets exceed this limit, reusing the word counts computed above
tweets_exceeding_limit = [
    text
    for text, word_count in zip(train_x, tweet_lengths)
    if word_count > max_sequence_length
]
percentage_tweets_exceeding_limit = len(tweets_exceeding_limit) / len(train_x) * 100
print('Percentage of tweets exceeding max sequence length limit: {:.4f}%'.format(percentage_tweets_exceeding_limit))

Minimum length of tweet in words: 0
Maximum length of tweet in words: 155
Mean length of tweet in words: 11.2929
St dev of length of tweet in words: 8.4298
Percentage of tweets exceeding max sequence length limit: 0.5833%


In [15]:
# Convert tweets to sequences of integers representing the words.
# The converter is constructed from the training tweets only and then applied
# unchanged to the validation and test tweets.
tweet_to_int_seq_converter = DocToIntSequenceConverter(train_x, max_sequence_length)
train_x_seq = tweet_to_int_seq_converter.convert_to_integer_sequences(train_x)
val_x_seq = tweet_to_int_seq_converter.convert_to_integer_sequences(val_x)
test_x_seq = tweet_to_int_seq_converter.convert_to_integer_sequences(test_x)

# Convert labels into one hot encoding for use with a neural network.
# The number of classes is passed explicitly: when omitted, to_categorical infers
# it from the largest label in the array it is given, so a split that happens to
# lack the highest topic id would be encoded with fewer columns than the others.
num_classes = max(topic_code_to_int.values()) + 1
train_y_cat = to_categorical(train_y, num_classes=num_classes)
val_y_cat = to_categorical(val_y, num_classes=num_classes)
test_y_cat = to_categorical(test_y, num_classes=num_classes)