# LSTM

## Load imports.

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from gensim.models import KeyedVectors
from keras.utils import to_categorical
import numpy as np

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize, lowercase_all_capital_words
from function_executor import apply_fn_to_list_items_in_dict
from conversion import convert_dictionary_to_array
from word_embeddings import DocToIntSequenceConverter
from lstm import LstmPredictor

Using TensorFlow backend.


## Useful lookup tables.

In [2]:
# Topic name -> integer class code. The position of a topic in the tuple below
# is its code, so the order must not be changed.
topic_code_to_int = {
    topic: code
    for code, topic in enumerate((
        'brexit',
        'gaza',
        'fake_news',
        'hurricane_harvey',
        'winter_olympics',
        'climate_change',
    ))
}

## Load the data.

In [3]:
# Load the raw tweets and the hashtag lists for each topic.
# NOTE(review): 100000 is presumably the number of tweets loaded per topic -- confirm against load_data.
tweets_keyed_by_topic = load_data(100000, 'data/')
topic_hashtags_dict = create_topic_hashtags_dict('hashtag_list/')

# Clean-up pipeline; every stage consumes the previous stage's output:
#   1. cleanup_tweets, given the topic hashtags
#   2. remove_stop_words_and_lemmatize, with lowercasing and lemmatisation both switched off
#   3. lowercase_all_capital_words
tweets_keyed_by_topic_cleaned = cleanup_tweets(tweets_keyed_by_topic, topic_hashtags_dict)
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    remove_stop_words_and_lemmatize,
    lowercase=False,
    lemmatize=False,
)
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    lowercase_all_capital_words,
)

In [4]:
# Fix NumPy's global seed so the run is repeatable.
# NOTE(review): presumably convert_dictionary_to_array shuffles the examples with
# NumPy's global RNG -- confirm, otherwise the slices below are not a random split.
np.random.seed(42)

# Split data into 60% train, 20% validation, 20% test
x, y = convert_dictionary_to_array(tweets_keyed_by_topic_cleaned, topic_code_to_int)
total_examples = len(y)
split_point_1, split_point_2 = int(total_examples * 0.6), int(total_examples * 0.8)

# Contiguous slices: [0, 60%) train, [60%, 80%) validation, [80%, 100%) test.
train_x, train_y = x[:split_point_1], y[:split_point_1]
val_x, val_y = x[split_point_1:split_point_2], y[split_point_1:split_point_2]
test_x, test_y = x[split_point_2:], y[split_point_2:]

Convert each tweet to a sequence of integers, where each integer is a compact identifier for one word. These identifiers are used later to look up the word embeddings.

In [5]:
# Length of every training tweet in words (whitespace tokenisation).
tweet_lengths = np.array([len(tweet.split()) for tweet in train_x])

print('Minimum length of tweet in words: {}'.format(np.min(tweet_lengths)))
print('Maximum length of tweet in words: {}'.format(np.max(tweet_lengths)))
print('Mean length of tweet in words: {:.4f}'.format(np.mean(tweet_lengths)))
print('St dev of length of tweet in words: {:.4f}'.format(np.std(tweet_lengths)))

# Cap sequences at mean + 3 standard deviations. Tweet lengths need not be
# normally distributed, so this is a heuristic cut-off rather than an exact
# 99.7% bound; the share of tweets actually above the cap is measured below.
max_sequence_length = int(np.mean(tweet_lengths) + np.std(tweet_lengths)*3)

# Confirm not many tweets exceed this limit. The lengths computed above are
# reused so that each tweet is only tokenised once.
tweets_exceeding_limit = [tweet
                          for tweet, length in zip(train_x, tweet_lengths)
                          if length > max_sequence_length]
percentage_tweets_exceeding_limit = (len(tweets_exceeding_limit)/len(train_x))*100
print('Percentage of tweets exceeding max sequence length limit: {:.4f}%'.format(percentage_tweets_exceeding_limit))

Minimum length of tweet in words: 0
Maximum length of tweet in words: 183
Mean length of tweet in words: 11.0393
St dev of length of tweet in words: 7.6001
Percentage of tweets exceeding max sequence length limit: 0.6367%


In [6]:
# Convert tweets to sequence of integers representing the words
tweet_to_int_seq_converter = DocToIntSequenceConverter(train_x, max_sequence_length)
train_x_seq = tweet_to_int_seq_converter.convert_to_integer_sequences(train_x)
val_x_seq = tweet_to_int_seq_converter.convert_to_integer_sequences(val_x)
test_x_seq = tweet_to_int_seq_converter.convert_to_integer_sequences(test_x)

# Convert labels into one hot encoding for use with a neural network
train_y_cat = to_categorical(train_y)
val_y_cat = to_categorical(val_y)
test_y_cat = to_categorical(test_y)

## Create the LSTM and train it.

First load word embeddings.

In [7]:
# Pre-trained word2vec vectors in binary format; the "300" in the file name
# corresponds to the embedding dimension set here.
word_embedding_dim = 300
word2vec_model = KeyedVectors.load_word2vec_format(
    'embeddings/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

Now initialise the LSTM and train it.

In [8]:
# Build the predictor from the converter's word index, the embedding settings
# and the number of topic classes, then train it with the validation split
# passed alongside the training split.
lstm = LstmPredictor(
    tweet_to_int_seq_converter.get_word_index(),
    word_embedding_dim,
    max_sequence_length,
    word2vec_model,
    len(topic_code_to_int),
)
lstm.train(train_x_seq, train_y_cat, val_x_seq, val_y_cat)

Train on 360000 samples, validate on 120000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20

KeyboardInterrupt: 

In [9]:
# Run the network on the held-out test sequences. The result is reduced with
# argmax over axis 1 in the next cell to obtain one predicted class per tweet.
test_predictions = lstm.model.predict(test_x_seq)

In [10]:
from sklearn.metrics import classification_report

# Predicted class = index of the largest value in each prediction row.
test_y_predict = np.argmax(test_predictions, axis=1)

# Derive the report rows from the lookup table, ordered by integer code, so each
# name is guaranteed to sit next to the scores of its own class. Relying on the
# dict's key order only works while the literal happens to be written in code
# order, and a dict_keys view is not the list that target_names expects.
topics_by_code = sorted(topic_code_to_int.items(), key=lambda item: item[1])
report_labels = [code for _, code in topics_by_code]
report_target_names = [topic for topic, _ in topics_by_code]

print(classification_report(test_y,
                            test_y_predict,
                            labels=report_labels,
                            target_names=report_target_names,
                            digits=6))

                  precision    recall  f1-score   support

          brexit   0.854121  0.893144  0.873197     20027
            gaza   0.929034  0.903617  0.916149     19848
       fake_news   0.739165  0.785512  0.761634     20127
hurricane_harvey   0.871741  0.865108  0.868412     19979
 winter_olympics   0.865557  0.840545  0.852867     20037
  climate_change   0.822789  0.785607  0.803768     19982

       micro avg   0.845467  0.845467  0.845467    120000
       macro avg   0.847068  0.845589  0.846005    120000
    weighted avg   0.846856  0.845467  0.845837    120000



In [21]:
# Collect (tweet text, predicted class, actual class) for every test tweet the
# model got wrong. The three sequences are walked in step instead of by index,
# and the scalar labels are compared directly.
incorrect = [
    (tweet, predicted, actual)
    for tweet, predicted, actual in zip(test_x, test_y_predict, test_y)
    if predicted != actual
]

# One line per misclassified tweet.
for tweet, predicted, actual in incorrect:
    print('{} - predicted {} - actual {}'.format(tweet, predicted, actual))

one job Steve News Shouldnt taken😂😂 - predicted 2 - actual 3
Wait minute polls Maga 🚂💨❤🇺🇸☀ - predicted 0 - actual 2
important looking fraudulent claims way proof damage - predicted 2 - actual 3
Getting ready - predicted 3 - actual 4
think time switch hepex back Snow hashtag - predicted 2 - actual 4
latest Future Media Change Daily - predicted 4 - actual 2
Dr Terry Fry Sometimes Firstin Human break heart thank work - predicted 2 - actual 3
Donald j. Trump much winning still waiting - predicted 2 - actual 5
Dr. Osman Benchikh Discusses May Affect - predicted 0 - actual 5
Science Friday melts Climate Resist MarchForScience - predicted 5 - actual 2
Peter Daou Expect see lots |s - predicted 5 - actual 3
Trump care much touching blue state - predicted 5 - actual 3
Sky News Australia Malcolm Turnbull peace process referring one border ridiculous - predicted 5 - actual 1
Sometimes complain arranging bed snoring husband brother cats big lady dog 💞 - predicted 3 - actual 2
well done 👏 geospiza -

Yup kind afraid - predicted 2 - actual 3
Per piece economy stupid anyone proclaiming pro-business tending climaterisk Thanks Beneficial State Fdn - predicted 0 - actual 5
2017 part Global Climate Emergency need take action address - predicted 5 - actual 3
Disasters opportunity witness many people twitter utterly illiterate - predicted 5 - actual 3
hype Tournament Mode beta coming - predicted 3 - actual 1
Nat Muller nang although remiss point film mostly skips one western arts residency another disjunction now/then/West/ granted part point yet reframing times art-world specific therefore solipsistic ways - predicted 5 - actual 1
Another praying mantis. real 🤦 - predicted 3 - actual 5
latest DCWonderwoman Daily Digest - predicted 4 - actual 3
🙄🙄🙄 praying 😂 - predicted 3 - actual 2
beautiful 2017 - predicted 4 - actual 3
nurse front lines - predicted 2 - actual 3
Oil gas rigs help protect sea creatures threatened habitat loss study led GeociencesEd suggests J Murray Roberts - predicted 3 

right - predicted 4 - actual 2
Beyond sick beyond disturbing - predicted 1 - actual 5
latest Selfish Meme Daily - predicted 4 - actual 1
Luke Springthorpe like cancer Luke Everything nothing cause add list - predicted 5 - actual 0
david maraniss demonstrating inability report objectively unintentionally awakening audiences - predicted 1 - actual 2
Jon Richard Tunnicliffe love irony hypocrisy - predicted 1 - actual 0
Kill Euro via Live Trading News - predicted 1 - actual 0
latest Nattha Komolvadhin Daily Thanks Al-Monitor yo•ware Chu_SpringNews sgnews Nattha ThaiPBS via Convey - predicted 5 - actual 4
Tragic - predicted 1 - actual 5
need stronglu - predicted 0 - actual 5
close Sen. Mark Warner r-va Trump Russia framing - predicted 2 - actual 1
Looking snaps well glad n't go heb left dry lol 🤦🏻‍♀️ much people.. why.. - predicted 5 - actual 3
Currently - predicted 1 - actual 3
planet ai n't fucked enough - predicted 5 - actual 0
Bild police investigation found conclusively mob alleged sex

many books little time Accordingly venture opinion entitled one - predicted 0 - actual 1
Much World finally Waking reality consequences except Trump administration - predicted 2 - actual 5
❤️ - predicted 2 - actual 4
Paul Ryan right 😂😂😂 - predicted 2 - actual 5
Ouch - predicted 3 - actual 1
pope Focused - predicted 2 - actual 5
Comfortably Smug alert says followed start - predicted 4 - actual 2
One stupid fucking thread string responses bloated over-generalizations false equivalencies ever read site - predicted 2 - actual 5
Politicians end believing called journalists sake political mileage worth people trust - predicted 0 - actual 2
think know answer - predicted 2 - actual 0
God sing real thing c_of_e - predicted 0 - actual 2
Norman Finlklestein un abstention Obama changed policy Israeli settlements making war crime - predicted 1 - actual 2
Watching rn - predicted 3 - actual 4
1 4 children need psychosocial support cope trauma wars - predicted 0 - actual 1
lmfaoooooooo - predicted 2 -

cnn Oddly n't accuse Trump racism - predicted 3 - actual 2
Justin Trudeau Liberal Party Broken love til die promise electoral reform Carbon tax fraud bought 2nd hand pipeline Zios treat Canada like rag doll Gaza ISIS terrorists Canada easy false flags coming nato enemies Trump views pussies 💁🍺 - predicted 1 - actual 5
UK health paradox continues might mean Incisive Health blog - predicted 2 - actual 0
poor choice times Norway embrace ethnic heritage /feel guilt like saying wearing cross poor choice times Westboro Baptist Wesboro church uses crosses logic really silly Fake News Katie Dangerfield - predicted 1 - actual 4
Retweeted Reuters Top News Reuters Top News photographer place death - predicted 4 - actual 1
another attempt approach Shucks got found - predicted 4 - actual 2
Trump win first gold medal incompetence - predicted 2 - actual 4
Sorry real downhill race winter features stock market vs. Trump approval ratings make zero first - predicted 2 - actual 4
Stay Safe via Annbone Mal

Shocking - predicted 1 - actual 2
👇👇👇👇 - predicted 2 - actual 5
Truth sometimes tough look like lawsuits rev long term medical research Exactly - predicted 2 - actual 5
Enough said Thank 2017 - predicted 5 - actual 3
served two years AmeriCorps responding sacrifice zones created foreign wars austerity count - predicted 2 - actual 5
dare steal evil Death Star plans must punish rebellion - predicted 0 - actual 2
Miko Peled Hello Sir Blessings People Please see share - predicted 3 - actual 1
greater threat UK EU directives terrorism foreign power invading - predicted 0 - actual 5
two female North Korea agents reportedly assassinated Kim Jong Nam sexy - predicted 4 - actual 2
Really - predicted 2 - actual 5
Markets Live asx flattens wow wes drag .. - predicted 3 - actual 5
Likey - predicted 2 - actual 4
Maddie swallowing part second job seems - predicted 4 - actual 2
Hurricane 2017 globalwarming harden climate Weather - predicted 5 - actual 3
Yess 😍 - predicted 2 - actual 4
surfing interne

Eric definitely real - predicted 5 - actual 2
Must act swiftly N force Call email write letters show protests town hall meetings - predicted 1 - actual 2
crazy much Brooks Smith looks sounds like dad Art Rascon - predicted 2 - actual 3
crazy morning news guy across street staying month ago - predicted 2 - actual 3
coverage - predicted 3 - actual 4
Parents watched Peg Cat kids waiting Love educational tv Thanks pbs - predicted 3 - actual 4
teacher ready Looking forward meeting students mpva learning lots year - predicted 4 - actual 3
smartest black woman ever heard even put race smartest woman heard awhile - predicted 5 - actual 2
Trump willing destabilize entire region keep promise one casino owner - predicted 2 - actual 1
Hey..look Washington Post latest Scott Isakson Murkowski fence per article - predicted 5 - actual 2
bottom 4.5 hole dug barrel - predicted 3 - actual 2
latest gmd Designs Geek News - predicted 4 - actual 3
fed hearing career politicians abdicated responsibility peopl

## Make predictions and report classification accuracy.

In [None]:
# Re-initialise the LSTM, will use weights from the previous training run.
# Same constructor arguments as the training cell, plus use_saved_weights=True.
lstm = LstmPredictor(
    tweet_to_int_seq_converter.get_word_index(),
    word_embedding_dim,
    max_sequence_length,
    word2vec_model,
    len(topic_code_to_int),
    use_saved_weights=True,
)