# LSTM

## Load imports.

In [9]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from gensim.models import KeyedVectors
import numpy as np

from loader import load_preprocessed_data
from word_embeddings import DocToIntSequenceConverter
from lstm import LstmPredictor
from lookup_tables import int_to_topic_code, topic_code_to_topic_dict, topic_code_to_int
from sklearn.metrics import classification_report

## Load the data

In [2]:
# Load the pre-processed documents (x) and their labels (y) as numpy arrays.
x, y = load_preprocessed_data('data/rcv1_no_stopwords.csv')
x, y = np.array(x), np.array(y)

# Partition in file order (no shuffling): the first 60% is used for training,
# the next 20% for validation and the final 20% for testing.
total_examples = len(y)
split_point_1 = int(total_examples * 0.6)
split_point_2 = int(total_examples * 0.8)

# np.split cuts along the first axis at the two boundaries, giving the same
# three contiguous slices for the documents and for the labels.
train_x, val_x, test_x = np.split(x, [split_point_1, split_point_2])
train_y, val_y, test_y = np.split(y, [split_point_1, split_point_2])

print('Number of training examples: {}'.format(len(train_x)))

Number of training examples: 54366


## Pre-process the data

In [3]:
# Length of every training article, counted in whitespace-separated words
article_lengths = np.array([len(article.split()) for article in train_x])
mean_length = np.mean(article_lengths)
std_length = np.std(article_lengths)

print('Minimum length of article in words: {}'.format(np.min(article_lengths)))
print('Maximum length of article in words: {}'.format(np.max(article_lengths)))
print('Mean length of article in words: {:.4f}'.format(mean_length))
print('St dev of length of article in words: {:.4f}'.format(std_length))

# Cap sequences at the mean length plus 3 standard deviations. For normally
# distributed lengths this would cover about 99.7% of articles; article lengths
# need not be normal, so the actual share over the cap is measured below.
max_sequence_length = int(mean_length + std_length*3)

# Confirm that few articles exceed this limit. The word counts computed above
# are reused so each article is not split a second time.
articles_exceeding_limit = [article
                            for article, length in zip(train_x, article_lengths)
                            if length > max_sequence_length]
percentage_articles_exceeding_limit = (len(articles_exceeding_limit)/len(train_x))*100
print('Percentage of articles exceeding max sequence length limit: {:.4f}%'.format(percentage_articles_exceeding_limit))

Minimum length of article in words: 3
Maximum length of article in words: 2392
Mean length of article in words: 109.1119
St dev of length of article in words: 76.5720
Percentage of articles exceeding max sequence length limit: 0.5831%


In [4]:
# Convert articles to sequence of integers representing the words
article_to_int_seq_converter = DocToIntSequenceConverter(train_x, max_sequence_length)
train_x_seq = article_to_int_seq_converter.convert_to_integer_sequences(train_x)
val_x_seq = article_to_int_seq_converter.convert_to_integer_sequences(val_x)
test_x_seq = article_to_int_seq_converter.convert_to_integer_sequences(test_x)

## Create the LSTM and train it.

In [5]:
# Pre-trained Google News word2vec vectors, stored in the sibling
# tweet-classification project. The file name indicates 300-dimensional
# vectors, which is why word_embedding_dim is set to 300.
word2vec_path = '../tweet-classification/embeddings/GoogleNews-vectors-negative300.bin.gz'
word_embedding_dim = 300

# binary=True: the file is in the binary word2vec format, not the text one.
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [6]:
# One output class per topic known to the lookup table.
num_topics = len(int_to_topic_code)

# Build the predictor from the converter's word index, the embedding size,
# the sequence-length cap, the pre-trained word vectors and the class count.
lstm = LstmPredictor(
    article_to_int_seq_converter.get_word_index(),
    word_embedding_dim,
    max_sequence_length,
    word2vec_model,
    num_topics,
)

# Fit on the training split; the validation split is passed alongside it
# (how it is used is defined in ../common/lstm.py).
lstm.train(train_x_seq, train_y, val_x_seq, val_y)

Train on 54366 samples, validate on 18122 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


## Make predictions and report classification accuracy.

In [7]:
# Re-initialise the LSTM. With use_saved_weights=True it is expected to pick up
# the weights saved by the previous training run (see ../common/lstm.py).
lstm = LstmPredictor(article_to_int_seq_converter.get_word_index(),
                     word_embedding_dim,
                     max_sequence_length,
                     word2vec_model,
                     len(int_to_topic_code.values()),
                     use_saved_weights=True)
test_y_predict = lstm.predict(test_x_seq)

# classification_report pairs target_names positionally with the sorted
# labels. Derive both from int_to_topic_code so each row is guaranteed to carry
# the name of its own integer label, instead of relying on the iteration order
# of topic_code_to_topic_dict happening to match the label order.
report_labels = sorted(int_to_topic_code)
report_names = [topic_code_to_topic_dict[int_to_topic_code[label]] for label in report_labels]
print(classification_report(test_y, test_y_predict, digits=6,
                            labels=report_labels, target_names=report_names))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.974196  0.971973  0.973083      6137
  ECONOMIC PERFORMANCE   0.984709  0.966387  0.975462      1666
             ELECTIONS   0.946648  0.973734  0.960000      2132
                HEALTH   0.965553  0.945808  0.955579       978
              RELIGION   0.933333  0.922353  0.927811       425
                SPORTS   0.991315  0.992630  0.991972      6784

             micro avg   0.976824  0.976824  0.976824     18122
             macro avg   0.965959  0.962147  0.963984     18122
          weighted avg   0.976905  0.976824  0.976827     18122



## Find examples where predictions went wrong

In [10]:
# For every topic, print a few test articles whose true label is that topic
# but which the LSTM assigned to a different topic, along with the topic it
# predicted instead.
num_examples_to_show = 5

for topic_code, topic_index in topic_code_to_int.items():
    # Test rows whose true label is this topic.
    is_topic = test_y == topic_index
    topic_subset = test_y_predict[is_topic]

    # Of those rows, the ones where the prediction differs from the truth.
    is_incorrect = topic_subset != topic_index
    topic_subset_incorrect = topic_subset[is_incorrect]
    document_subset = test_x[is_topic][is_incorrect]

    # Never ask for more examples than there are errors, so a topic with
    # fewer than num_examples_to_show mistakes (or none at all) does not raise.
    num_errors = len(topic_subset_incorrect)
    sample_size = min(num_examples_to_show, num_errors)

    print('------ {} random erroneous predictions for {} ------'.format(sample_size, topic_code_to_topic_dict[topic_code]))
    print('')
    # A prefix of a random permutation is a sample without replacement, so the
    # same article is never printed twice; it also works when num_errors is 0.
    random_indices = np.random.permutation(num_errors)[:sample_size]
    # A separate name for the inner loop variable avoids clobbering the topic
    # index of the outer loop.
    for error_index in random_indices:
        print(document_subset[error_index])
        print('')
        print('Above classified as {}'.format(topic_code_to_topic_dict[int_to_topic_code[topic_subset_incorrect[error_index]]]))
        print('')
    print('')

------ 5 random erroneous predictions for CRIME, LAW ENFORCEMENT ------

Former_Ecuadoran presidential candidate Jaime_Nebot Christian_Democrat Abdala_Bucaram last interview President_Bucaram slander Rightist leader Nebot populist Bucaram last year election points Expreso newspaper suit president Bucaram official visit Peru tour Nicaragua Panama Peru Bucaram Nebot road construction project port city Guayaquil Bucaram power base mayor courts insult Nebot Bucaram lawyer Lebanese descent Nebot aloud Bucaram wealth Allegations corruption court sent Bucaram self-imposed exile mayor Guayaquil Bucaram champion Ecuador poor dispossessed home Guayaquil suburbs bristles journalists wealth lottery money work come rich Nebot Maria_Veronica_Barreiros Quito_Newsroom_+5932

Above classified as ELECTIONS

Violence armed gangs soccer supporters rise Romania commentators general moral deterioration Balkan country years fall communism major Romanian teams groups unruly supporters wide variety primitive w

------ 5 random erroneous predictions for ELECTIONS ------

governor Pakistan Punjab province provincial parliament political fate chief minister court Governor_Raja_Saroop_Khan Chief_Minister_Manzoor_Ahmad_Wattoo vote confidence 248-seat provincial assembly office official APP news agency Lahore_High_Court Wattoo office months ouster move setback Prime_Minister_Benazir_Bhutto three-judge bench court Wattoo Bhutto recommendation days majority assembly bench order Wattoo vote confidence chief minister Arif_Nakai party Bhutto federal governing coalition power Punjab Saroop_Khan retired army general Bhutto confidant order satisfied Wattoo office confidence majority members provincial assembly confidence vote separate move Punjab Speaker_Mohammad_Hanif_Ramay session legislature no-confidence motion Wattoo Nakai People Democratic Front PDF date vote motion minimum maximum days Political sources Wattoo chances confidence vote appeared slim PDF majority assembly main opposition Pakistan_Musli

------ 5 random erroneous predictions for RELIGION ------

Turkey security apparatus gun tough war attrition beleaguered Islamists Prime_Minister_Necmettin_Erbakan country top prosecutor Vural_Savas case constitutional court Erbakan Islam-based Welfare_Party grounds basic tenets secularist Turkish state prospect court battle Erbakan coalition government respite parliamentary challenge 11-month-old rule clear Welfare focus actions principle secularism constitution country civil war Savas 18-page charge sheet Investor reaction Istanbul stocks percent Legal sources months verdict court case Welfare Erbakan secularist elite army plans limited Islamist reforms NATO member Turkey first Islamist leader modern times last prime minister prosecutor charges simple accusation nothing truth reporters pro-Western generals Erbakan Iran Syria army Turkish Kurd separatist rebels Turkey state-run Anatolian news agency Syrian Iraqi Iranian troops borders Kurdish enclave northern Iraq guerrillas Turkish a