# Train Keras LSTM classifier on Sentiment140

In [1]:
from gensim.models import KeyedVectors
import json
import numpy as np
import os
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model

Using TensorFlow backend.


## Set some useful variables

In [2]:
# Directories, relative to the notebook's working directory.
DATA_DIR = '../data/'
MODEL_DIR = '../models/'

# Files joined onto DATA_DIR further down.
CLEAN_SENTIMENT140_FILE = 'sentiment140_clean.csv'
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
EMBEDDING_MATRIX_FILE = 'embeddings.bin'
TOKENIZER_FILE = 'tokenizer.json'

# File joined onto MODEL_DIR when the trained model is saved.
MODEL_FILE = 'modelCPU.h5'

In [3]:
# Embedding table shape: one row per vocabulary index, EMBEDDING_DIM columns.
# MAX_NUM_WORDS is also the tokenizer's vocabulary cap.
MAX_NUM_WORDS = 10000
EMBEDDING_DIM = 300

# Every token sequence is padded to this many positions.
MAX_SEQ_LENGTH = 140

## Load clean data set

In [4]:
# Read the pre-cleaned Sentiment140 tweets from the data directory.
clean_csv_path = os.path.join(DATA_DIR, CLEAN_SENTIMENT140_FILE)
df = pd.read_csv(clean_csv_path)

Remove tweets for which the cleaned text is empty.

In [5]:
# Keep only the rows whose cleaned text is present.
df = df.dropna(subset=['clean_text'])

In [6]:
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,target,clean_text
0,1,hey hey what about u and jose umm
1,0,sorry
2,1,glad you had a good time i think we all apprec...
3,1,getting ready to leave for my class trip today...
4,0,im in serious need of ice cream


## Tokenize and pad the sequences

Fit a Tokenizer on the cleaned tweets.

In [7]:
# The vocabulary is capped at MAX_NUM_WORDS; '</s>' is the token substituted
# for any word that falls outside it.
tokenizer = Tokenizer(
    num_words=MAX_NUM_WORDS,
    oov_token='</s>',
)
tokenizer.fit_on_texts(df['clean_text'])

Save the tokenizer.

In [8]:
# The value returned by tokenizer.to_json() is handed to json.dump as-is, so
# whatever reads this file back must json.load it before passing the result
# to tokenizer_from_json.
tokenizer_path = os.path.join(DATA_DIR, TOKENIZER_FILE)
with open(tokenizer_path, 'w') as fh:
    json.dump(tokenizer.to_json(), fh)

Convert the cleaned tweets to sequences of numbers.

In [9]:
# Map every cleaned tweet to its list of integer word indices.
tokenized_seqs = tokenizer.texts_to_sequences(df['clean_text'])

How many sequences contain the last word kept in the vocabulary (index `MAX_NUM_WORDS - 1`)?

In [10]:
# Count the sequences that contain the highest index the tokenizer can emit.
sum(1 for seq in tokenized_seqs if (MAX_NUM_WORDS - 1) in seq)

73

Pad sequences.

In [11]:
# Bring every sequence to exactly MAX_SEQ_LENGTH entries so they stack into
# one 2-D array.
padded_seqs = pad_sequences(tokenized_seqs, maxlen=MAX_SEQ_LENGTH)

In [12]:
# Expect (number of tweets, MAX_SEQ_LENGTH).
padded_seqs.shape

(1596009, 140)

## Load pre-trained word embeddings from Google News

In [13]:
# The vectors are stored in word2vec's binary format, hence binary=True.
word2vec_path = os.path.join(DATA_DIR, EMBEDDING_FILE)
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Find the embeddings corresponding to the words in our vocabulary.

In [14]:
# Row i holds the pre-trained vector of the word with tokenizer index i.
# Rows stay all-zero for indices with no pre-trained vector.
embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

for word, idx in tokenizer.word_index.items():
    # word_index lists every word seen during fitting, not only the kept
    # ones, so indices beyond the matrix have to be skipped.
    if idx >= MAX_NUM_WORDS:
        continue
    if word not in word2vec.vocab:
        continue
    embedding_matrix[idx] = word2vec.word_vec(word)

In [15]:
# Expect (MAX_NUM_WORDS, EMBEDDING_DIM).
embedding_matrix.shape

(10000, 300)

Free up the memory occupied by the embeddings.

In [16]:
# The needed vectors are already copied into embedding_matrix, so the full
# word2vec model can be released.
del word2vec

Save the embedding matrix corresponding to our vocab.

In [17]:
# NOTE: np.save appends '.npy' when the target name does not already end
# with it, so the file written is EMBEDDING_MATRIX_FILE + '.npy'.
# allow_pickle=False keeps the output a plain numeric .npy file.
np.save(os.path.join(DATA_DIR, EMBEDDING_MATRIX_FILE), embedding_matrix, allow_pickle=False)

## Create the model

In [18]:
# Drop a model left over from an earlier run of this section, if there is one.
try:
    del model
except NameError:
    # Only "name is not bound yet" is expected here (fresh kernel); any other
    # error, including a keyboard interrupt, is allowed to surface.
    print('Model not yet defined.')

Model not yet defined.


In [19]:
# Frozen pre-trained embeddings -> bidirectional LSTM -> max over time ->
# dense head ending in a single sigmoid unit.
inp = Input(shape=(MAX_SEQ_LENGTH,))

layer_stack = [
    Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(100, return_sequences=True, dropout=0.25, recurrent_dropout=0.1)),
    GlobalMaxPool1D(),
    Dense(100, activation="relu"),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),
]

# Thread the input tensor through the layers in order.
x = inp
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=inp, outputs=x)

Compile the model.

In [20]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 140)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 140, 300)          3000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 140, 200)          320800    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101 

## Split the data into train, dev and test
We will use a random sample of roughly 5% of the data for training, 1% for validation and 1% for testing; the remaining rows are not used.

In [22]:
# Reproducible random split. Each row gets one uniform draw in [0, 1) and is
# assigned to a split by the interval the draw falls in:
#   train: draw <= 0.05           (~5% of rows)
#   dev:   0.97 < draw <= 0.98    (~1% of rows)
#   test:  draw > 0.99            (~1% of rows)
# The intervals are disjoint, so no row lands in two splits; rows whose draw
# falls outside all three intervals are not used.
# A dedicated seeded generator keeps split membership identical across kernel
# restarts without touching NumPy's global random state.
SPLIT_SEED = 42
random_numbers = np.random.RandomState(SPLIT_SEED).rand(len(df))

IDX_train = (random_numbers <= 0.05)
IDX_dev = (random_numbers > 0.97) & (random_numbers <= 0.98)
IDX_test = (random_numbers > 0.99)

In [23]:
# Training rows: the boolean mask picks matching rows of inputs and labels.
X_train = padded_seqs[IDX_train]
y_train = df['target'].values[IDX_train]

In [24]:
# Validation rows: the boolean mask picks matching rows of inputs and labels.
X_dev = padded_seqs[IDX_dev]
y_dev = df['target'].values[IDX_dev]

In [25]:
# Test rows: the boolean mask picks matching rows of inputs and labels.
X_test = padded_seqs[IDX_test]
y_test = df['target'].values[IDX_test]

## Perform the training

In [26]:
# Train for two passes over the training split, scoring the dev split after
# each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    batch_size=1024,
    epochs=2,
)

Train on 80114 samples, validate on 15888 samples
Epoch 1/2
Epoch 2/2


Save the trained model.

In [28]:
# Write the trained model under the models directory.
model_path = os.path.join(MODEL_DIR, MODEL_FILE)
model.save(model_path)

## Evaluate model

In [29]:
# Score the test split; the result is [loss, accuracy], following the loss
# and metric given to compile().
model.evaluate(X_test, y_test)



[0.4776995744231377, 0.7725104093551636]