# Train Keras LSTM classifier on Sentiment140 using Google Colab GPUs

In [0]:
from gensim.models import KeyedVectors
import json
import numpy as np
import os
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.optimizers import Adam
from keras.models import Model

Using TensorFlow backend.


## Set some useful variables

In [0]:
# --- Directories (both currently point at the working directory) ---
DATA_DIR = './'
MODEL_DIR = './'

# --- Input artifacts, resolved relative to DATA_DIR ---
TOKENIZER_FILE = 'tokenizer.json'                         # JSON dump of the fitted Tokenizer
EMBEDDING_MATRIX_FILE = 'embeddings.bin.npy'              # NumPy array loaded with np.load
CLEAN_SENTIMENT140_FILE = 'sentiment140_clean.csv'        # CSV loaded with pd.read_csv
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'  # not read by any cell visible here

# --- Output artifact, resolved relative to MODEL_DIR ---
MODEL_FILE = 'modelGPU.h5'

In [0]:
# Every tokenized tweet is padded/truncated to this many tokens.
MAX_SEQ_LENGTH = 140
# Vocabulary size; used as the input dimension of the Embedding layer.
MAX_NUM_WORDS = 10_000
# Size of each word vector; used as the output dimension of the Embedding layer.
EMBEDDING_DIM = 300

## Load clean data set

In [0]:
# Read the cleaned Sentiment140 CSV from DATA_DIR into a DataFrame.
clean_csv_path = os.path.join(DATA_DIR, CLEAN_SENTIMENT140_FILE)
df = pd.read_csv(clean_csv_path)

Remove tweets for which the cleaned text is empty.

In [0]:
# Keep only the rows whose cleaned text is present (not NaN/None).
df = df.dropna(subset=['clean_text'])

In [0]:
# Preview the first rows to confirm the 'target' and 'clean_text' columns look as expected.
df.head()

Unnamed: 0,target,clean_text
0,1,hey hey what about u and jose umm
1,0,sorry
2,1,glad you had a good time i think we all apprec...
3,1,getting ready to leave for my class trip today...
4,0,im in serious need of ice cream


## Tokenize and pad the sequences

Load Tokenizer object from JSON file.

In [0]:
# Rebuild the Tokenizer from its JSON dump in DATA_DIR: json.load decodes the
# file content, and tokenizer_from_json turns that into a Tokenizer object.
tokenizer_path = os.path.join(DATA_DIR, TOKENIZER_FILE)
with open(tokenizer_path, 'r') as tokenizer_fh:
    tokenizer_config = json.load(tokenizer_fh)
tokenizer = tokenizer_from_json(tokenizer_config)

Convert the cleaned tweets to sequences of numbers.

In [0]:
# Map every cleaned tweet to its sequence of integer word indices.
clean_texts = df['clean_text']
tokenized_seqs = tokenizer.texts_to_sequences(clean_texts)

How many sequences contain the highest word index in the vocabulary (`MAX_NUM_WORDS - 1`)?

In [0]:
# Count the sequences that contain the largest index the vocabulary can hold.
last_word_index = MAX_NUM_WORDS - 1
sum(1 for seq in tokenized_seqs if last_word_index in seq)

73

Pad sequences.

In [0]:
# Bring every sequence to exactly MAX_SEQ_LENGTH entries so the result is a
# rectangular 2-D array that can be fed to the model.
padded_seqs = pad_sequences(
    sequences=tokenized_seqs,
    maxlen=MAX_SEQ_LENGTH,
)

In [0]:
# Sanity check: expect (number of tweets, MAX_SEQ_LENGTH).
padded_seqs.shape

(1596009, 140)

## Load embedding matrix

In [0]:
# Load the precomputed embedding matrix (NumPy .npy file) from DATA_DIR.
embedding_matrix_path = os.path.join(DATA_DIR, EMBEDDING_MATRIX_FILE)
embedding_matrix = np.load(embedding_matrix_path)

In [0]:
# Sanity check: expect (MAX_NUM_WORDS, EMBEDDING_DIM) so it fits the Embedding layer.
embedding_matrix.shape

(10000, 300)

## Create the model

In [0]:
# Drop any model left over from an earlier run of the cells below so that a
# fresh one is built. The only failure expected here is that `model` does not
# exist yet (NameError); anything else, e.g. a KeyboardInterrupt, must not be
# silently swallowed, so the handler is restricted to that one exception.
try:
    del model
except NameError:
    print('Model not yet defined.')

In [0]:
# Architecture: frozen precomputed embeddings -> bidirectional LSTM ->
# max-pool over the time axis -> dense hidden layer -> single sigmoid output.
inp = Input(shape=(MAX_SEQ_LENGTH,))

embedded = Embedding(
    MAX_NUM_WORDS,
    EMBEDDING_DIM,
    weights=[embedding_matrix],  # initialise from the loaded matrix
    trainable=False,             # keep the loaded vectors fixed during training
)(inp)

# return_sequences=True keeps one output per time step for the pooling layer.
recurrent = LSTM(100, return_sequences=True, dropout=0.25, recurrent_dropout=0.1)
encoded = Bidirectional(recurrent)(embedded)
pooled = GlobalMaxPool1D()(encoded)

hidden = Dense(100, activation="relu")(pooled)
hidden = Dropout(0.25)(hidden)
output = Dense(1, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=output)

Compile the model.

In [0]:
# Adam optimizer with a learning rate of 1e-3.
adam = Adam(learning_rate=1e-3)

In [0]:
# Binary classification setup: cross-entropy loss, accuracy reported as metric.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Print the layer-by-layer summary of the compiled model.
# NOTE(review): the output stored with this cell lists two Bidirectional
# layers and no pooling layer, which does not match the model definition in
# this notebook. It looks stale (from an earlier architecture); re-run to refresh.
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 140)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 140, 300)          3000000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 140, 200)          320800    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100)               100400    
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 3,431,401
Trainable params: 431,401
Non-trainable params: 3,000,000
___________________________________________

## Split the data into train, dev and test
We will use 96% of the data for training, 2% for validation and 2% for testing.

In [0]:
# Assign every row a uniform random number in [0, 1) and bucket the rows by
# threshold: <= 0.96 -> train, (0.96, 0.98] -> dev, > 0.98 -> test.
# A dedicated, seeded generator makes the split identical after every kernel
# restart, so dev/test rows never leak into training between runs.
SPLIT_SEED = 42
random_numbers = np.random.RandomState(SPLIT_SEED).rand(len(df))

IDX_train = (random_numbers <= 0.96)
IDX_dev = (random_numbers > 0.96) & (random_numbers <= 0.98)
IDX_test = (random_numbers > 0.98)

# The three masks must partition the data: each row lands in exactly one subset.
assert IDX_train.sum() + IDX_dev.sum() + IDX_test.sum() == len(df)

In [0]:
# Training subset: the rows selected by the boolean mask IDX_train.
X_train = padded_seqs[IDX_train]
y_train = df['target'].values[IDX_train]

In [0]:
# Validation (dev) subset: the rows selected by the boolean mask IDX_dev.
X_dev = padded_seqs[IDX_dev]
y_dev = df['target'].values[IDX_dev]

In [0]:
# Test subset: the rows selected by the boolean mask IDX_test.
X_test = padded_seqs[IDX_test]
y_test = df['target'].values[IDX_test]

## Perform the training

In [0]:
# Train for 10 epochs in mini-batches of 2048, evaluating on the dev split
# after every epoch; the returned object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=2048,
    epochs=10,
    validation_data=(X_dev, y_dev),
)

Train on 1532128 samples, validate on 32082 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# Score the held-out test split. The displayed list is the loss followed by
# the accuracy metric that was configured in model.compile.
model.evaluate(X_test, y_test)



[0.38169236949653856, 0.8288625478744507]

In [0]:
# Persist the trained model to MODEL_DIR/MODEL_FILE.
model_path = os.path.join(MODEL_DIR, MODEL_FILE)
model.save(model_path)