This notebook explores LSTMs for text classification, representing a document by:

* the final state of an LSTM
* the final states of a bidirectional LSTM (BiLSTM)
* averaging the outputs of each time step in a BiLSTM
* max-pooling the outputs of each time step in a BiLSTM

This notebook also focuses on appropriate masking for averaging/max-pooling in padded sequences.

In [1]:
import keras
import numpy as np
from sklearn import preprocessing
from keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D, Lambda, Layer, Multiply, GlobalMaxPooling1D, Conv1D, Concatenate, Dropout, LSTM, Bidirectional
from keras.models import Model, Sequential
from keras import backend as K
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback

Using TensorFlow backend.


In [2]:
def load_embeddings(filename, max_vocab_size):
    """Load word vectors from a word2vec-style text file.

    The first line of the file is a header ``<num_words> <dim>``; every
    following line is ``<word> <v1> ... <vdim>``, separated by single spaces.

    Row 0 of the returned matrix is an all-zero padding vector (``_0_``) and
    row 1 is an all-zero unknown-word vector (``_UNK_``). Words read from the
    file receive ids starting at 2, in file order.

    Args:
        filename: path to the embeddings file (read as UTF-8).
        max_vocab_size: upper bound on the number of rows returned, counting
            the two reserved rows.

    Returns:
        (embeddings, vocab): a float numpy array with one row per id, and a
        dict mapping each word to its row index.
    """
    vocab = {"_0_": 0, "_UNK_": 1}

    with open(filename, encoding="utf-8") as file:

        # header: "<num_words> <dim>"; only the dimensionality is needed
        header = file.readline().split(" ")
        size = int(header[1])

        # 0 = padding, 1 = UNK
        embeddings = [np.zeros(size), np.zeros(size)]

        for idx, line in enumerate(file):

            if idx + 2 >= max_vocab_size:
                break

            cols = line.rstrip().split(" ")

            vocab[cols[0]] = idx + 2
            # parse the components as floats; leaving them as strings would
            # turn the whole stacked matrix into a unicode-typed array
            embeddings.append(np.array(cols[1:], dtype=float))

    return np.array(embeddings), vocab

In [3]:
def read_data(filename, vocab):
    """Read a tab-separated ``label<TAB>text`` file into tokenized documents.

    The text column is assumed to be already tokenized, with tokens separated
    by single spaces.

    Args:
        filename: path to the TSV file (read as UTF-8).
        vocab: accepted for interface compatibility; not used here.

    Returns:
        (X, Y): X is a list of documents, each a list of token strings;
        Y is the list of label strings, aligned with X.
    """
    X = []
    Y = []
    with open(filename, encoding="utf-8") as file:
        for line in file:
            cols = line.rstrip().split("\t")
            label = cols[0]
            # assumes text is already tokenized
            tokens = cols[1].split(" ")
            # store the token list, not the raw string: downstream code
            # iterates over each document element by element, and iterating
            # a string would yield single characters instead of words
            X.append(tokens)
            Y.append(label)
    return X, Y

In [4]:
def get_word_ids(docs, vocab, max_length=200):
    """Turn documents into a fixed-width matrix of vocabulary ids.

    Each document is truncated to ``max_length`` elements; every element is
    lower-cased and looked up in ``vocab`` (id 1 when absent), and the row is
    right-padded with id 0 up to ``max_length``.

    Note that each document is iterated element by element, so documents are
    expected to be sequences of tokens; a plain string would be consumed one
    character at a time.

    Args:
        docs: iterable of documents.
        vocab: mapping from lower-cased token to integer id.
        max_length: number of columns in the returned array.

    Returns:
        numpy array with one row of ids per document.
    """
    pad_id, unk_id = 0, 1

    rows = []
    for doc in docs:
        ids = [vocab.get(tok.lower(), unk_id) for tok in doc[:max_length]]

        # pad each document to constant width
        ids.extend([pad_id] * (max_length - len(ids)))

        rows.append(ids)

    return np.array(rows)

In [7]:
# Load the pre-trained word vectors and the word -> row-id vocabulary.
# The second argument caps the number of rows kept (including the two reserved
# rows for padding and unknown words).
embeddings, vocab=load_embeddings("../data/glove.42B.300d.50K.w2v.txt", 100000)

In [8]:
# Change this to the directory with your data (from the CheckData_TODO.ipynb exercise).
# The directory should contain train.tsv, dev.tsv and test.tsv
# (the cells shown below read train.tsv and dev.tsv from it).
directory="../data/text_classification_sample_data"

In [9]:
# Read the documents and their labels for the training and development splits.
trainText, trainY=read_data("%s/train.tsv" % directory, vocab)
devText, devY=read_data("%s/dev.tsv" % directory, vocab)

In [10]:
# Map every document to a row of 200 vocabulary ids (truncated / zero-padded).
trainX, devX = (
    get_word_ids(docs, vocab, max_length=200)
    for docs in (trainText, devText)
)

In [11]:
# Encode the string labels as integer class ids; the encoder is fit on the
# training labels only and then applied to both splits.
le = preprocessing.LabelEncoder()
le.fit(trainY)
Y_train, Y_dev = (
    np.array(le.transform(labels))
    for labels in (trainY, devY)
)

In [12]:
def train(model):
    """Print a model's summary and fit it on the notebook's data splits.

    Uses the notebook-level arrays ``trainX``/``Y_train`` for training and
    ``devX``/``Y_dev`` for validation, for 30 epochs with a batch size of 32.

    Args:
        model: a compiled model exposing ``summary()`` and ``fit()``.
    """
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output
    model.summary()
    model.fit(trainX, Y_train,
              validation_data=(devX, Y_dev),
              epochs=30, batch_size=32)

First we'll train a simple LSTM and represent the document by the summary vector output by the final state.

In [13]:
def get_simple_lstm(embeddings, lstm_size=25, dropout_rate=0.2):
    """Build and compile a binary classifier on the final state of one LSTM.

    Args:
        embeddings: 2-D array of pre-trained word vectors, one row per word id.
        lstm_size: number of LSTM units.
        dropout_rate: value passed as the LSTM's ``dropout`` argument.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric) mapping a sequence of word ids to one sigmoid output.
    """
    n_words, emb_dim = embeddings.shape

    # variable-length sequence of integer word ids
    token_ids = Input(shape=(None,), dtype='int32')

    # frozen lookup table initialised from the pre-trained vectors;
    # mask_zero=True marks id 0 (padding) as masked for downstream layers
    embed = Embedding(n_words,
                      emb_dim,
                      weights=[embeddings],
                      mask_zero=True,
                      trainable=False)
    token_vectors = embed(token_ids)

    # return_sequences=False: only the last output of the LSTM is kept
    encoder = LSTM(lstm_size, return_sequences=False, activation='tanh', dropout=dropout_rate)
    doc_vector = encoder(token_vectors)

    probability = Dense(1, activation="sigmoid")(doc_vector)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [14]:
# Baseline: document = final output of a single LSTM (25 units, dropout 0.2).
train(get_simple_lstm(embeddings, lstm_size=25, dropout_rate=0.2))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 300)         15000600  
_________________________________________________________________
lstm_1 (LSTM)                (None, 25)                32600     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 26        
Total params: 15,033,226
Trainable params: 32,626
Non-trainable params: 15,000,600
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Train on 48 sampl

Next we'll represent a document by two concatenated vectors: the output of the final state of a forward LSTM and the output of the final state of the backward LSTM.

In [15]:
def get_simple_bilstm(embeddings, lstm_size=25, dropout_rate=0.2):
    """Build and compile a binary classifier on the final states of a BiLSTM.

    The final outputs of the forward and backward LSTMs are concatenated
    (``merge_mode='concat'``) and fed to a single sigmoid unit.

    Args:
        embeddings: 2-D array of pre-trained word vectors, one row per word id.
        lstm_size: number of units in each direction's LSTM.
        dropout_rate: value passed as the LSTM's ``dropout`` argument.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    n_words, emb_dim = embeddings.shape

    # variable-length sequence of integer word ids
    token_ids = Input(shape=(None,), dtype='int32')

    # frozen lookup table initialised from the pre-trained vectors;
    # mask_zero=True marks id 0 (padding) as masked for downstream layers
    embed = Embedding(n_words,
                      emb_dim,
                      weights=[embeddings],
                      mask_zero=True,
                      trainable=False)
    token_vectors = embed(token_ids)

    # return_sequences=False: each direction contributes only its last output
    encoder = Bidirectional(
        LSTM(lstm_size, return_sequences=False, activation='tanh', dropout=dropout_rate),
        merge_mode='concat')
    doc_vector = encoder(token_vectors)

    probability = Dense(1, activation="sigmoid")(doc_vector)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [16]:
# Document = concatenated final outputs of the forward and backward LSTMs.
train(get_simple_bilstm(embeddings, lstm_size=25, dropout_rate=0.2))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 300)         15000600  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50)                65200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 15,065,851
Trainable params: 65,251
Non-trainable params: 15,000,600
_________________________________________________________________
None
Train on 48 samples, validate on 6 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Ep

The final state is often a poor representation of the entire sequence, since it can lose information from the beginning of the sequence.  Let's define a few other layers that can aggregate information across the *entire* sequence, using the information that's output from the LSTM at each time step.  We need to define these as custom layers in Keras to accommodate zero-padding appropriately (see [here](https://stackoverflow.com/questions/39510809/mean-or-max-pooling-with-masking-support-in-keras/39534110#39534110) for discussion, where these layers originate).

In [17]:
class MaskedAveragePooling1D(Layer):
    """Average a (batch, time, features) tensor over time, skipping masked steps.

    Masked time steps (e.g. zero padding) contribute neither to the sum nor to
    the count used for normalisation. Without an incoming mask the layer is a
    plain mean over the time axis.
    """

    def __init__(self, **kwargs):
        super(MaskedAveragePooling1D, self).__init__(**kwargs)
        # set after the base constructor has run so it cannot reset the flag
        self.supports_masking = True

    def compute_mask(self, input, input_mask=None):
        # the time axis is pooled away, so there is no mask left to propagate
        return None

    def call(self, x, mask=None):
        # no mask: every time step counts (previously this path failed because
        # the normaliser was computed from the missing mask)
        if mask is None:
            return K.mean(x, axis=1)

        # (batch, time) -> (batch, time, 1) so it broadcasts over the features
        mask = K.expand_dims(K.cast(mask, K.floatx()), axis=-1)

        # zero out the masked elements, then sum over time
        total = K.sum(x * mask, axis=1)

        # normalise only over the number of non-masked time steps; the floor
        # avoids 0/0 (NaN) for a sequence in which every step is masked
        count = K.maximum(K.sum(mask, axis=1), K.epsilon())
        return total / count

    def compute_output_shape(self, input_shape):
        # (batch, time, features) -> (batch, features)
        return (input_shape[0], input_shape[2])

In [18]:
class MaskedMaxPooling1D(Layer):
    """Max-pool a (batch, time, features) tensor over time, skipping masked steps.

    Masked time steps (e.g. zero padding) are pushed far below the unmasked
    values by subtracting a large constant before taking the maximum. Without
    an incoming mask the layer is a plain max over the time axis.
    """

    # amount subtracted from masked positions so that they cannot be the max
    _MASK_PENALTY = 10000

    def __init__(self, **kwargs):
        super(MaskedMaxPooling1D, self).__init__(**kwargs)
        # set after the base constructor has run so it cannot reset the flag
        self.supports_masking = True

    def compute_mask(self, input, input_mask=None):
        # the time axis is pooled away, so there is no mask left to propagate
        return None

    def call(self, x, mask=None):
        if mask is not None:
            # logical negation of the mask: 1.0 at masked steps, 0.0 elsewhere
            masked = K.cast(tf.logical_not(mask), K.floatx())
            # (batch, time) -> (batch, time, 1) so it broadcasts over the features
            masked = K.expand_dims(masked, axis=-1)

            # subtract a big number from each masked input (so that it won't be the max)
            x = x - masked * self._MASK_PENALTY

        # max over the modified input along the temporal dimension
        return K.max(x, axis=1)

    def compute_output_shape(self, input_shape):
        # (batch, time, features) -> (batch, features)
        return (input_shape[0], input_shape[2])

Now let's explore the use of these layers to aggregate information over the whole sequence.

In [19]:
def get_bilstm_with_average_pooling(embeddings, lstm_size=25, dropout_rate=0.2):
    """Build and compile a binary classifier on mask-aware averaged BiLSTM outputs.

    The BiLSTM emits one vector per time step (``return_sequences=True``);
    ``MaskedAveragePooling1D`` then reduces them to a single document vector.

    Args:
        embeddings: 2-D array of pre-trained word vectors, one row per word id.
        lstm_size: number of units in each direction's LSTM.
        dropout_rate: value passed as the LSTM's ``dropout`` argument.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    n_words, emb_dim = embeddings.shape

    # variable-length sequence of integer word ids
    token_ids = Input(shape=(None,), dtype='int32')

    # frozen lookup table initialised from the pre-trained vectors;
    # mask_zero=True marks id 0 (padding) as masked for downstream layers
    embed = Embedding(n_words,
                      emb_dim,
                      weights=[embeddings],
                      mask_zero=True,
                      trainable=False)
    token_vectors = embed(token_ids)

    # one concatenated forward/backward output per time step
    encoder = Bidirectional(
        LSTM(lstm_size, return_sequences=True, activation='tanh', dropout=dropout_rate),
        merge_mode='concat')
    step_outputs = encoder(token_vectors)

    doc_vector = MaskedAveragePooling1D()(step_outputs)

    probability = Dense(1, activation="sigmoid")(doc_vector)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [20]:
# Document = masked average of the BiLSTM outputs over all time steps.
train(get_bilstm_with_average_pooling(embeddings, lstm_size=25, dropout_rate=0.2))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 300)         15000600  
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 50)          65200     
_________________________________________________________________
masked_average_pooling1d_1 ( (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51        
Total params: 15,065,851
Trainable params: 65,251
Non-trainable params: 15,000,600
_________________________________________________________________
None
Train on 48 samples, validate on 6 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


In [21]:
def get_bilstm_with_max_pooling(embeddings, lstm_size=25, dropout_rate=0.2):
    """Build and compile a binary classifier on mask-aware max-pooled BiLSTM outputs.

    The BiLSTM emits one vector per time step (``return_sequences=True``);
    ``MaskedMaxPooling1D`` then reduces them to a single document vector.

    Args:
        embeddings: 2-D array of pre-trained word vectors, one row per word id.
        lstm_size: number of units in each direction's LSTM.
        dropout_rate: value passed as the LSTM's ``dropout`` argument.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    n_words, emb_dim = embeddings.shape

    # variable-length sequence of integer word ids
    token_ids = Input(shape=(None,), dtype='int32')

    # frozen lookup table initialised from the pre-trained vectors;
    # mask_zero=True marks id 0 (padding) as masked for downstream layers
    embed = Embedding(n_words,
                      emb_dim,
                      weights=[embeddings],
                      mask_zero=True,
                      trainable=False)
    token_vectors = embed(token_ids)

    # one concatenated forward/backward output per time step
    encoder = Bidirectional(
        LSTM(lstm_size, return_sequences=True, activation='tanh', dropout=dropout_rate),
        merge_mode='concat')
    step_outputs = encoder(token_vectors)

    doc_vector = MaskedMaxPooling1D()(step_outputs)

    probability = Dense(1, activation="sigmoid")(doc_vector)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [22]:
# Document = masked max of the BiLSTM outputs over all time steps.
train(get_bilstm_with_max_pooling(embeddings, lstm_size=25, dropout_rate=0.2))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, None, 300)         15000600  
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 50)          65200     
_________________________________________________________________
masked_max_pooling1d_1 (Mask (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 15,065,851
Trainable params: 65,251
Non-trainable params: 15,000,600
_________________________________________________________________
None
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 48 samples, validate on 6 samples