# Repeval 2017 Exporations

In [1]:
from keras import backend as K
K.set_image_dim_ordering('th')

Using Theano backend.
Using gpu device 0: GeForce GTX 1050 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5110)


In [2]:
from gensim.models import KeyedVectors

In [3]:
import numpy as np
import pandas as pd
from keras.models import Model, save_model, load_model
from keras.layers import Input, Embedding, Flatten, Dense
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.layers import merge
from keras.utils.np_utils import to_categorical
import h5py


First, load pre-trained word embeddings. Here, the ones from Mikolov using the word2vec toolkit

In [12]:
W2V_BINARY_PATH = '../data/GoogleNews-vectors-negative300.bin'
vectorspace = KeyedVectors.load_word2vec_format(W2V_BINARY_PATH, binary=True)

Load the data. Starting here with the SNLI Corpus until further data are available for the repeval2017 task.

In [6]:
data_path = '../data/snli_1.0/'
data_frame = pd.read_csv(data_path + 'snli_1.0_train.txt', sep='\t')
gold_labels = data_frame.gold_label.tolist()
sentences1 = data_frame.sentence1.tolist()
sentences2 = data_frame.sentence2.tolist()

Use only part of the data for testing the model:

In [7]:
num_samples = len(gold_labels)
samples = [(str(sentences1[i]), str(sentences2[i]), str(gold_labels[i])) for i in range(num_samples)]
print("Using {} samples from the dataset".format(num_samples))

Using 550152 samples from the dataset


Use h5py to store the data. This helps to use less RAM for the training data in training the models.

## Training a model with some data

Preprocessing the data to include the word embeddings into the model is done after this keras-example:
https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py

In [4]:
MAX_NB_WORDS = 35000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100
VALIDATION_SPLIT = 0.2

A method for padding without using numpy, which breaks when building data arrays with 500000 or more embedded vectors.

In [9]:
def simple_padding(sequences, maxlen=100):
    for sequence in sequences:
        while len(sequence) < maxlen:
            sequence.append(0)
        if len(sequence) > maxlen:
            del sequence[maxlen:]
    return sequences

Turn string sequences into integer sequences, pad them to equal length, divide into training and validation data, store using the hdf5 binary format.

In [11]:
str_sentences1 = [sentence for sentence, sentence2, label in samples]
str_sentences2 = [sentence2 for sentence, sentence2, label in samples]
labels = [label for sentence, sentence2, label in samples] 
numeric_labels = []
for label in labels:
    if label == 'neutral':
        numeric_labels.append(0)
    elif label == 'contradiction':
        numeric_labels.append(1)
    elif label == 'entailment':
        numeric_labels.append(2)
    else:
        numeric_labels.append(3)
        
tokenizer = text.Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(str_sentences1 + str_sentences2)
sequences1 = tokenizer.texts_to_sequences(str_sentences1)
sequences2 = tokenizer.texts_to_sequences(str_sentences2)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

print("shape of sequences1: {}.".format(np.shape(sequences1)))
print("shape of sequences2: {}.".format(np.shape(sequences2)))

data1 = simple_padding(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
data2 = simple_padding(sequences2, maxlen=MAX_SEQUENCE_LENGTH)

print("shape of data1: {}.".format(np.shape(data1)))
print("shape of data21: {}.".format(np.shape(data2)))

data = np.asarray([data1, data2])

labels = to_categorical(np.asarray(numeric_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[1])
np.random.shuffle(indices)
data = data[:,indices,:]
labels = labels[indices,:]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[1])

x_train = data[:,:-num_validation_samples,:]
y_train = labels[:-num_validation_samples]
x_val = data[:,-num_validation_samples:,:]
y_val = labels[-num_validation_samples:]

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of x_val: {}".format(np.shape(x_val)))
print("Shape of y_val: {}".format(np.shape(y_val)))

print("Storing training and test data to hdf5...")

f = h5py.File('../data/deep_training_data.hdf5', 'a')
if not 'x_train' in f: x_train = f.create_dataset('x_train', data=x_train)
if not 'y_train' in f: y_train = f.create_dataset('y_train', data=y_train)
if not 'x_val' in f: x_val = f.create_dataset('x_val', data=x_val)
if not 'y_val' in f: y_val = f.create_dataset('y_val', data=y_val)
    
print("Shape of x_train: {}".format(np.shape(f['x_train'])))
print("Shape of y_train: {}".format(np.shape(f['y_train'])))
print("Shape of x_val: {}".format(np.shape(f['x_val'])))
print("Shape of y_val: {}".format(np.shape(f['y_val'])))
f.close()
print("Done.")

Found 34369 unique tokens.
shape of sequences1: (550152,).
shape of sequences2: (550152,).
shape of data1: (550152, 100).
shape of data21: (550152, 100).
Shape of data tensor: (2, 550152, 100)
Shape of label tensor: (550152, 4)
Shape of x_train: (2, 440122, 100)
Shape of y_train: (440122, 4)
Shape of x_val: (2, 110030, 100)
Shape of y_val: (110030, 4)
Storing training and test data to hdf5...
Shape of x_train: (2, 440122, 100)
Shape of y_train: (440122, 4)
Shape of x_val: (2, 110030, 100)
Shape of y_val: (110030, 4)
Done.


Compute an embedding matrix (the first layer of later models) and store it for later use.

In [13]:
print('Preparing embedding matrix.')
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM)) #TODO: is this correct? In the example, the first dimension is
# num_words, this, however, throws an error when populating the embedding matrix (because the word-indices start at 1, not at 0)
print("shape of embedding matrix: {}".format(np.shape(embedding_matrix)))

for word, i in word_index.items():
    #print("{}: {}".format(i,word))
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = None
    
    try:
        embedding_vector = vectorspace[word]
    except:
        pass
    
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False) #set trainable = True to enable training of the embeddings to the task at hand

#save as a model for later use
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sentence = embedding_layer(sentence_input)

embedding_model = Model(input=sentence_input, output=embedded_sentence)
embedding_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

print("Saving model...")
save_model(embedding_model, '../data/embedding_layer.hdf5')
print(embedding_layer.input_dim)
print(embedding_layer.output_dim)

Preparing embedding matrix.
shape of embedding matrix: (34370, 300)
Saving model...
34370
300


In [6]:
print('Loading model...')
embedding_model = load_model('../data/embedding_layer.hdf5')
print('Done.')
print(embedding_model.input_shape)
print(embedding_model.output_shape)
print(np.shape(embedding_model.get_weights()))
print(embedding_model.summary())

Loading model...
Done.
(None, 100)
(None, 100, 300)
(1, 34370, 300)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 100, 300)      10311000    input_1[0][0]                    
Total params: 10,311,000
Trainable params: 0
Non-trainable params: 10,311,000
____________________________________________________________________________________________________
None


## Model Architectures

The most simple model for encoding the sentences:

In [20]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#encode first sentence
embedded_sentence1 = embedding_model(sentence1_input)
x = Conv1D(128, 5, activation='relu')(embedded_sentence1)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
encoded_sentence1 = MaxPooling1D(35)(x)

#encode second sentence
embedded_sentence2 = embedding_layer(sentence2_input)
y = Conv1D(128, 5, activation='relu')(embedded_sentence2)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
encoded_sentence2 = MaxPooling1D(35)(y)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [encoded_sentence1, encoded_sentence2], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
model1 = Model(input=[sentence1_input, sentence2_input], output=preds)
model1.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [21]:
model1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 300)     2482200     input_11[0][0]                   
                                                                   input_12[0][0]                   
____________________________________________________________________________________________________
convolution1d_10 (Convolution1D) (None, 996, 128)      192128      embedding_1[3][0]       

Nearly the same model but using a shared architecture for embedding both sentences, so that it will profit from all examples:

In [8]:
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_model(sentence_input)
x = Conv1D(128, 3, activation='relu')(sentence_embedding)
x = MaxPooling1D(4)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
encoded_sentence = MaxPooling1D(5)(x)

sentence_embedding_model = Model(input=sentence_input, output=encoded_sentence)

sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence1_embedding = sentence_embedding_model(sentence1_input)
sentence2_embedding = sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
conv_model = Model(input=[sentence1_input, sentence2_input], output=preds)
conv_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [9]:
sentence_embedding_model.summary()
conv_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 100)           0                                            
____________________________________________________________________________________________________
model_1 (Model)                  (None, 100, 300)      10311000    input_4[0][0]                    
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 98, 128)       115328      model_1[2][0]                    
____________________________________________________________________________________________________
maxpooling1d_4 (MaxPooling1D)    (None, 24, 128)       0           convolution1d_4[0][0]            
___________________________________________________________________________________________

Question for later: why doesn't convolution change the output size? Shouldnt it go down from to 100 to 98? -> solved, was because of bordermode = "same".

## Training the model with the snli dataset

In [11]:
def training_data_generator(x_train, y_train, num_batches, batch_size):
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_train[0,i:i+batch_size,:],x_train[1,i:i+batch_size,:]], y_train[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        yield gen_output

def val_data_generator(x_test, y_test, num_batches, batch_size):
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_test[0,i:i+batch_size,:],x_test[1,i:i+batch_size,:]], y_test[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        yield gen_output

In [15]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
x_train = f['x_train']
y_train = f['y_train']
x_val = f['x_val']
y_val = f['y_val']
print("Done.")

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of x_val: {}".format(np.shape(x_val)))
print("Shape of y_val: {}".format(np.shape(y_val)))
       
batch_size = 128
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of steps until epoch is finished
nb_val_samples = int(np.floor(np.shape(y_val)[0] / batch_size))

print("Training the model...")

conv_model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=10,
          samples_per_epoch = samples_per_epoch * batch_size,
          validation_data=val_data_generator(x_val, y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples * batch_size,
          verbose=2)
print("Done.")

f.close()

Loading training and validation data...
Done.
Shape of x_train: (2, 440122, 100)
Shape of y_train: (440122, 4)
Shape of x_val: (2, 110030, 100)
Shape of y_val: (110030, 4)
Training the model...
Epoch 1/10
144s - loss: 0.8143 - acc: 0.6347 - val_loss: 0.7518 - val_acc: 0.6797
Epoch 2/10
145s - loss: 0.6988 - acc: 0.7069 - val_loss: 0.6953 - val_acc: 0.7106
Epoch 3/10
144s - loss: 0.6453 - acc: 0.7345 - val_loss: 0.6708 - val_acc: 0.7247
Epoch 4/10
142s - loss: 0.6071 - acc: 0.7545 - val_loss: 0.6635 - val_acc: 0.7313
Epoch 5/10
142s - loss: 0.5786 - acc: 0.7683 - val_loss: 0.6656 - val_acc: 0.7322
Epoch 6/10
143s - loss: 0.5555 - acc: 0.7795 - val_loss: 0.6682 - val_acc: 0.7335
Epoch 7/10
142s - loss: 0.5356 - acc: 0.7883 - val_loss: 0.6779 - val_acc: 0.7319
Epoch 8/10
143s - loss: 0.5189 - acc: 0.7954 - val_loss: 0.6967 - val_acc: 0.7319
Epoch 9/10
142s - loss: 0.5037 - acc: 0.8032 - val_loss: 0.7092 - val_acc: 0.7293
Epoch 10/10
141s - loss: 0.4901 - acc: 0.8094 - val_loss: 0.7231 - v

In [16]:
conv_model.save_weights('../data/conv_model_weights.hdf5')

## Recurrent Models

In [19]:
from keras.layers import GRU
from keras.optimizers import Adam

sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_model(sentence_input)
encoded_sentence = GRU(128, activation='relu')(sentence_embedding)
#TODO: stack lstms here
lstm_sentence_embedding_model = Model(input=sentence_input, output=encoded_sentence)

sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence1_embedding = lstm_sentence_embedding_model(sentence1_input)
sentence2_embedding = lstm_sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
#flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(merged_vector)
preds = Dense(4,activation='softmax')(x)

#compile the model
clip_adam = Adam(clipnorm=1.)
lstm_model = Model(input=[sentence1_input, sentence2_input], output=preds)
lstm_model.compile(loss='mse',
             optimizer=clip_adam,
             metrics=['acc'])

In [15]:
lstm_sentence_embedding_model.summary()
lstm_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 100)           0                                            
____________________________________________________________________________________________________
model_1 (Model)                  (None, 100, 300)      10311000    input_8[0][0]                    
____________________________________________________________________________________________________
gru_1 (GRU)                      (None, 128)           164736      model_1[3][0]                    
Total params: 10,475,736
Trainable params: 164,736
Non-trainable params: 10,311,000
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Layer (

In [16]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
x_train = f['x_train']
y_train = f['y_train']
x_val = f['x_val']
y_val = f['y_val']
print("Done.")

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of x_val: {}".format(np.shape(x_val)))
print("Shape of y_val: {}".format(np.shape(y_val)))

batch_size = 128
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of steps until epoch is finished
nb_val_samples = int(np.floor(np.shape(y_val)[0] / batch_size))

print("Training the model...")

lstm_model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=10,
          samples_per_epoch = samples_per_epoch * batch_size,
          validation_data=val_data_generator(x_val, y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples * batch_size,
          verbose=2)
print("Done.")

f.close()

Loading training and validation data...
Done.
Shape of x_train: (2, 440122, 100)
Shape of y_train: (440122, 4)
Shape of x_val: (2, 110030, 100)
Shape of y_val: (110030, 4)
Training the model...
Epoch 1/10
395s - loss: 0.1671 - acc: 0.3339 - val_loss: 0.1672 - val_acc: 0.3342
Epoch 2/10
393s - loss: 0.1669 - acc: 0.3344 - val_loss: 0.1670 - val_acc: 0.3342
Epoch 3/10
389s - loss: 0.1669 - acc: 0.3328 - val_loss: 0.1669 - val_acc: 0.3342
Epoch 4/10
389s - loss: 0.1669 - acc: 0.3329 - val_loss: 0.1669 - val_acc: 0.3342
Epoch 5/10
389s - loss: 0.1669 - acc: 0.3328 - val_loss: 0.1669 - val_acc: 0.3342
Epoch 6/10
388s - loss: 0.1669 - acc: 0.3331 - val_loss: 0.1669 - val_acc: 0.3342
Epoch 7/10
389s - loss: 0.1669 - acc: 0.3330 - val_loss: 0.1669 - val_acc: 0.3342
Epoch 8/10
388s - loss: 0.1669 - acc: 0.3329 - val_loss: 0.1669 - val_acc: 0.3330
Epoch 9/10
388s - loss: 0.1669 - acc: 0.3331 - val_loss: 0.1669 - val_acc: 0.3330
Epoch 10/10
389s - loss: 0.1669 - acc: 0.3331 - val_loss: 0.1669 - v

Why do I get nans as loss? -> exploding gradients problem

In [None]:
lstm_model.save_weights('../data/lstm_model_weights.hdf5')

Not too bad! Next up:
- improving the convolution architecture (num filters, maxpooling size)
- trying out lstms for encoding the sentences
- testing with the actual repeval dataset

A recursive network implementation in theano: https://github.com/ofirnachum/tree_rnn