# Repeval 2017 Exporations

In [1]:
from keras import backend as K
K.set_image_dim_ordering('th')

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1050 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5110)


In [2]:
from gensim.models import KeyedVectors

In [3]:
import numpy as np
import pandas as pd
from keras.models import Model, save_model, load_model
from keras.layers import Input, Embedding, Flatten, Dense
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.layers import merge
from keras.utils.np_utils import to_categorical
import h5py


First, load pre-trained word embeddings. Here, the ones from Mikolov using the word2vec toolkit

In [4]:
W2V_BINARY_PATH = '../data/GoogleNews-vectors-negative300.bin'
vectorspace = KeyedVectors.load_word2vec_format(W2V_BINARY_PATH, binary=True)

Load the data. Starting here with the SNLI Corpus until further data are available for the repeval2017 task.

In [11]:
data_path = '../data/'
snli_data_frame = pd.read_csv(data_path + 'snli_1.0/snli_1.0_train.txt', sep='\t')
snli_gold_labels = snli_data_frame.gold_label.tolist()
snli_sentences1 = snli_data_frame.sentence1.tolist()
snli_sentences2 = snli_data_frame.sentence2.tolist()

In [12]:
mnli_data_frame = pd.read_csv(data_path + 'multinli_0.9/multinli_0.9_train.txt', sep='\t', error_bad_lines=False)
mnli_gold_labels = mnli_data_frame.gold_label.tolist()
mnli_sentences1 = mnli_data_frame.sentence1.tolist()
mnli_sentences2 = mnli_data_frame.sentence2.tolist()

b'Skipping line 24810: expected 15 fields, saw 16\nSkipping line 33961: expected 15 fields, saw 16\n'
b'Skipping line 75911: expected 15 fields, saw 16\nSkipping line 100114: expected 15 fields, saw 16\n'
b'Skipping line 150638: expected 15 fields, saw 16\nSkipping line 158834: expected 15 fields, saw 16\nSkipping line 173104: expected 15 fields, saw 16\nSkipping line 178252: expected 15 fields, saw 16\n'
b'Skipping line 221951: expected 15 fields, saw 16\n'
b'Skipping line 286845: expected 15 fields, saw 16\nSkipping line 314110: expected 15 fields, saw 16\n'


Use only part of the data for testing the model:

In [13]:
snli_num_samples= len(snli_gold_labels)
snli_samples = [(str(snli_sentences1[i]), str(snli_sentences2[i]), str(snli_gold_labels[i])) for i in range(snli_num_samples)]
print("Using {} training-samples from the snli-trainset".format(snli_num_samples))
mnli_num_samples= len(mnli_gold_labels)
mnli_samples = [(str(mnli_sentences1[i]), str(mnli_sentences2[i]), str(mnli_gold_labels[i])) for i in range(mnli_num_samples)]
print("Using {} training-samples from the multinli-trainset".format(mnli_num_samples))

Using 550152 training-samples from the snli-trainset
Using 391165 training-samples from the multinli-trainset


In [14]:
mnli_test_data_frame = pd.read_csv(data_path + 'multinli_0.9/multinli_0.9_dev_matched.txt', sep='\t', error_bad_lines=False)
mnli_test_gold_labels = mnli_test_data_frame.gold_label.tolist()
mnli_test_sentences1 = mnli_test_data_frame.sentence1.tolist()
mnli_test_sentences2 = mnli_test_data_frame.sentence2.tolist()

In [15]:
snli_test_data_frame = pd.read_csv(data_path + 'snli_1.0/snli_1.0_test.txt', sep='\t')
snli_test_gold_labels = snli_test_data_frame.gold_label.tolist()
snli_test_sentences1 = snli_test_data_frame.sentence1.tolist()
snli_test_sentences2 = snli_test_data_frame.sentence2.tolist()

In [16]:
snli_test_num_samples= len(snli_test_gold_labels)
snli_test_samples = [(str(snli_test_sentences1[i]), str(snli_test_sentences2[i]), str(snli_test_gold_labels[i])) for i in range(snli_test_num_samples)]
print("Using {} testing-samples from the snli-testset".format(snli_test_num_samples))
mnli_test_num_samples= len(mnli_test_gold_labels)
mnli_test_samples = [(str(mnli_test_sentences1[i]), str(mnli_test_sentences2[i]), str(mnli_test_gold_labels[i])) for i in range(mnli_test_num_samples)]
print("Using {} testing-samples from the multinli-testset".format(mnli_test_num_samples))

Using 10000 testing-samples from the snli-testset
Using 9897 testing-samples from the multinli-testset


Use h5py to store the data. This helps to use less RAM for the training data in training the models.

## Training a model with some data

Preprocessing the data to include the word embeddings into the model is done after this keras-example:
https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py

In [17]:
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100
VALIDATION_SPLIT = 0.0 #there is some extra testing data

A method for padding without using numpy, which breaks when building data arrays with 500000 or more embedded vectors.

In [18]:
def simple_padding(sequences, maxlen=100):
    for sequence in sequences:
        while len(sequence) < maxlen:
            sequence.append(0)
        if len(sequence) > maxlen:
            del sequence[maxlen:]
    return sequences

Turn string sequences into integer sequences, pad them to equal length, divide into training and validation data, store using the hdf5 binary format.

In [19]:
str_sentences1 = [sentence for sentence, sentence2, label in snli_samples + mnli_samples]
str_sentences2 = [sentence2 for sentence, sentence2, label in snli_samples + mnli_samples]
labels = [label for sentence, sentence2, label in snli_samples + mnli_samples] 

snli_test_str_sentences1 = [sentence for sentence, sentence2, label in snli_test_samples]
snli_test_str_sentences2 = [sentence2 for sentence, sentence2, label in snli_test_samples]
snli_test_labels = [label for sentence, sentence2, label in snli_test_samples] 

mnli_test_str_sentences1 = [sentence for sentence, sentence2, label in mnli_test_samples]
mnli_test_str_sentences2 = [sentence2 for sentence, sentence2, label in mnli_test_samples]
mnli_test_labels = [label for sentence, sentence2, label in mnli_test_samples] 

numeric_labels = []
for label in labels + snli_test_labels + mnli_test_labels:
    if label == 'neutral':
        numeric_labels.append(0)
    elif label == 'contradiction':
        numeric_labels.append(1)
    elif label == 'entailment':
        numeric_labels.append(2)
    else:
        numeric_labels.append(3)
        
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(str_sentences1 + str_sentences2 + 
                       snli_test_str_sentences1 + snli_test_str_sentences2 +
                      mnli_test_str_sentences1 + mnli_test_str_sentences2)
sequences1 = tokenizer.texts_to_sequences(str_sentences1)
sequences2 = tokenizer.texts_to_sequences(str_sentences2)
snli_test_sequences1 = tokenizer.texts_to_sequences(snli_test_str_sentences1)
snli_test_sequences2 = tokenizer.texts_to_sequences(snli_test_str_sentences2)
mnli_test_sequences1 = tokenizer.texts_to_sequences(mnli_test_str_sentences1)
mnli_test_sequences2 = tokenizer.texts_to_sequences(mnli_test_str_sentences2)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 91460 unique tokens.


In [None]:
print("shape of sequences1: {}.".format(np.shape(sequences1)))
print("shape of sequences2: {}.".format(np.shape(sequences2)))
print("shape of snli_test_sequences1: {}.".format(np.shape(snli_test_sequences1)))
print("shape of snli_test_sequences2: {}.".format(np.shape(snli_test_sequences2)))
print("mhape of snli_test_sequences1: {}.".format(np.shape(mnli_test_sequences1)))
print("mhape of snli_test_sequences2: {}.".format(np.shape(mnli_test_sequences2)))

data1 = simple_padding(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
data2 = simple_padding(sequences2, maxlen=MAX_SEQUENCE_LENGTH)
snli_test_data1 = simple_padding(snli_test_sequences1, maxlen=MAX_SEQUENCE_LENGTH)
snli_test_data2 = simple_padding(snli_test_sequences2, maxlen=MAX_SEQUENCE_LENGTH)
mnli_test_data1 = simple_padding(mnli_test_sequences1, maxlen=MAX_SEQUENCE_LENGTH)
mnli_test_data2 = simple_padding(mnli_test_sequences2, maxlen=MAX_SEQUENCE_LENGTH)

print("shape of data1: {}.".format(np.shape(data1)))
print("shape of data2: {}.".format(np.shape(data2)))
print("shape of snli_test_data1: {}.".format(np.shape(snli_test_data1)))
print("shape of snli_test_data2: {}.".format(np.shape(snli_test_data2)))
print("shape of mnli_test_data1: {}.".format(np.shape(mnli_test_data1)))
print("shape of mnli_test_data2: {}.".format(np.shape(mnli_test_data2)))

training_data = np.asarray([data1, data2])
snli_testing_data = np.asarray([snli_test_data1, snli_test_data2])
mnli_testing_data = np.asarray([mnli_test_data1, mnli_test_data2])

all_labels = to_categorical(np.asarray(numeric_labels))
training_labels = all_labels[:len(labels)]
snli_testing_labels = all_labels[len(labels):len(labels) + len(snli_test_labels)]
mnli_testing_labels = all_labels[len(labels) + len(snli_test_labels):]

##########CONTINUE HERE###############

print('Shape of training_data tensor:', training_data.shape)
print('Shape of training_labels tensor:', training_labels.shape)
print('Shape of snli_testing_data tensor:', snli_testing_data.shape)
print('Shape of snli_testing_labels tensor:', snli_testing_labels.shape)
print('Shape of mnli_testing_data tensor:', mnli_testing_data.shape)
print('Shape of mnli_testing_labels tensor:', mnli_testing_labels.shape)

# not needed here, because training and testing data are already splitted
# split the data into a training set and a validation set
#indices = np.arange(data.shape[1])
#np.random.shuffle(indices)
#data = data[:,indices,:]
#labels = labels[indices,:]
#num_validation_samples = int(VALIDATION_SPLIT * data.shape[1])

#x_train = data[:,:-num_validation_samples,:]
#y_train = labels[:-num_validation_samples]
#x_val = data[:,-num_validation_samples:,:]
#y_val = labels[-num_validation_samples:]

#print("Shape of x_train: {}".format(np.shape(x_train)))
#print("Shape of y_train: {}".format(np.shape(y_train)))
#print("Shape of x_val: {}".format(np.shape(x_val)))
#print("Shape of y_val: {}".format(np.shape(y_val)))

print("Storing training and test data to hdf5...")

f = h5py.File('../data/deep_training_data.hdf5', 'a')
if not 'training_data' in f: x_train = f.create_dataset('training_data', data=training_data)
if not 'training_labels' in f: y_train = f.create_dataset('training_labels', data=training_labels)
if not 'snli_testing_data' in f: snli_x_val = f.create_dataset('snli_testing_data', data=snli_testing_data)
if not 'snli_testing_labels' in f: snli_y_val = f.create_dataset('snli_testing_labels', data=snli_testing_labels)
if not 'mnli_testing_data' in f: mnli_x_val = f.create_dataset('mnli_testing_data', data=mnli_testing_data)
if not 'mnli_testing_labels' in f: mnli_y_val = f.create_dataset('mnli_testing_labels', data=mnli_testing_labels)
    
print("Shape of x_train: {}".format(np.shape(f['training_data'])))
print("Shape of y_train: {}".format(np.shape(f['training_labels'])))
print("Shape of snli_x_val: {}".format(np.shape(f['snli_testing_data'])))
print("Shape of snli_y_val: {}".format(np.shape(f['snli_testing_labels'])))
print("Shape of mnli_x_val: {}".format(np.shape(f['mnli_testing_data'])))
print("Shape of mnli_y_val: {}".format(np.shape(f['mnli_testing_labels'])))
f.close()
print("Done.")

Compute an embedding matrix (the first layer of later models) and store it for later use.

In [15]:
print('Preparing embedding matrix.')
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM)) #TODO: is this correct? In the example, the first dimension is
# num_words, this, however, throws an error when populating the embedding matrix (because the word-indices start at 1, not at 0)
print("shape of embedding matrix: {}".format(np.shape(embedding_matrix)))

for word, i in word_index.items():
    #print("{}: {}".format(i,word))
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = None
    
    try:
        embedding_vector = vectorspace[word]
    except:
        pass
    
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False) #set trainable = True to enable training of the embeddings to the task at hand

#save as a model for later use
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sentence = embedding_layer(sentence_input)

embedding_model = Model(input=sentence_input, output=embedded_sentence)
embedding_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

print("Saving model...")
save_model(embedding_model, '../data/embedding_layer.hdf5')
print(embedding_layer.input_dim)
print(embedding_layer.output_dim)

Preparing embedding matrix.
shape of embedding matrix: (91461, 300)




Saving model...
91461
300


In [5]:
print('Loading model...')
embedding_model = load_model('../data/embedding_layer.hdf5')
print('Done.')
print(embedding_model.input_shape)
print(embedding_model.output_shape)
print(np.shape(embedding_model.get_weights()))
print(embedding_model.summary())

Loading model...
Done.
(None, 100)
(None, 100, 300)
(1, 91461, 300)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          27438300  
Total params: 27,438,300
Trainable params: 0
Non-trainable params: 27,438,300
_________________________________________________________________
None


## Model Architectures

The most simple model for encoding the sentences:

In [20]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#encode first sentence
embedded_sentence1 = embedding_model(sentence1_input)
x = Conv1D(128, 5, activation='relu')(embedded_sentence1)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
encoded_sentence1 = MaxPooling1D(35)(x)

#encode second sentence
embedded_sentence2 = embedding_layer(sentence2_input)
y = Conv1D(128, 5, activation='relu')(embedded_sentence2)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
encoded_sentence2 = MaxPooling1D(35)(y)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [encoded_sentence1, encoded_sentence2], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
model1 = Model(input=[sentence1_input, sentence2_input], output=preds)
model1.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [21]:
model1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 300)     2482200     input_11[0][0]                   
                                                                   input_12[0][0]                   
____________________________________________________________________________________________________
convolution1d_10 (Convolution1D) (None, 996, 128)      192128      embedding_1[3][0]       

Nearly the same model but using a shared architecture for embedding both sentences, so that it will profit from all examples:

In [6]:
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_model(sentence_input)
x = Conv1D(128, 3, activation='relu')(sentence_embedding)
x = MaxPooling1D(4)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
encoded_sentence = MaxPooling1D(5)(x)

sentence_embedding_model = Model(inputs=sentence_input, outputs=encoded_sentence)

sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence1_embedding = sentence_embedding_model(sentence1_input)
sentence2_embedding = sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
conv_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
conv_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

  name=name)


In [7]:
sentence_embedding_model.summary()
conv_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 100, 300)          27438300  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 128)           115328    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 24, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 22, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 7, 128)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 5, 128)            49280     
__________

Question for later: why doesn't convolution change the output size? Shouldnt it go down from to 100 to 98? -> solved, was because of bordermode = "same".

## Training the model with the snli dataset

In [None]:
print("Shape of x_train: {}".format(np.shape(f['training_data'])))
print("Shape of y_train: {}".format(np.shape(f['training_labels'])))
print("Shape of snli_x_val: {}".format(np.shape(f['snli_testing_data'])))
print("Shape of snli_y_val: {}".format(np.shape(f['snli_testing_labels'])))
print("Shape of mnli_x_val: {}".format(np.shape(f['mnli_testing_data'])))
print("Shape of mnli_y_val: {}".format(np.shape(f['mnli_testing_labels'])))

In [8]:
def training_data_generator(x_train, y_train, num_batches, batch_size):
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_train[0,i:i+batch_size,:],x_train[1,i:i+batch_size,:]], y_train[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        yield gen_output

def val_data_generator(x_test, y_test, num_batches, batch_size):
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_test[0,i:i+batch_size,:],x_test[1,i:i+batch_size,:]], y_test[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        yield gen_output

In [9]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
x_train = f['training_data']
y_train = f['training_labels']
snli_x_val = f['snli_testing_data']
snli_y_val = f['snli_testing_labels']
mnli_x_val = f['mnli_testing_data']
mnli_y_val = f['mnli_testing_labels']
print("Done.")

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of snli_x_val: {}".format(np.shape(snli_x_val)))
print("Shape of snli_y_val: {}".format(np.shape(snli_y_val)))
print("Shape of mnli_x_val: {}".format(np.shape(mnli_x_val)))
print("Shape of mnli_y_val: {}".format(np.shape(mnli_y_val)))
       
batch_size = 128
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of batches until epoch is finished
nb_val_samples = int(np.floor(np.shape(mnli_y_val)[0] / batch_size))

print("Batches pro epoche training: {}".format(samples_per_epoch))
print("Batches pro epoche validation: {}".format(nb_val_samples))

print("Training the model...")

conv_model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=10,
          steps_per_epoch = samples_per_epoch,
          validation_data=val_data_generator(snli_x_val, snli_y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples,
          verbose=2)
print("Done.")

f.close()

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Batches pro epoche training: 7354
Batches pro epoche validation: 77
Training the model...




Epoch 1/10
276s - loss: 0.8259 - acc: 0.6272 - val_loss: 1.0685 - val_acc: 0.5619
Epoch 2/10
275s - loss: 0.7438 - acc: 0.6777 - val_loss: 1.0156 - val_acc: 0.6072
Epoch 3/10
274s - loss: 0.7074 - acc: 0.6980 - val_loss: 1.0064 - val_acc: 0.6156
Epoch 4/10
274s - loss: 0.6845 - acc: 0.7097 - val_loss: 1.0167 - val_acc: 0.6402
Epoch 5/10
273s - loss: 0.6674 - acc: 0.7188 - val_loss: 1.0357 - val_acc: 0.6410
Epoch 6/10
272s - loss: 0.6538 - acc: 0.7257 - val_loss: 0.9737 - val_acc: 0.6551
Epoch 7/10
272s - loss: 0.6420 - acc: 0.7318 - val_loss: 0.9768 - val_acc: 0.6572
Epoch 8/10
275s - loss: 0.6325 - acc: 0.7367 - val_loss: 0.9752 - val_acc: 0.6526
Epoch 9/10
275s - loss: 0.6244 - acc: 0.7408 - val_loss: 0.9748 - val_acc: 0.6614
Epoch 10/10
273s - loss: 0.6170 - acc: 0.7445 - val_loss: 0.9687 - val_acc: 0.6607
Done.


In [10]:
print("saving the model...")
conv_model.save_weights('../models/conv_model_snli+mnli.hdf5')
print("done.")

saving the model...
done,


## Recurrent Models

In [52]:
from keras.layers import GRU
from keras.optimizers import Adam

sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_model(sentence_input)
encoded_sentence = GRU(32, activation='relu')(sentence_embedding)
#TODO: stack lstms here
lstm_sentence_embedding_model = Model(inputs=sentence_input, outputs=encoded_sentence)

sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence1_embedding = lstm_sentence_embedding_model(sentence1_input)
sentence2_embedding = lstm_sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
#flat = Flatten()(merged_vector)
x = Dense(64, activation='relu')(merged_vector)
preds = Dense(4,activation='softmax')(x)

#compile the model
clip_adam = Adam(clipnorm=1.)
lstm_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
lstm_model.compile(loss='mse',
             optimizer=clip_adam,
             metrics=['acc'])

  name=name)


In [53]:
lstm_sentence_embedding_model.summary()
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 100, 300)          27438300  
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                31968     
Total params: 27,470,268
Trainable params: 31,968
Non-trainable params: 27,438,300
_________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 100)           0                                            
______________________________________________________________________________________

In [57]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
x_train = f['training_data']
y_train = f['training_labels']
snli_x_val = f['snli_testing_data']
snli_y_val = f['snli_testing_labels']
mnli_x_val = f['mnli_testing_data']
mnli_y_val = f['mnli_testing_labels']
print("Done.")

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of snli_x_val: {}".format(np.shape(snli_x_val)))
print("Shape of snli_y_val: {}".format(np.shape(snli_y_val)))
print("Shape of mnli_x_val: {}".format(np.shape(mnli_x_val)))
print("Shape of mnli_y_val: {}".format(np.shape(mnli_y_val)))

batch_size = 128
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of steps until epoch is finished
nb_val_samples = int(np.floor(np.shape(snli_y_val)[0] / batch_size))

print("Training the GRU model...")

lstm_model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=10,
          samples_per_epoch = samples_per_epoch * batch_size,
          validation_data=val_data_generator(snli_x_val, snli_y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples * batch_size,
          verbose=2)
print("Done.")

f.close()

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Training the GRU model...


INFO (theano.gof.compilelock): Refreshing lock /home/fabian/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.0-64/lock_dir/lock


Epoch 1/10


KeyboardInterrupt: 

Why do I get nans as loss? -> exploding gradients problem

In [None]:
print("saving gru model...")
lstm_model.save_weights('../data/lstm_model_weights.hdf5')
print("done.")

Not too bad! Next up:
- improving the convolution architecture (num filters, maxpooling size)
- trying out lstms for encoding the sentences
- testing with the actual repeval dataset

A recursive network implementation in theano: https://github.com/ofirnachum/tree_rnn

# Error Analysis

Analyze which sentences are classified wrong and why:
- print out target, prediction, sentence 1 and sentence 2 of wrongly classified samples (save to file)

In [50]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
snli_x_val = f['snli_testing_data']
snli_y_val = f['snli_testing_labels']
mnli_x_val = f['mnli_testing_data']
mnli_y_val = f['mnli_testing_labels']
print("Done.")

print("Shape of snli samples: {}".format(np.shape(snli_x_val)))
print("Shape of snli targets: {}".format(np.shape(snli_y_val)))
print("Shape of mnli samples: {}".format(np.shape(mnli_x_val)))
print("Shape of mnli samples: {}".format(np.shape(mnli_y_val)))

classification_errors = []

index_to_word = {v: k for k, v in word_index.items()} #maps from integer word indices to the words themselves
label_to_word = {0: 'neutral', 1: 'contradiction', 2:'entailment', 3:'other'}

num_samples = np.shape(mnli_x_val)[1]
                 
for i in range(num_samples):
    sequence1 = np.reshape(mnli_x_val[0,i,:], (1,100))
    sequence2 = np.reshape(mnli_x_val[1,i,:], (1,100))
    y_pred = np.argmax(conv_model.predict([sequence1, sequence2]))
    y_true = np.argmax(mnli_y_val[i])
    #print(y_pred);print(y_true)
    if y_pred != y_true:
        #TODO: reconstruct sentences from sequences
        sentence1 = ""
        for index in sequence1[0]:
            if index != 0: sentence1 += index_to_word[index] + " "
        sentence2 = ""
        for index in sequence2[0]:
            if index != 0: sentence2 += index_to_word[index] + " "
        classification_errors.append((y_true, y_pred, sentence1, sentence2))

print("Found {} false classifications.".format(len(classification_errors)))
                 
with open('../results/classification_errors.txt', "w") as file:
    for y_true, y_false, sentence1, sentence2 in classification_errors:
        file.write('----------------------\nTARGET: {}, PRED: {}\nSENTENCE1: {}\nSENTENCE2: {}\n'.format(label_to_word[y_true], label_to_word[y_false], sentence1, sentence2))

                 
f.close()

Loading training and validation data...
Done.
Shape of snli samples: (2, 10000, 100)
Shape of snli targets: (10000, 4)
Shape of mnli samples: (2, 9897, 100)
Shape of mnli samples: (9897, 4)
Found 3869 false classifications.
