# Repeval 2017 Exporations

In [1]:
LOCAL_MODEL_PATH = "/home/fabian/work/ag_sc/repeval2017/models/"
HARD_DRIVE_MODEL_PATH = "/media/fabian/MACWIN/deep_models/"

RESULT-LIST:

max + avg pooling makes no real difference in Conv+Lstm
- zwei LSTMS
- Conv, LSTM, Attention (rest like below)
        - SNLI acc 0.73
        - MNLI acc 0.64
- Conv (128, kernelsize 5) + LSTM 64 + 521 Dense -> 64 Dense, 10 epochs, Adam -> 
        - SNLI acc 0.71 
        - MNLI acc 0.62
- Conv_pool, 3 modules, 10 epochs, ADAM
        - SNLI acc 0.68
        - MNLI acc 0.61

In [2]:
from keras import backend as K
K.set_image_dim_ordering('th')


Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1050 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5110)


In [3]:
from gensim.models import KeyedVectors

In [4]:
import numpy as np
import pandas as pd
from keras.models import Model, save_model, load_model
from keras.layers import Input, Embedding, Flatten, Dense
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.layers import merge
from keras.utils.np_utils import to_categorical
from keras.layers import GRU, LSTM, Bidirectional, Dropout, TimeDistributed
from keras.layers.convolutional import Conv1D, MaxPooling1D, AveragePooling1D
from keras.optimizers import Adam
import h5py
import pickle

First, load pre-trained word embeddings. Here, the ones from Mikolov using the word2vec toolkit

In [5]:
W2V_BINARY_PATH = '../data/GoogleNews-vectors-negative300.bin'
vectorspace = KeyedVectors.load_word2vec_format(W2V_BINARY_PATH, binary=True)

KeyboardInterrupt: 

Load the data. Starting here with the SNLI Corpus until further data are available for the repeval2017 task.

In [None]:
data_path = '../data/'
snli_data_frame = pd.read_csv(data_path + 'snli_1.0/snli_1.0_train.txt', sep='\t')
snli_gold_labels = snli_data_frame.gold_label.tolist()
snli_sentences1 = snli_data_frame.sentence1.tolist()
snli_sentences2 = snli_data_frame.sentence2.tolist()

In [None]:
mnli_data_frame = pd.read_csv(data_path + 'multinli_0.9/multinli_0.9_train.txt', sep='\t', error_bad_lines=False)
mnli_gold_labels = mnli_data_frame.gold_label.tolist()
mnli_sentences1 = mnli_data_frame.sentence1.tolist()
mnli_sentences2 = mnli_data_frame.sentence2.tolist()

Use only part of the data for testing the model:

In [None]:
snli_num_samples= len(snli_gold_labels)
snli_samples = [(str(snli_sentences1[i]), str(snli_sentences2[i]), str(snli_gold_labels[i])) for i in range(snli_num_samples)]
print("Using {} training-samples from the snli-trainset".format(snli_num_samples))
mnli_num_samples= len(mnli_gold_labels)
mnli_samples = [(str(mnli_sentences1[i]), str(mnli_sentences2[i]), str(mnli_gold_labels[i])) for i in range(mnli_num_samples)]
print("Using {} training-samples from the multinli-trainset".format(mnli_num_samples))

In [None]:
mnli_test_data_frame = pd.read_csv(data_path + 'multinli_0.9/multinli_0.9_dev_matched.txt', sep='\t', error_bad_lines=False)
mnli_test_gold_labels = mnli_test_data_frame.gold_label.tolist()
mnli_test_sentences1 = mnli_test_data_frame.sentence1.tolist()
mnli_test_sentences2 = mnli_test_data_frame.sentence2.tolist()

In [None]:
snli_test_data_frame = pd.read_csv(data_path + 'snli_1.0/snli_1.0_test.txt', sep='\t')
snli_test_gold_labels = snli_test_data_frame.gold_label.tolist()
snli_test_sentences1 = snli_test_data_frame.sentence1.tolist()
snli_test_sentences2 = snli_test_data_frame.sentence2.tolist()

In [None]:
snli_test_num_samples= len(snli_test_gold_labels)
snli_test_samples = [(str(snli_test_sentences1[i]), str(snli_test_sentences2[i]), str(snli_test_gold_labels[i])) for i in range(snli_test_num_samples)]
print("Using {} testing-samples from the snli-testset".format(snli_test_num_samples))
mnli_test_num_samples= len(mnli_test_gold_labels)
mnli_test_samples = [(str(mnli_test_sentences1[i]), str(mnli_test_sentences2[i]), str(mnli_test_gold_labels[i])) for i in range(mnli_test_num_samples)]
print("Using {} testing-samples from the multinli-testset".format(mnli_test_num_samples))

Use h5py to store the data. This helps to use less RAM for the training data in training the models.

## Training a model with some data

Preprocessing the data to include the word embeddings into the model is done after this keras-example:
https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py

In [5]:
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100
VALIDATION_SPLIT = 0.0 #there is some extra testing data

A method for padding without using numpy, which breaks when building data arrays with 500000 or more embedded vectors.

In [6]:
def simple_padding(sequences, maxlen=100):
    for sequence in sequences:
        while len(sequence) < maxlen:
            sequence.append(0)
        if len(sequence) > maxlen:
            del sequence[maxlen:]
    return sequences

Turn string sequences into integer sequences, pad them to equal length, divide into training and validation data, store using the hdf5 binary format.

In [None]:
str_sentences1 = [sentence for sentence, sentence2, label in snli_samples + mnli_samples]
str_sentences2 = [sentence2 for sentence, sentence2, label in snli_samples + mnli_samples]
labels = [label for sentence, sentence2, label in snli_samples + mnli_samples] 

snli_test_str_sentences1 = [sentence for sentence, sentence2, label in snli_test_samples]
snli_test_str_sentences2 = [sentence2 for sentence, sentence2, label in snli_test_samples]
snli_test_labels = [label for sentence, sentence2, label in snli_test_samples] 

mnli_test_str_sentences1 = [sentence for sentence, sentence2, label in mnli_test_samples]
mnli_test_str_sentences2 = [sentence2 for sentence, sentence2, label in mnli_test_samples]
mnli_test_labels = [label for sentence, sentence2, label in mnli_test_samples] 

numeric_labels = []
for label in labels + snli_test_labels + mnli_test_labels:
    if label == 'neutral':
        numeric_labels.append(0)
    elif label == 'contradiction':
        numeric_labels.append(1)
    elif label == 'entailment':
        numeric_labels.append(2)
    else:
        numeric_labels.append(3)
        
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(str_sentences1 + str_sentences2 + 
                       snli_test_str_sentences1 + snli_test_str_sentences2 +
                      mnli_test_str_sentences1 + mnli_test_str_sentences2)
sequences1 = tokenizer.texts_to_sequences(str_sentences1)
sequences2 = tokenizer.texts_to_sequences(str_sentences2)
snli_test_sequences1 = tokenizer.texts_to_sequences(snli_test_str_sentences1)
snli_test_sequences2 = tokenizer.texts_to_sequences(snli_test_str_sentences2)
mnli_test_sequences1 = tokenizer.texts_to_sequences(mnli_test_str_sentences1)
mnli_test_sequences2 = tokenizer.texts_to_sequences(mnli_test_str_sentences2)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


In [None]:
print("Saving Tokenizer...")
pickle.dump(tokenizer, open("../data/tokenizer.p", "wb"))
print("Done.")

In [None]:
print("Saving word index...")
pickle.dump(word_index, open("../data/word_index.p", "wb"))
print("Done.")

In [None]:
print("Loading word index...")
word_index = pickle.load(open("../data/word_index.p", "rb"))
print("Done.")

In [None]:
print(type(tokenizer))

In [None]:
print("shape of sequences1: {}.".format(np.shape(sequences1)))
print("shape of sequences2: {}.".format(np.shape(sequences2)))
print("shape of snli_test_sequences1: {}.".format(np.shape(snli_test_sequences1)))
print("shape of snli_test_sequences2: {}.".format(np.shape(snli_test_sequences2)))
print("mhape of snli_test_sequences1: {}.".format(np.shape(mnli_test_sequences1)))
print("mhape of snli_test_sequences2: {}.".format(np.shape(mnli_test_sequences2)))

data1 = simple_padding(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
data2 = simple_padding(sequences2, maxlen=MAX_SEQUENCE_LENGTH)
snli_test_data1 = simple_padding(snli_test_sequences1, maxlen=MAX_SEQUENCE_LENGTH)
snli_test_data2 = simple_padding(snli_test_sequences2, maxlen=MAX_SEQUENCE_LENGTH)
mnli_test_data1 = simple_padding(mnli_test_sequences1, maxlen=MAX_SEQUENCE_LENGTH)
mnli_test_data2 = simple_padding(mnli_test_sequences2, maxlen=MAX_SEQUENCE_LENGTH)

print("shape of data1: {}.".format(np.shape(data1)))
print("shape of data2: {}.".format(np.shape(data2)))
print("shape of snli_test_data1: {}.".format(np.shape(snli_test_data1)))
print("shape of snli_test_data2: {}.".format(np.shape(snli_test_data2)))
print("shape of mnli_test_data1: {}.".format(np.shape(mnli_test_data1)))
print("shape of mnli_test_data2: {}.".format(np.shape(mnli_test_data2)))

training_data = np.asarray([data1, data2])
snli_testing_data = np.asarray([snli_test_data1, snli_test_data2])
mnli_testing_data = np.asarray([mnli_test_data1, mnli_test_data2])

all_labels = to_categorical(np.asarray(numeric_labels))
training_labels = all_labels[:len(labels)]
snli_testing_labels = all_labels[len(labels):len(labels) + len(snli_test_labels)]
mnli_testing_labels = all_labels[len(labels) + len(snli_test_labels):]

##########CONTINUE HERE###############

print('Shape of training_data tensor:', training_data.shape)
print('Shape of training_labels tensor:', training_labels.shape)
print('Shape of snli_testing_data tensor:', snli_testing_data.shape)
print('Shape of snli_testing_labels tensor:', snli_testing_labels.shape)
print('Shape of mnli_testing_data tensor:', mnli_testing_data.shape)
print('Shape of mnli_testing_labels tensor:', mnli_testing_labels.shape)

# not needed here, because training and testing data are already splitted
# split the data into a training set and a validation set
#indices = np.arange(data.shape[1])
#np.random.shuffle(indices)
#data = data[:,indices,:]
#labels = labels[indices,:]
#num_validation_samples = int(VALIDATION_SPLIT * data.shape[1])

#x_train = data[:,:-num_validation_samples,:]
#y_train = labels[:-num_validation_samples]
#x_val = data[:,-num_validation_samples:,:]
#y_val = labels[-num_validation_samples:]

#print("Shape of x_train: {}".format(np.shape(x_train)))
#print("Shape of y_train: {}".format(np.shape(y_train)))
#print("Shape of x_val: {}".format(np.shape(x_val)))
#print("Shape of y_val: {}".format(np.shape(y_val)))

print("Storing training and test data to hdf5...")

f = h5py.File('../data/deep_training_data.hdf5', 'a')
if not 'training_data' in f: x_train = f.create_dataset('training_data', data=training_data)
if not 'training_labels' in f: y_train = f.create_dataset('training_labels', data=training_labels)
if not 'snli_testing_data' in f: snli_x_val = f.create_dataset('snli_testing_data', data=snli_testing_data)
if not 'snli_testing_labels' in f: snli_y_val = f.create_dataset('snli_testing_labels', data=snli_testing_labels)
if not 'mnli_testing_data' in f: mnli_x_val = f.create_dataset('mnli_testing_data', data=mnli_testing_data)
if not 'mnli_testing_labels' in f: mnli_y_val = f.create_dataset('mnli_testing_labels', data=mnli_testing_labels)
    
print("Shape of x_train: {}".format(np.shape(f['training_data'])))
print("Shape of y_train: {}".format(np.shape(f['training_labels'])))
print("Shape of snli_x_val: {}".format(np.shape(f['snli_testing_data'])))
print("Shape of snli_y_val: {}".format(np.shape(f['snli_testing_labels'])))
print("Shape of mnli_x_val: {}".format(np.shape(f['mnli_testing_data'])))
print("Shape of mnli_y_val: {}".format(np.shape(f['mnli_testing_labels'])))
f.close()
print("Done.")

Compute an embedding matrix (the first layer of later models) and store it for later use.

In [6]:
print('Preparing embedding matrix.')
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM)) #TODO: is this correct? In the example, the first dimension is
# num_words, this, however, throws an error when populating the embedding matrix (because the word-indices start at 1, not at 0)
print("shape of embedding matrix: {}".format(np.shape(embedding_matrix)))

for word, i in word_index.items():
    #print("{}: {}".format(i,word))
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = None
    
    try:
        embedding_vector = vectorspace[word]
    except:
        pass
    
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False) #set trainable = True to enable training of the embeddings to the task at hand

#save as a model for later use
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sentence = embedding_layer(sentence_input)

embedding_model = Model(input=sentence_input, output=embedded_sentence)
embedding_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

print("Saving model...")
save_model(embedding_model, '../data/embedding_layer.hdf5')
print(embedding_layer.input_dim)
print(embedding_layer.output_dim)

Preparing embedding matrix.


NameError: name 'MAX_NB_WORDS' is not defined

In [7]:
print('Loading model...')
embedding_model = load_model(LOCAL_MODEL_PATH + 'embedding_layer.hdf5')
print('Done.')
print(embedding_model.input_shape)
print(embedding_model.output_shape)
print(np.shape(embedding_model.get_weights()))
print(embedding_model.summary())

Loading model...
Done.
(None, 100)
(None, 100, 300)
(1, 91461, 300)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          27438300  
Total params: 27,438,300
Trainable params: 0
Non-trainable params: 27,438,300
_________________________________________________________________
None


## Model Architectures

The most simple model for encoding the sentences:

In [None]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#encode first sentence
embedded_sentence1 = embedding_model(sentence1_input)
x = Conv1D(128, 5, activation='relu')(embedded_sentence1)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
encoded_sentence1 = MaxPooling1D(35)(x)

#encode second sentence
embedded_sentence2 = embedding_layer(sentence2_input)
y = Conv1D(128, 5, activation='relu')(embedded_sentence2)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
encoded_sentence2 = MaxPooling1D(35)(y)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [encoded_sentence1, encoded_sentence2], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
model1 = Model(input=[sentence1_input, sentence2_input], output=preds)
model1.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [None]:
model1.summary()

Nearly the same model but using a shared architecture for embedding both sentences, so that it will profit from all examples:

In [10]:
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_model(sentence_input)
x = Conv1D(128, 3, activation='relu')(sentence_embedding)
x = MaxPooling1D(4)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
encoded_sentence = MaxPooling1D(5)(x)

sentence_embedding_model = Model(inputs=sentence_input, outputs=encoded_sentence)

sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence1_embedding = sentence_embedding_model(sentence1_input)
sentence2_embedding = sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(512, activation='relu')(flat)
x = Dense(64, activation='relu')(x)
preds = Dense(4,activation='softmax')(x)

#compile the model
conv_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
conv_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

  name=name)


In [8]:
sentence_embedding_model.summary()
conv_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 100, 300)          27438300  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 128)           115328    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 24, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 22, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 7, 128)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 5, 128)            49280     
__________

Question for later: why doesn't convolution change the output size? Shouldnt it go down from to 100 to 98? -> solved, was because of bordermode = "same".

## Training the model with the snli dataset

In [None]:
print("Shape of x_train: {}".format(np.shape(f['training_data'])))
print("Shape of y_train: {}".format(np.shape(f['training_labels'])))
print("Shape of snli_x_val: {}".format(np.shape(f['snli_testing_data'])))
print("Shape of snli_y_val: {}".format(np.shape(f['snli_testing_labels'])))
print("Shape of mnli_x_val: {}".format(np.shape(f['mnli_testing_data'])))
print("Shape of mnli_y_val: {}".format(np.shape(f['mnli_testing_labels'])))

In [8]:
def training_data_generator(x_train, y_train, num_batches, batch_size):
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_train[0,i:i+batch_size,:],x_train[1,i:i+batch_size,:]], y_train[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        yield gen_output

def val_data_generator(x_test, y_test, num_batches, batch_size):
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_test[0,i:i+batch_size,:],x_test[1,i:i+batch_size,:]], y_test[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        yield gen_output

In [11]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
x_train = f['training_data']
y_train = f['training_labels']
snli_x_val = f['snli_testing_data']
snli_y_val = f['snli_testing_labels']
mnli_x_val = f['mnli_testing_data']
mnli_y_val = f['mnli_testing_labels']
print("Done.")

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of snli_x_val: {}".format(np.shape(snli_x_val)))
print("Shape of snli_y_val: {}".format(np.shape(snli_y_val)))
print("Shape of mnli_x_val: {}".format(np.shape(mnli_x_val)))
print("Shape of mnli_y_val: {}".format(np.shape(mnli_y_val)))
       
batch_size = 256
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of batches until epoch is finished
nb_val_samples = int(np.floor(np.shape(snli_y_val)[0] / batch_size))

print("Batches per epoch training: {}".format(samples_per_epoch))
print("Batches per epoch validation: {}".format(nb_val_samples))

print("Training the model...")

conv_model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=10,
          steps_per_epoch = samples_per_epoch,
          validation_data=val_data_generator(snli_x_val, snli_y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples,
          verbose=2)
print("Done.")

f.close()

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Batches per epoch training: 3677
Batches per epoch validation: 39
Training the model...
Epoch 1/10




277s - loss: 0.7022 - acc: 0.7001 - val_loss: 0.9628 - val_acc: 0.6417
Epoch 2/10
276s - loss: 0.6846 - acc: 0.7105 - val_loss: 0.9441 - val_acc: 0.6530
Epoch 3/10
276s - loss: 0.6667 - acc: 0.7199 - val_loss: 0.9145 - val_acc: 0.6642
Epoch 4/10
276s - loss: 0.6522 - acc: 0.7278 - val_loss: 0.9196 - val_acc: 0.6655
Epoch 5/10
275s - loss: 0.6402 - acc: 0.7339 - val_loss: 0.9158 - val_acc: 0.6704
Epoch 6/10
275s - loss: 0.6297 - acc: 0.7397 - val_loss: 0.9111 - val_acc: 0.6750
Epoch 7/10
276s - loss: 0.6207 - acc: 0.7443 - val_loss: 0.9108 - val_acc: 0.6799
Epoch 8/10
275s - loss: 0.6129 - acc: 0.7483 - val_loss: 0.9041 - val_acc: 0.6861
Epoch 9/10
275s - loss: 0.6055 - acc: 0.7520 - val_loss: 0.9030 - val_acc: 0.6838
Epoch 10/10
275s - loss: 0.5987 - acc: 0.7552 - val_loss: 0.8948 - val_acc: 0.6875
Done.


In [15]:
print("saving the model...")
conv_model.save_weights(LOCAL_MODEL_PATH + 'conv_model_snli+mnli.hdf5')
conv_model.save_weights(HARD_DRIVE_MODEL_PATH + 'conv_model_snli+mnli.hdf5')
print("done.")

saving the model...
done.


In [12]:
print("loading model weights...")
conv_model.load_weights(HARD_DRIVE_MODEL_PATH + 'conv_model_snli+mnli.hdf5')
print("done.")

loading model weights...
done.


In [9]:
def eval_model(model, dataset, batch_size=256, nb_epochs=10):
    print("Loading training and validation data...")
    f = h5py.File('../data/deep_training_data.hdf5', 'a')
    if dataset == 'snli':
        x_val = f['snli_testing_data']
        y_val = f['snli_testing_labels']
    elif dataset == 'mnli':
        x_val = f['mnli_testing_data']
        y_val = f['mnli_testing_labels']
    print("Done.")

    print("Evaluating on the snli data")
    nb_val_samples = int(np.floor(np.shape(y_val)[0] / batch_size))
    score, acc = model.evaluate_generator(val_data_generator(x_val, y_val, num_batches=nb_val_samples, batch_size= batch_size),
                                steps = nb_val_samples)
    print("Score: {}, Accuracy: {}".format(score, acc))
    f.close()
    return score, acc

In [21]:
score, acc = eval_model(conv_model, "snli")
print("SNLI-results: score = {}, acc = {}".format(score, acc))
score, acc = eval_model(conv_model, "mnli")
print("MNLI-results: score = {}, acc = {}".format(score, acc))

Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.8947591491234608, Accuracy: 0.6875
SNLI-results: score = 0.8947591491234608, acc = 0.6875
Loading training and validation data...
Done.
Evaluating on the snli data
Score: 1.053122070274855, Accuracy: 0.6121504934210527
MNLI-results: score = 1.053122070274855, acc = 0.6121504934210527


### Alternative Convolutional Model that also returns the encoded sentences as vectors

In [None]:
MAX_SEQUENCE_LENGTH

In [None]:
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_model(sentence_input)
x = Conv1D(128, 3, activation='relu')(sentence_embedding)
x = MaxPooling1D(4)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
encoded_sentence = MaxPooling1D(5)(x)

sentence_embedding_model = Model(inputs=sentence_input, outputs=encoded_sentence)

sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence1_embedding = sentence_embedding_model(sentence1_input)
sentence2_embedding = sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
x = Dense(256, activation='relu')(x)
preds = Dense(4,activation='softmax')(x)

#compile the model
alt_conv_model = Model(inputs=[sentence1_input, sentence2_input], outputs=[preds, sentence1_embedding, sentence2_embedding])
alt_conv_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [None]:
print("loading model weights...")
alt_conv_model.load_weights('../models/conv_model_snli+mnli.hdf5')
print("done.")

## Recurrent Models

In [10]:
from keras.layers.core import *
LSTM_UNITS = 64    
SINGLE_ATTENTION_VECTOR = True

### Attention model experiments

In [74]:
from keras.layers import GRU, LSTM, Bidirectional, Dropout, TimeDistributed
from keras.optimizers import Adam

sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#1. Embed
sentence_embedding = embedding_model(sentence_input)

#2. Encode
# see keras example imdb_cnn_lstm
encoded_sentence = Dropout(0.25)(sentence_embedding)

#should output 64 feature maps with 104(narrow) or 106(wide) dims each
output_lstm = Bidirectional(LSTM(24, activation='relu', return_sequences=True))(encoded_sentence)

#3. Attend
#compute importance of sequences
#TODO dimensionen ergeben keinen sinn!
attention = TimeDistributed(Dense(1, activation='tanh'), name='importances')(output_lstm)
attention = Flatten()(attention)
attention = Activation('softmax', name="alphas")(attention)
attention = RepeatVector(48)(attention)
attention = Permute((2,1))(attention)

#elemwise mult with sequences and then weighte sum
encoded_sentence = multiply([output_lstm, attention], name="mul_with_alpha")
encoded_sentence = Lambda(lambda x: K.sum(x, axis=1, keepdims=False), output_shape=(64,), name="weighted_sum")(encoded_sentence)

lstm_simple_attention_encoding_model = Model(inputs=sentence_input, outputs=attention)

In [75]:
lstm_simple_attention_encoding_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_73 (InputLayer)        (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 100, 300)          27438300  
_________________________________________________________________
dropout_13 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 100, 48)           62400     
_________________________________________________________________
importances (TimeDistributed (None, 100, 1)            49        
_________________________________________________________________
flatten_20 (Flatten)         (None, 100)               0         
_________________________________________________________________
alphas (Activation)          (None, 100)               0         
__________

## Second Attention Model
The idea here is to learn weights for all possible combinations of sentence representations (n^2 for n lstm outputs), do softmax for them and then compute a weighted sum over all possible concatentation vectors.

In [12]:
from keras.layers.merge import concatenate, multiply
from keras.layers.core import *

In [13]:
from keras.layers import GRU, LSTM, Bidirectional, Dropout, TimeDistributed
from keras.layers.convolutional import Conv1D, MaxPooling1D, AveragePooling1D
from keras.optimizers import Adam

sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#1. Embed
sentence_embedding = embedding_model(sentence_input)

#2. Encode
# see keras example imdb_cnn_lstm
encoded_sentence = Dropout(0.25)(sentence_embedding)

#should output 64 feature maps with 104(narrow) or 106(wide) dims each
encoded_sentence = Conv1D(64, kernel_size=5, activation="relu", padding="valid", strides=1)(encoded_sentence)
encoded_sentence = MaxPooling1D(pool_size=4)(encoded_sentence)
encoded_sentence = Permute((2,1))(encoded_sentence)
encoded_sentence = Bidirectional(LSTM(24, activation='relu', return_sequences=True))(encoded_sentence)
output_lstm = Bidirectional(LSTM(24, activation='relu', return_sequences=True, name='output_lstm'))(encoded_sentence)
lstm_sentence_encoding_model = Model(inputs=sentence_input, outputs=output_lstm)

In [67]:
lstm_sentence_encoding_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_84 (InputLayer)        (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 100, 300)          27438300  
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 96, 64)            96064     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
permute_18 (Permute)         (None, 64, 24)            0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 64, 48)            9408      
__________

In [35]:

#3. Attention model

sent1_encodings = Input(shape=(64,48))
sent2_encodings = Input(shape=(64,48))

#sent1_permute = Permute((2,1))(sent1_encodings)
#sent2_permute = Permute((2,1))(sent2_encodings)

concat_repr = concatenate(inputs=[sent1_encodings, sent1_encodings])
concat_repr = Permute((2,1))(concat_repr)

importances = TimeDistributed(Dense(1, activation='relu'), name='importances')(concat_repr)
importances = Flatten()(importances)
weights = Activation('softmax')(importances)
weights_broadcasted = RepeatVector(64)(weights)
weights_broadcasted = Permute((2,1))(weights_broadcasted)

#elemwise mult with sequences and then weighte sum
encoded_sentence = multiply([concat_repr, weights_broadcasted],  name="mul_with_alpha")
encoded_sentence = Lambda(lambda x: K.sum(x, axis=2, keepdims=False), output_shape=(96,), name="weighted_sum")(encoded_sentence)

attn_model = Model(inputs=[sent1_encodings, sent2_encodings], outputs=[encoded_sentence])

In [36]:
attn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        (None, 64, 48)            0         
_________________________________________________________________
concatenate_9 (Concatenate)  (None, 64, 96)            0         
_________________________________________________________________
permute_15 (Permute)         (None, 96, 64)            0         
_________________________________________________________________
importances (TimeDistributed (None, 96, 1)             65        
_________________________________________________________________
flatten_8 (Flatten)          (None, 96)                0         
_________________________________________________________________
activation_7 (Activation)    (None, 96)                0         
_________________________________________________________________
repeat_vector_8 (RepeatVecto (None, 64, 96)            0         
__________

In [41]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#embed and encode
sentence1_embedding = lstm_sentence_encoding_model(sentence1_input)
sentence2_embedding = lstm_sentence_encoding_model(sentence2_input)

#attend (returns already merged vectors)
sentences_attended = attn_model(inputs=[sentence1_embedding, sentence2_embedding])

#predict
x = Dense(512, activation='relu')(sentences_attended)
x = Dense(64, activation='relu')(x)
#x = Dense(64, activation='relu')(x)
preds = Dense(4, activation='softmax')(x)

#compile the model
clip_adam = Adam(clipnorm=1.)
lstm_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
lstm_model.compile(loss='mse',
             optimizer=clip_adam,
             metrics=['acc'])

In [42]:
lstm_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_35 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
input_36 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
model_2 (Model)                  (None, 64, 48)        27557788                                     
____________________________________________________________________________________________________
model_10 (Model)                 (None, 96)            65                                           
___________________________________________________________________________________________

In [43]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
x_train = f['training_data']
y_train = f['training_labels']
snli_x_val = f['snli_testing_data']
snli_y_val = f['snli_testing_labels']
mnli_x_val = f['mnli_testing_data']
mnli_y_val = f['mnli_testing_labels']
print("Done.")

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of snli_x_val: {}".format(np.shape(snli_x_val)))
print("Shape of snli_y_val: {}".format(np.shape(snli_y_val)))
print("Shape of mnli_x_val: {}".format(np.shape(mnli_x_val)))
print("Shape of mnli_y_val: {}".format(np.shape(mnli_y_val)))

batch_size = 128
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of batches until epoch is finished
nb_val_samples = int(np.floor(np.shape(snli_y_val)[0] / batch_size))

print("Training the Reccurent model...")

lstm_model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=10,
          steps_per_epoch = samples_per_epoch,
          validation_data=val_data_generator(snli_x_val, snli_y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples,
          verbose=2)
print("Done.")

f.close()

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Training the Reccurent model...




Epoch 1/10
1100s - loss: 0.1685 - acc: 0.3391 - val_loss: 0.1707 - val_acc: 0.3218
Epoch 2/10
1075s - loss: 0.1667 - acc: 0.3395 - val_loss: 0.1705 - val_acc: 0.3218
Epoch 3/10
1072s - loss: 0.1667 - acc: 0.3393 - val_loss: 0.1705 - val_acc: 0.3218
Epoch 4/10
1070s - loss: 0.1667 - acc: 0.3392 - val_loss: 0.1704 - val_acc: 0.3218
Epoch 5/10
1070s - loss: 0.1667 - acc: 0.3392 - val_loss: 0.1704 - val_acc: 0.3218
Epoch 6/10
1071s - loss: 0.1667 - acc: 0.3392 - val_loss: 0.1704 - val_acc: 0.3218
Epoch 7/10
1070s - loss: 0.1667 - acc: 0.3393 - val_loss: 0.1704 - val_acc: 0.3218
Epoch 8/10
1070s - loss: 0.1667 - acc: 0.3392 - val_loss: 0.1704 - val_acc: 0.3218
Epoch 9/10
1070s - loss: 0.1667 - acc: 0.3392 - val_loss: 0.1704 - val_acc: 0.3218
Epoch 10/10
1070s - loss: 0.1667 - acc: 0.3392 - val_loss: 0.1704 - val_acc: 0.3218
Done.


# Third attention model

TODO:
- nochmal einfach ohne attention laufen lassen
- meinen vorigen attention mechanismus auch draufwerfen
- vergleich welcher besser funktioniert

In [15]:
import theano
import theano.tensor as T

In [17]:
#http://anie.me/Numpy-and-Theano-Indexing-and-Slicing/
x = T.lvector()
y = x[1:2]
y.eval({x: np.asarray([1,2,3,4])})

array([2])

### LSTM encoding

In [19]:
from keras.layers import GRU, LSTM, Bidirectional, Dropout, TimeDistributed
from keras.optimizers import Adam

sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#1. Embed
sentence_embedding = embedding_model(sentence_input)

#2. Encode
# see keras example imdb_cnn_lstm
#encoded_sentence = Dropout(0.25)(sentence_embedding)

#should output 64 feature maps with 104(narrow) or 106(wide) dims each
#encoded_sentence = Bidirectional(LSTM(24, activation='relu', return_sequences=True))(encoded_sentence)
output_lstm = Bidirectional(LSTM(24, activation='relu', return_sequences=True, name='output_lstm'))(sentence_embedding)
lstm_sentence_encoding_model = Model(inputs=sentence_input, outputs=output_lstm)

### Simple Baseline Model without Attention

In [49]:
#baseline_model
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#embed and encode
sentence1_embedding = lstm_sentence_encoding_model(sentence1_input)
sentence2_embedding = lstm_sentence_encoding_model(sentence2_input)

# (no attention)

#predict
x = concatenate(inputs=[sentence1_embedding, sentence2_embedding])
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)
preds = Dense(4, activation='softmax')(x)

#compile the model
clip_adam = Adam(clipnorm=1.)
baseline_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
baseline_model.compile(loss='mse',
             optimizer=clip_adam,
             metrics=['acc'])

In [50]:
baseline_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_54 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
input_55 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
model_4 (Model)                  (None, 100, 48)       27500700                                     
____________________________________________________________________________________________________
concatenate_24 (Concatenate)     (None, 100, 96)       0                                            
___________________________________________________________________________________________

### Simple Attention Model
Attends sentences seperately. This should only be able to learn important setence positions in general.

In [78]:
from keras.layers import GRU, LSTM, Bidirectional, Dropout, TimeDistributed
from keras.optimizers import Adam

sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#1. Embed
sentence_embedding = embedding_model(sentence_input)

#2. Encode
# see keras example imdb_cnn_lstm
encoded_sentence = Dropout(0.25)(sentence_embedding)

#should output 64 feature maps with 104(narrow) or 106(wide) dims each
output_lstm = Bidirectional(LSTM(24, activation='relu', return_sequences=True))(encoded_sentence)

#3. Attend
#compute importance of sequences
#TODO dimensionen ergeben keinen sinn!
attention = TimeDistributed(Dense(1, activation='tanh'), name='importances')(output_lstm)
attention = Flatten()(attention)
attention = Activation('softmax', name="alphas")(attention)
attention = RepeatVector(48)(attention)
attention = Permute((2,1))(attention)

#elemwise mult with sequences and then weighte sum
encoded_sentence = multiply([output_lstm, attention], name="mul_with_alpha")
encoded_sentence = Lambda(lambda x: K.sum(x, axis=1, keepdims=False), output_shape=(48,), name="weighted_sum")(encoded_sentence)

lstm_simple_attention_encoding_model = Model(inputs=sentence_input, outputs=encoded_sentence)

In [79]:
lstm_simple_attention_encoding_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_75 (InputLayer)        (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 100, 300)          27438300  
_________________________________________________________________
dropout_15 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 100, 48)           62400     
_________________________________________________________________
importances (TimeDistributed (None, 100, 1)            49        
_________________________________________________________________
flatten_22 (Flatten)         (None, 100)               0         
_________________________________________________________________
alphas (Activation)          (None, 100)               0         
__________

In [89]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#embed, encode and predict
sentence1_encoding = lstm_simple_attention_encoding_model(sentence1_input)
sentence2_encoding = lstm_simple_attention_encoding_model(sentence2_input)

#predict
x = concatenate(inputs=[sentence1_encoding, sentence2_encoding])
x = Dense(512, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)
preds = Dense(4, activation='softmax')(x)

#compile the model
clip_adam = Adam(clipnorm=1.)
lstm_simple_attention_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
lstm_simple_attention_model.compile(loss='mse',
             optimizer=clip_adam,
             metrics=['acc'])

In [83]:
lstm_simple_attention_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_80 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
input_81 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
model_24 (Model)                 (None, 48)            27500749                                     
____________________________________________________________________________________________________
concatenate_31 (Concatenate)     (None, 96)            0                                            
___________________________________________________________________________________________

### Complex Attention model
Uses information about the other sentence to compute weights

In [51]:
#Sequences of length 100, each output of lstm has dim 48
sent1_encodings = Input(shape=(MAX_SEQUENCE_LENGTH,48))
sent2_encodings = Input(shape=(MAX_SEQUENCE_LENGTH,48))

last_elem_sent1 = Lambda(lambda x: x[:,MAX_SEQUENCE_LENGTH-1:MAX_SEQUENCE_LENGTH,:], output_shape=(1,48))(sent1_encodings)
last_elem_sent1 = Reshape((48,))(last_elem_sent1)

last_elem_sent2 = Lambda(lambda x: x[:,MAX_SEQUENCE_LENGTH-1:MAX_SEQUENCE_LENGTH,:], output_shape=(1,48))(sent2_encodings)
last_elem_sent2 = Reshape((48,))(last_elem_sent2)


repeat_last_elem1 = RepeatVector(MAX_SEQUENCE_LENGTH)(last_elem_sent1)
repeat_last_elem2 = RepeatVector(MAX_SEQUENCE_LENGTH)(last_elem_sent2)

concat_repr_1 = concatenate(inputs=[sent1_encodings, repeat_last_elem1], axis=2)
concat_repr_2 = concatenate(inputs=[sent2_encodings, repeat_last_elem2], axis=2)

lstm_outputs = Input(shape=(MAX_SEQUENCE_LENGTH,96))
importances = TimeDistributed(Dense(1, activation='relu'), name='importances')(lstm_outputs)
importances = Flatten()(importances)
weights = Activation('softmax')(importances)
weights_broadcasted = RepeatVector(48)(weights)
weights_broadcasted = Permute((2,1))(weights_broadcasted)

weighting = Model(inputs=[lstm_outputs], outputs=[weights_broadcasted], name="weighting_model")

weights_sent1 = weighting(concat_repr_1)
weights_sent2 = weighting(concat_repr_2)

attended_sentence1 = multiply([sent1_encodings, weights_sent1],  name="mul_with_alpha1")
attended_sentence1 = Lambda(lambda x: K.sum(x, axis=1, keepdims=False), output_shape=(48,), name="final_encoding_1")(attended_sentence1)

attended_sentence2 = multiply([sent1_encodings, weights_sent2],  name="mul_with_alpha2")
attended_sentence2 = Lambda(lambda x: K.sum(x, axis=1, keepdims=False), output_shape=(48,), name="final_encoding_2")(attended_sentence2)

attn_model = Model(inputs=[sent1_encodings, sent2_encodings], outputs=[attended_sentence1, attended_sentence2])

In [42]:
print(weighting.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_47 (InputLayer)        (None, 100, 96)           0         
_________________________________________________________________
importances (TimeDistributed (None, 100, 1)            97        
_________________________________________________________________
flatten_8 (Flatten)          (None, 100)               0         
_________________________________________________________________
activation_4 (Activation)    (None, 100)               0         
_________________________________________________________________
repeat_vector_32 (RepeatVect (None, 48, 100)           0         
_________________________________________________________________
permute_7 (Permute)          (None, 100, 48)           0         
Total params: 97
Trainable params: 97
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
print(attn_model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_45 (InputLayer)            (None, 100, 48)       0                                            
____________________________________________________________________________________________________
input_46 (InputLayer)            (None, 100, 48)       0                                            
____________________________________________________________________________________________________
lambda_34 (Lambda)               (None, 1, 48)         0                                            
____________________________________________________________________________________________________
lambda_35 (Lambda)               (None, 1, 48)         0                                            
___________________________________________________________________________________________

In [55]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#embed and encode
sentence1_embedding = lstm_sentence_encoding_model(sentence1_input)
sentence2_embedding = lstm_sentence_encoding_model(sentence2_input)

#attend
attn_sent1, attn_sent2 = attn_model(inputs=[sentence1_embedding, sentence2_embedding])

#predict
x = concatenate(inputs=[attn_sent1, attn_sent2])
x = Dense(512, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)
preds = Dense(4, activation='softmax')(x)

#compile the model
clip_adam = Adam(clipnorm=1.)
lstm_complex_attention_model = Model(inputs=[sentence1_input, sentence2_input], outputs=preds)
lstm_complex_attention_model.compile(loss='mse',
             optimizer=clip_adam,
             metrics=['acc'])

In [56]:
lstm_complex_attention_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_61 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
input_62 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
model_4 (Model)                  (None, 100, 48)       27500700                                     
____________________________________________________________________________________________________
model_11 (Model)                 [(None, 48), (None, 4 97                                           
___________________________________________________________________________________________

In [46]:
def train_model(model, batch_size = 128):
    print("Loading training and validation data...")
    f = h5py.File('../data/deep_training_data.hdf5', 'a')
    x_train = f['training_data']
    y_train = f['training_labels']
    snli_x_val = f['snli_testing_data']
    snli_y_val = f['snli_testing_labels']
    mnli_x_val = f['mnli_testing_data']
    mnli_y_val = f['mnli_testing_labels']
    print("Done.")

    print("Shape of x_train: {}".format(np.shape(x_train)))
    print("Shape of y_train: {}".format(np.shape(y_train)))
    print("Shape of snli_x_val: {}".format(np.shape(snli_x_val)))
    print("Shape of snli_y_val: {}".format(np.shape(snli_y_val)))
    print("Shape of mnli_x_val: {}".format(np.shape(mnli_x_val)))
    print("Shape of mnli_y_val: {}".format(np.shape(mnli_y_val)))

    samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of batches until epoch is finished
    nb_val_samples = int(np.floor(np.shape(snli_y_val)[0] / batch_size))

    print("Training the model...")

    model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
              nb_epoch=10,
              steps_per_epoch = samples_per_epoch,
              validation_data=val_data_generator(snli_x_val, snli_y_val, num_batches=nb_val_samples, batch_size= batch_size),
              nb_val_samples = nb_val_samples,
              verbose=2)
    print("Done.")

    f.close()
    return model

## Training 3 Models for Comparison of the Attention Models

In [85]:
baseline_model = train_model(baseline_model)
baseline_model.save_weights(HARD_DRIVE_MODEL_PATH + "baseline_attn.hdf")

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Training the model...
Epoch 1/10




1583s - loss: 0.0638 - acc: 0.8330 - val_loss: 0.1123 - val_acc: 0.7048
Epoch 2/10
1582s - loss: 0.0616 - acc: 0.8401 - val_loss: 0.1130 - val_acc: 0.7016
Epoch 3/10
1584s - loss: 0.0595 - acc: 0.8462 - val_loss: 0.1136 - val_acc: 0.7065
Epoch 4/10
1584s - loss: 0.0577 - acc: 0.8517 - val_loss: 0.1163 - val_acc: 0.7018
Epoch 5/10
1584s - loss: 0.0561 - acc: 0.8565 - val_loss: 0.1156 - val_acc: 0.7056
Epoch 6/10
1584s - loss: 0.0546 - acc: 0.8608 - val_loss: 0.1169 - val_acc: 0.7021
Epoch 7/10
1583s - loss: 0.0531 - acc: 0.8655 - val_loss: 0.1191 - val_acc: 0.7014
Epoch 8/10
1584s - loss: 0.0517 - acc: 0.8696 - val_loss: 0.1191 - val_acc: 0.7032
Epoch 9/10
1585s - loss: 0.0505 - acc: 0.8730 - val_loss: 0.1217 - val_acc: 0.6933
Epoch 10/10
1583s - loss: 0.0493 - acc: 0.8765 - val_loss: 0.1225 - val_acc: 0.6953
Done.


In [86]:
score, acc = eval_model(baseline_model, "snli")
print("SNLI-results: score = {}, acc = {}".format(score, acc))
score, acc = eval_model(baseline_model, "mnli")
print("MNLI-results: score = {}, acc = {}".format(score, acc))

Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.12246769265486644, Accuracy: 0.6953125
SNLI-results: score = 0.12246769265486644, acc = 0.6953125
Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.1671203825818865, Accuracy: 0.5833675986842105
MNLI-results: score = 0.1671203825818865, acc = 0.5833675986842105


In [90]:
lstm_simple_attention_model = train_model(lstm_simple_attention_model)
lstm_simple_attention_model.save_weights(HARD_DRIVE_MODEL_PATH + "lstm_simple_attention_model.hdf")

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Training the model...




Epoch 1/10
1571s - loss: 0.1022 - acc: 0.6980 - val_loss: 0.1049 - val_acc: 0.7026
Epoch 2/10
1570s - loss: 0.0965 - acc: 0.7179 - val_loss: 0.0957 - val_acc: 0.7316
Epoch 3/10
1566s - loss: 0.0947 - acc: 0.7241 - val_loss: 0.0928 - val_acc: 0.7456
Epoch 4/10
1565s - loss: 0.0937 - acc: 0.7277 - val_loss: 0.0924 - val_acc: 0.7446
Epoch 5/10
1565s - loss: 0.0927 - acc: 0.7309 - val_loss: 0.0897 - val_acc: 0.7490
Epoch 6/10
1564s - loss: 0.0919 - acc: 0.7337 - val_loss: 0.0902 - val_acc: 0.7483
Epoch 7/10
1565s - loss: 0.0912 - acc: 0.7363 - val_loss: 0.0889 - val_acc: 0.7490
Epoch 8/10
1568s - loss: 0.0906 - acc: 0.7382 - val_loss: 0.0888 - val_acc: 0.7532
Epoch 9/10
1567s - loss: 0.0902 - acc: 0.7398 - val_loss: 0.0884 - val_acc: 0.7525
Epoch 10/10
1565s - loss: 0.0898 - acc: 0.7413 - val_loss: 0.0870 - val_acc: 0.7555
Done.


In [91]:
score, acc = eval_model(lstm_simple_attention_model, "snli")
print("SNLI-results: score = {}, acc = {}".format(score, acc))
score, acc = eval_model(lstm_simple_attention_model, "mnli")
print("MNLI-results: score = {}, acc = {}".format(score, acc))

Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.08703897358515324, Accuracy: 0.7555088141025641
SNLI-results: score = 0.08703897358515324, acc = 0.7555088141025641
Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.11487559248742304, Accuracy: 0.6592310855263158
MNLI-results: score = 0.11487559248742304, acc = 0.6592310855263158


In [92]:
lstm_complex_attention_model = train_model(lstm_complex_attention_model)
lstm_complex_attention_model.save_weights(HARD_DRIVE_MODEL_PATH + "lstm_complex_attention_model.hdf")

Loading training and validation data...
Done.
Shape of x_train: (2, 941317, 100)
Shape of y_train: (941317, 4)
Shape of snli_x_val: (2, 10000, 100)
Shape of snli_y_val: (10000, 4)
Shape of mnli_x_val: (2, 9897, 100)
Shape of mnli_y_val: (9897, 4)
Training the model...




Epoch 1/10
1561s - loss: 0.1284 - acc: 0.5914 - val_loss: 0.1358 - val_acc: 0.5839
Epoch 2/10
1558s - loss: 0.1189 - acc: 0.6319 - val_loss: 0.1330 - val_acc: 0.5917
Epoch 3/10
1563s - loss: 0.1162 - acc: 0.6432 - val_loss: 0.1321 - val_acc: 0.5898
Epoch 4/10
1561s - loss: 0.1143 - acc: 0.6518 - val_loss: 0.1313 - val_acc: 0.6017
Epoch 5/10
1560s - loss: 0.2011 - acc: 0.5368 - val_loss: 0.3316 - val_acc: 0.3366
Epoch 6/10
1559s - loss: 0.1474 - acc: 0.5719 - val_loss: 0.1259 - val_acc: 0.6266
Epoch 7/10
1559s - loss: 0.1116 - acc: 0.6629 - val_loss: 0.1274 - val_acc: 0.6171
Epoch 8/10
1559s - loss: 0.1100 - acc: 0.6692 - val_loss: 0.1267 - val_acc: 0.6201
Epoch 9/10
1560s - loss: 0.1084 - acc: 0.6758 - val_loss: 0.1262 - val_acc: 0.6189
Epoch 10/10
1560s - loss: 0.1073 - acc: 0.6806 - val_loss: 0.1255 - val_acc: 0.6190
Done.


In [93]:
score, acc = eval_model(lstm_complex_attention_model, "snli")
print("SNLI-results: score = {}, acc = {}".format(score, acc))
score, acc = eval_model(lstm_complex_attention_model, "mnli")
print("MNLI-results: score = {}, acc = {}".format(score, acc))

Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.12551427632570267, Accuracy: 0.6189903846153846
SNLI-results: score = 0.12551427632570267, acc = 0.6189903846153846
Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.1336560417947016, Accuracy: 0.583264802631579
MNLI-results: score = 0.1336560417947016, acc = 0.583264802631579


In [59]:
#TODO come up with intelligent combination of sentences (e.g. rotation, but also think of other ways)

In [50]:
def rotate_idxs(index_list):
    num_rots = len(index_list)
    idxs = []
    for i in range(num_rots):
        idxs += [index_list[len(index_list) - 1]] + index_list[:len(index_list) - 1]
    return idxs, num_rots
    

In [25]:
rotate_idxs([1,2,3])

([3, 1, 2, 3, 1, 2, 3, 1, 2], 3)

In [49]:
def theano_comb_concat():
    a = T.ltensor3('sent1_encodings')
    b = T.ltensor3('sent2_encodings')
    c = T.lvector('cols_first_tensor')
    c2 = T.lvector('cols_second_tensor')
    results, updates = theano.scan(fn = lambda c,c2,a,b: T.concatenate([a[:,c,:], b[:,c2,:]]),
                                  outputs_info=None,
                                   sequences = [c,c2],
                                   non_sequences = [a,b]
                                  )
    f = theano.function([a,b,c,c2], results)
    return f

In [41]:
def combinatorial_concat(a,b,dim):
    idx1 = list(range(dim))
    idx2, num_rots = rotate_idxs(idx1)
    idx1 = idx1 * num_rots
    return theano_comb_concat(a,b,idx1,idx2)

In [58]:
from keras.engine.topology import Layer
import numpy as np

#TODO: 
# - finish CombinatorialConcat Layer
# - build into attention model
# - compute weights, attend, train!

class CombinatorialConcat(Layer):

    def __init__(self, dim, **kwargs):
        self.dim = dim
        super(CombinatorialConcat, self).__init__(**kwargs)

    def build(self, input_shape):
        super(CombinatorialConcat, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs):
        if not isinstance(inputs, list) or len(inputs) != 2:
            raise ValueError('A `CombinatorialConcatenate` layer should be called '
                             'on a list containing two inputs.')
        f = theano_comb_concat()
        
        a = inputs[0]
        b = inputs[1]
        
        print(inputs)
        
        idx1 = list(range(self.dim))
        idx2, num_rots = rotate_idxs(idx1)
        idx1 = idx1 * num_rots
        
        return f(a,b,idx1, idx2)

    def compute_output_shape(self, input_shape):
        #batch_size, combinatorial possibilities, len of two concatenated vectors 
        return (input_shape[0], input_shape[1]**2, input_shape[2]*2)

In [95]:
#3. Attention model

sent1_encodings = Input(shape=(64,48))
sent2_encodings = Input(shape=(64,48))

#sent1_permute = Permute((2,1))(sent1_encodings)
sent2_permute = Permute((2,1))(sent2_encodings)

#this should be all possible combinations, not just 1:1

#(64,48) x (48,64) = (64,64) -> mults of all possible sentences combinations
concat_repr = Lambda(lambda x: K.dot(x[0],x[1]), output_shape=(64,64), name='dot_combination')([sent1_encodings, sent2_permute])

att_sent1 = Lambda(lambda x:K.max(x, axis=))


concat_repr = Flatten()(concat_repr)
concat_repr = Reshape((4096,1))(concat_repr)
#concat_reprs = merge(inputs = [sent1_encodings, sent2_encodings], mode='concat', concat_axis=-1)
#concat_reprs = Permute((2,1))(concat_reprs)


#computation of weights
importances = TimeDistributed(Dense(1, activation='relu'), name="importances")(concat_repr)
weights = Flatten()(importances)
weights = Activation('softmax', name='weights')(weights)

#compute weighted sum of concatenated representations
weights_broadcasted = RepeatVector(96)(weights)
weights_broadcasted = Permute((2,1))(weights_broadcasted)

#elemwise mult with sequences and then weighte sum
encoded_sentence = merge([output_lstm, weights_broadcasted], mode='mul', name="mul_with_alpha")
encoded_sentence = Lambda(lambda x: K.sum(x, axis=1, keepdims=False), output_shape=(64,), name="weighted_sum")(encoded_sentence)

attn_model = Model(inputs=[sent1_encodings, sent2_encodings], outputs=[attention])

NameError: name 'attention' is not defined

In [94]:
attn_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_64 (InputLayer)            (None, 64, 48)        0                                            
____________________________________________________________________________________________________
input_63 (InputLayer)            (None, 64, 48)        0                                            
____________________________________________________________________________________________________
permute_33 (Permute)             (None, 48, 64)        0                                            
____________________________________________________________________________________________________
dot_combination (Lambda)         (None, 64, 64)        0                                            
___________________________________________________________________________________________

In [22]:
print("saving recurrent model...")
#lstm_model.save_weights(LOCAL_MODEL_PATH + 'lstm_model_weights_attention.hdf5')
lstm_model.save_weights(HARD_DRIVE_MODEL_PATH + 'lstm_model_weights_attention.hdf5')
print("done.")

saving recurrent model...
done.


In [23]:
print("loading model weights...") 
lstm_model.load_weights(HARD_DRIVE_MODEL_PATH + '/lstm_model_weights_attention.hdf5')
print("done.")

loading model weights...
done.


In [15]:
score, acc = eval_model(lstm_model, "snli");
print("SNLI-results: score = {}, acc = {}".format(score, acc))
score, acc = eval_model(lstm_model, "mnli");
print("MNLI-results: score = {}, acc = {}".format(score, acc))

Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.09323738706417573, Accuracy: 0.7357772435897436
SNLI-results: score = 0.09323738706417573, acc = 0.7357772435897436
Loading training and validation data...
Done.
Evaluating on the snli data
Score: 0.11954225795833688, Accuracy: 0.6439144736842105
MNLI-results: score = 0.11954225795833688, acc = 0.6439144736842105


Not too bad! Next up:
- improving the convolution architecture (num filters, maxpooling size)
- trying out lstms for encoding the sentences
- testing with the actual repeval dataset

A recursive network implementation in theano: https://github.com/ofirnachum/tree_rnn

# Error Analysis

Analyze which sentences are classified wrong and why:
- print out target, prediction, sentence 1 and sentence 2 of wrongly classified samples (save to file)

In [None]:
print("Loading word index...")
word_index = pickle.load(open("../data/word_index.p", "rb"))
print("Done.")

In [None]:
print("Loading training and validation data...")
f = h5py.File('../data/deep_training_data.hdf5', 'a')
snli_x_val = f['snli_testing_data']
snli_y_val = f['snli_testing_labels']
mnli_x_val = f['mnli_testing_data']
mnli_y_val = f['mnli_testing_labels']
print("Done.")

print("Shape of snli samples: {}".format(np.shape(snli_x_val)))
print("Shape of snli targets: {}".format(np.shape(snli_y_val)))
print("Shape of mnli samples: {}".format(np.shape(mnli_x_val)))
print("Shape of mnli samples: {}".format(np.shape(mnli_y_val)))

wrong_classifications = []
correct_classifications = []


index_to_word = {v: k for k, v in word_index.items()} #maps from integer word indices to the words themselves
label_to_word = {0: 'neutral', 1: 'contradiction', 2:'entailment', 3:'other'}

num_samples = np.shape(mnli_x_val)[1]
                 
for i in range(num_samples):
    sequence1 = np.reshape(mnli_x_val[0,i,:], (1,100))
    sequence2 = np.reshape(mnli_x_val[1,i,:], (1,100))
    y_pred = np.argmax(conv_model.predict([sequence1, sequence2]))
    y_true = np.argmax(mnli_y_val[i])
    #print(y_pred);print(y_true)
    
    sentence1 = ""
    for index in sequence1[0]:
        if index != 0: 
            sentence1 += index_to_word[index] + " "
    sentence2 = ""
    for index in sequence2[0]:
        if index != 0: 
            sentence2 += index_to_word[index] + " "
            
    if y_pred != y_true:            
        wrong_classifications.append((y_true, y_pred, sentence1, sentence2))
    elif y_pred == y_true:
        correct_classifications.append((y_true, y_pred, sentence1, sentence2))
    
print("Found {} correct and {} false classifications.".format(len(correct_classifications),len(wrong_classifications)))

In [None]:
with open('../results/correct_classifications.txt', "w") as file:
    for y_true, y_false, sentence1, sentence2 in correct_classifications:
        file.write('----------------------\nTARGET: {}, PRED: {}\nSENTENCE1: {}\nSENTENCE2: {}\n'.format(label_to_word[y_true], label_to_word[y_false], sentence1, sentence2))


with open('../results/classification_errors.txt', "w") as file:
    for y_true, y_false, sentence1, sentence2 in wrong_classifications:
        file.write('----------------------\nTARGET: {}, PRED: {}\nSENTENCE1: {}\nSENTENCE2: {}\n'.format(label_to_word[y_true], label_to_word[y_false], sentence1, sentence2))

                 
f.close()

# How does a sentence need to change for a change in the output of the model?
TODO: 
- write a function that turns one sentence into the right format and then computes the output of the model
- test the different things:
    - permutations
    - negation
    - exclude trigrams from the sentence
- do this for 100 correctly classified and 100 wrongly classified

In [None]:
print("Loading tokenizer...")
tokenizer = pickle.load(open("../data/tokenizer.p", "rb"))
print("Done.")

In [None]:
def predict_strings(string1, string2, tokenizer, model):
    sequences = tokenizer.texts_to_sequences([string1, string2])
    padded_sequences = simple_padding(sequences)
    input_1 = np.reshape(np.array(padded_sequences[0]), (1,100))
    input_2 = np.reshape(np.array(padded_sequences[1]), (1,100))
    output_model = model.predict([input_1, input_2])
    if len(output_model) == 1:
        y_pred = np.argmax(output_model)
        return y_pred
    elif len(output_model) == 3:
        y_pred = np.argmax(output_model[0])
        return y_pred, output_model[1], output_model[2]
    
label_conv_dict = {
    'neutral' : 0,
    'contradiction' : 1,
    'entailment' : 2,
    'other' : 3,
    0 : 'neutral',
    1 : 'contradiction',
    2 : 'entailment',
    3 : 'other'
}

In [None]:
label_conv_dict[predict_strings("john not love mary", "john knows mary", tokenizer, conv_model)]

In [None]:
#small test of the alternative conv model
y_true, sentence1_encoding, sentence2_encoding = predict_strings("john not love mary", "john knows mary", tokenizer, alt_conv_model)
print(label_conv_dict[y_true])
print(sentence1_encoding.shape)

In [None]:
import random
random.choice(correct_classifications)

In [None]:
from nltk import word_tokenize, ngrams
import re
def exclude_trigrams(string):
    toks = word_tokenize(string)
    trigrams = ngrams(toks, 3)
    corrupted_list = []
    for tri in trigrams:
        reduced_string = string
        for i in range(0, 3):
            regex = r"\b({})\b".format(tri[i])
            reduced_string = re.sub(regex, '', reduced_string)
        corrupted_list.append(reduced_string)
    return corrupted_list


In [None]:
t = "this is just a test sentence, it does not really mean anything"
print(exclude_trigrams(t))

In [None]:
import random

def apply_corruption_to_sample(y_true, y_pred, sentence1, sentence2):
    results = []  #a list of target, pred, sentence1, sentence2 with corruptions
    #exlclude all possible trigrams
    corrupted_sentence1 = exclude_trigrams(sentence1)
    corrupted_sentence2 = exclude_trigrams(sentence2)
    for corrupted_sentence in corrupted_sentence1:
        corrupted_pred = predict_strings(corrupted_sentence, sentence2, tokenizer, conv_model)
        changed = False
        if corrupted_pred != y_pred:
            changed = True
        results.append((y_true, corrupted_pred, changed, corrupted_sentence, sentence2))
    for corrupted_sentence in corrupted_sentence2:
        corrupted_pred = predict_strings(sentence1, corrupted_sentence, tokenizer, conv_model)
        changed = False
        if corrupted_pred != y_pred:
            changed = True
        results.append((y_true, corrupted_pred, changed, sentence1, corrupted_sentence))
    
    return {'target' : y_true, 'pred' : y_pred, 'sentence1':sentence1, 'sentence2':sentence2,
                         'corruptions' : results}
NUM_SAMPLES = 100
corruption_results = []

for i in range(NUM_SAMPLES):
    #take NUM_SAMPLES correct and NUM_SAMPLES wrong classifications
    y_true, y_pred, sentence1, sentence2 = random.choice(correct_classifications)
    corruption_results.append(apply_corruption_to_sample(y_true, y_pred, sentence1, sentence2))
    
    #the same for wrong classifications
    y_true, y_pred, sentence1, sentence2 = random.choice(wrong_classifications)
    corruption_results.append(apply_corruption_to_sample(y_true, y_pred, sentence1, sentence2))
    
print(len(corruption_results))
print(corruption_results[0])

In [None]:
#print results to file
with open('../results/corrupted_sentences.txt', "w") as file:
    for sample_results in corruption_results:
        y_true = sample_results['target']
        y_pred = sample_results['pred']
        sentence1 = sample_results['sentence1']
        sentence2 = sample_results['sentence2']
        file.write('########################################\nTARGET: {}, PRED: {}\nSENTENCE1: {}\nSENTENCE2: {}\n'.format(label_to_word[y_true], label_to_word[y_pred], sentence1, sentence2))
        for (y_true, corrupted_pred, changed, corrupted_sentence, sentence2) in sample_results['corruptions']:
            file.write('\t--------\n\tTARGET: {}, PRED: {}, CHANGED = {}\n\tSENTENCE1: {}\n\tSENTENCE2: {}\n'.format(label_to_word[y_true], label_to_word[corrupted_pred], changed, corrupted_sentence, sentence2))

## Exploring spaCy dependency parsing

In [None]:
import spacy
nlp = spacy.load('en')

In [None]:
def parse_depency(string):
    doc = nlp(string)
    dep_labels = []
    for token in doc:
        #print(token, token.dep_, token.pos_)
        dep_labels.append((token, token.dep_, token.pos_))
    return dep_labels

In [None]:
def naive_negation(string):
    parsed_string = parse_depency(string)
    result = ''
    for tok, dep, pos in parsed_string:
        result += tok.text + ' '
        if pos == 'VERB':
            result += 'not '
    return result

In [None]:
parse_depency('i am fabian and i like butter')

In [None]:
naive_negation('i am fabian and i like butter')

In [None]:
import scipy as scp

sentence1 = "i am fabian and i like butter"
y_true, sentence1_encoding, sentence2_encoding = predict_strings(sentence1, sentence1, tokenizer, alt_conv_model)
print("Comparing '{}' with '{}'".format(sentence1,sentence1))
print('predicted target: ',label_conv_dict[y_true])
print(scp.spatial.distance.cosine(sentence1_encoding, sentence2_encoding))
print()

sentence2 = "i am fabian and i not like butter"
y_true, sentence1_encoding, sentence2_encoding = predict_strings(sentence2, sentence1, tokenizer, alt_conv_model)
print("Comparing '{}' with '{}'".format(sentence1,sentence2))
print('predicted target: ',label_conv_dict[y_true])
print(scp.spatial.distance.cosine(sentence1_encoding, sentence2_encoding))
print()

sentence3 = "some test sentence for comparison"
y_true, sentence1_encoding, sentence2_encoding = predict_strings(sentence1, sentence3, tokenizer, alt_conv_model)
print("Comparing '{}' with '{}'".format(sentence1,sentence3))
print('predicted target: ',label_conv_dict[y_true])
print(scp.spatial.distance.cosine(sentence1_encoding, sentence2_encoding))


This looks like the model does what it should. As the sentence is negated, the predicted entailment changes, and the distance between the sentence encodings also increases but is comparably small regarding the distance to a completely different sentence encoding. 

Even more interesting to me is that in the third example, the model votes for entailment although the difference in embedding space is much higher than in example three. Seems like the classification model (just fully connected layers) is too weak for the task. 