# RepEval 2017 Explorations

In [1]:
from gensim.models import KeyedVectors

In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.layers import merge
from keras.utils.np_utils import to_categorical
import h5py


Using Theano backend.


First, load the pre-trained word embeddings: the GoogleNews vectors published by Mikolov et al., trained with the word2vec toolkit.

In [3]:
# Location of the pre-trained GoogleNews vectors in binary word2vec format.
W2V_BINARY_PATH = '../data/GoogleNews-vectors-negative300.bin'
# Load the vectors with gensim; `vectorspace[word]` is used further down to
# fill the embedding matrix.
vectorspace = KeyedVectors.load_word2vec_format(W2V_BINARY_PATH, binary=True)

Load the data. Starting here with the SNLI Corpus until further data are available for the repeval2017 task.

In [12]:
# Read the tab-separated SNLI training split and extract the three columns
# used by the rest of the notebook as plain Python lists.
data_path = '../data/snli_1.0/'
data_frame = pd.read_csv(data_path + 'snli_1.0_train.txt', sep='\t')
gold_labels, sentences1, sentences2 = (
    data_frame[column].tolist()
    for column in ('gold_label', 'sentence1', 'sentence2')
)

Use only part of the data for testing the model:

In [13]:
# Upper bound on the number of sentence pairs kept for quick experiments.
num_samples = 100000
# Never request more rows than were loaded: indexing with range(num_samples)
# raised IndexError whenever the corpus held fewer rows than requested.
num_samples = min(num_samples, len(gold_labels))
# str() turns every cell into a string, including any non-string values
# (e.g. missing fields) that came out of the CSV reader.
samples = [
    (str(premise), str(hypothesis), str(gold_label))
    for premise, hypothesis, gold_label in zip(
        sentences1[:num_samples], sentences2[:num_samples], gold_labels[:num_samples]
    )
]
print("Using {} samples from the dataset".format(len(samples)))

Using 100000 samples from the dataset


Use h5py to store the data on disk. This way the training data does not have to be held in RAM all at once while the models are trained.

## Some hdf5 tests

In [5]:
import h5py

In [6]:
# Round-trip test, part 1: store the (sentence1, sentence2, label) string
# triples in an HDF5 file.
data = np.asarray(samples, dtype=object)
# Variable-length string dtype so that Python strings can be stored in HDF5.
string_dt = h5py.special_dtype(vlen=str)
# The context manager flushes and closes the file. Previously the write handle
# was left open, so the following cell reopened a file that was still open for
# writing.
with h5py.File('../data/deep_training_data.hdf5', 'w') as f:
    t_data = f.create_dataset('training_data', data=data, dtype=string_dt)

In [7]:
# Round-trip test, part 2: reopen the file read-only and look at the stored
# dataset. `samples` is an h5py dataset handle backed by `f`, so the file has
# to stay open for as long as `samples` is used.
f = h5py.File('../data/deep_training_data.hdf5', 'r')
samples = f["training_data"]
np.shape(samples)

(10000, 3)

int64
<class 'h5py._hl.dataset.Dataset'>
[[[1 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]]]


## Training a model with some data

The preprocessing that feeds the pre-trained word embeddings into the model follows this Keras example:
https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py

In [4]:
# Vocabulary cap handed to the Keras Tokenizer and used when building the
# embedding matrix.
MAX_NB_WORDS = 20000
# Number of columns of the embedding matrix (one word vector per row).
EMBEDDING_DIM = 300
# Every token sequence is padded/truncated to this many steps; it is also the
# input length of the models below.
MAX_SEQUENCE_LENGTH = 100
# Fraction of the samples that is split off as validation data.
VALIDATION_SPLIT = 0.2

In [5]:
#This is a reimplementation of pad_sequences using hdf5 to be able to work on large amounts of data
def pad_sequences_hdf5(sequences, hdf5_file, dname='padded_sequences', maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    """Pad/truncate sequences to a common length and store them in an HDF5 dataset.

    Works like keras' ``pad_sequences`` but writes the result row by row into
    ``hdf5_file[dname]`` instead of building one large in-memory array.
    If maxlen is provided, any sequence longer than maxlen is truncated to
    maxlen. Truncation happens off either the beginning (default) or the end
    of the sequence. Supports post-padding and pre-padding (default).

    # Arguments
        sequences: list of lists where each element is a sequence
        hdf5_file: open, writable h5py file/group (anything that supports
            ``name in obj``, ``del obj[name]`` and ``create_dataset``).
        dname: name of the dataset to create. An existing dataset with this
            name is deleted and replaced, so use a distinct name per call
            when several results must live in the same file.
        maxlen: int, maximum length; defaults to the longest sequence
            (0 if `sequences` is empty).
        dtype: type to cast the resulting sequence.
        padding: 'pre' or 'post', pad either before or after each sequence.
        truncating: 'pre' or 'post', remove values from sequences larger than
            maxlen either in the beginning or in the end of the sequence
        value: float, value to pad the sequences to the desired value.
    # Returns
        x: the dataset ``hdf5_file[dname]`` with dimensions
            (number_of_sequences, maxlen) plus the per-step sample shape.
    # Raises
        ValueError: in case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    # Validate the options up front so that a typo is reported even when every
    # sequence is empty (the checks used to live inside the copy loop).
    if truncating not in ('pre', 'post'):
        raise ValueError('Truncating type "%s" not understood' % truncating)
    if padding not in ('pre', 'post'):
        raise ValueError('Padding type "%s" not understood' % padding)

    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        # np.max([]) raises, so an empty input needs an explicit default.
        maxlen = int(np.max(lengths)) if lengths else 0

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    # Replace any dataset left over from a previous run.
    if dname in hdf5_file:
        del hdf5_file[dname]
    x = hdf5_file.create_dataset(dname, ((num_samples, maxlen) + sample_shape), dtype=dtype)

    if num_samples == 0 or maxlen == 0:
        return x  # zero-sized dataset: nothing to fill or copy

    # Ellipsis covers the trailing sample dimensions as well.
    x[...] = value
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        trunc = s[-maxlen:] if truncating == 'pre' else s[:maxlen]

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        else:
            # explicit non-negative start; len(trunc) <= maxlen holds here
            x[idx, maxlen - len(trunc):] = trunc
    return x

In [14]:
# Split the (sentence1, sentence2, label) triples into three parallel lists.
sentences1 = [sentence for sentence, sentence2, label in samples]
sentences2 = [sentence2 for sentence, sentence2, label in samples]
labels = [label for sentence, sentence2, label in samples]

# Class ids of the three named labels; every other label string falls into the
# extra class 3.
label_to_id = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
numeric_labels = [label_to_id.get(label, 3) for label in labels]

# One tokenizer for both sentence positions so they share a vocabulary.
tokenizer = text.Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(sentences1 + sentences2)
sequences1 = tokenizer.texts_to_sequences(sentences1)
sequences2 = tokenizer.texts_to_sequences(sentences2)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# The sequences are ragged lists, so report their count instead of asking
# numpy for the shape of a non-rectangular structure.
print("shape of sequences1: {}".format((len(sequences1),)))
print("shape of sequences2: {}.".format((len(sequences2),)))

# Always produce 4 label columns, matching the 4-way softmax of the models,
# even if the extra class does not occur in this subset.
labels = to_categorical(np.asarray(numeric_labels), 4)

with h5py.File('../data/test.hdf5', 'a') as hdf5_file:
    # Distinct dataset names: with the default name the second call deleted
    # the dataset produced by the first one.
    data1 = pad_sequences_hdf5(sequences1, hdf5_file=hdf5_file, dname='padded_sequences1',
                               maxlen=MAX_SEQUENCE_LENGTH)
    data2 = pad_sequences_hdf5(sequences2, hdf5_file=hdf5_file, dname='padded_sequences2',
                               maxlen=MAX_SEQUENCE_LENGTH)

    print("shape of data1: {}.".format(np.shape(data1)))
    print("shape of data21: {}.".format(np.shape(data2)))

    # Stack both padded sets as (2, num_samples, maxlen). The dataset is
    # replaced on re-runs (the file is opened in append mode) and is actually
    # filled now; before, it was created but never written to.
    if 'complete_data' in hdf5_file:
        del hdf5_file['complete_data']
    complete_data = hdf5_file.create_dataset('complete_data', (2,) + data1.shape, dtype=data1.dtype)
    complete_data[0] = data1[...]
    complete_data[1] = data2[...]

    # h5py only accepts index lists in increasing order, so the shuffle below
    # has to run on an in-memory copy.
    data = complete_data[...]

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[1])
np.random.shuffle(indices)
data = data[:, indices, :]
labels = labels[indices, :]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[1])
# An explicit split index also behaves for num_validation_samples == 0, where
# the slice [:-0] would have produced an empty training set.
split_at = data.shape[1] - num_validation_samples

x_train = data[:, :split_at, :]
y_train = labels[:split_at]
x_val = data[:, split_at:, :]
y_val = labels[split_at:]

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of x_val: {}".format(np.shape(x_val)))
print("Shape of y_val: {}".format(np.shape(y_val)))

print("Storing training and test data to hdf5...")

# Each array gets its own dataset name; these are the names the loading cell
# reads back. All four used to be written as 'training_data', which fails on
# the second create_dataset call.
# NOTE(review): if another handle on this file is still open from an earlier
# cell, it may need to be closed first - confirm.
with h5py.File('../data/deep_training_data.hdf5', 'w') as f:
    f.create_dataset('x_train', data=x_train)
    f.create_dataset('y_train', data=y_train)
    f.create_dataset('x_val', data=x_val)
    f.create_dataset('y_val', data=y_val)
print("Done.")


Found 16921 unique tokens.
shape of sequences1: (100000,)
shape of sequences2: (100000,).
Sample shape found: ()
num_samples: 100000
maxlen: 100
Num sequences: 100000
sample_shape: ()
Shape of x: (100000, 100)
dtype of x: int32
Sample shape found: ()
num_samples: 100000
maxlen: 100
Num sequences: 100000
sample_shape: ()
Shape of x: (100000, 100)
dtype of x: int32


In [20]:
# Scratch check: prepending (2,) to an array's shape gives the same tuple as
# the shape of two stacked copies of that array.
t = np.ones((11,2))
print((2,) + np.shape(t))

print(np.shape(np.asarray([t,t])))

(2, 11, 2)
(2, 11, 2)


In [66]:
# In-memory variant of the preprocessing (uses keras' own pad_sequences).
# Split the (sentence1, sentence2, label) triples into three parallel lists.
sentences1 = [sentence for sentence, sentence2, label in samples]
sentences2 = [sentence2 for sentence, sentence2, label in samples]
labels = [label for sentence, sentence2, label in samples]

# Class ids of the three named labels; every other label string falls into the
# extra class 3.
label_to_id = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
numeric_labels = [label_to_id.get(label, 3) for label in labels]

# One tokenizer for both sentence positions so they share a vocabulary.
tokenizer = text.Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(sentences1 + sentences2)
sequences1 = tokenizer.texts_to_sequences(sentences1)
sequences2 = tokenizer.texts_to_sequences(sentences2)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# The sequences are ragged lists, so report counts/lengths directly instead of
# asking numpy for the shape of a non-rectangular structure.
print("shape of sequences1: {}. {}".format((len(sequences1),), (len(sequences1[0]),)))
print("shape of sequences2: {}.".format((len(sequences2),)))

#TODO: reimplement pad_sequences using h5py?
#TODO: just split the padding of the sequences into chunks?

data1 = sequence.pad_sequences(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
data2 = sequence.pad_sequences(sequences2, maxlen=MAX_SEQUENCE_LENGTH)

print("shape of data1: {}.".format(np.shape(data1)))
print("shape of data21: {}.".format(np.shape(data2)))

# Always produce 4 label columns, matching the 4-way softmax of the models,
# even if the extra class does not occur in this subset.
labels = to_categorical(np.asarray(numeric_labels), 4)

# split the data into a training set and a validation set
indices = np.arange(data1.shape[0])
np.random.shuffle(indices)
# Shuffle while stacking into the (2, num_samples, maxlen) layout. This gives
# the same result as np.asarray([data1, data2])[:, indices, :] but needs one
# full-size copy less, which matters on the complete corpus.
data = np.empty((2,) + data1.shape, dtype=data1.dtype)
data[0] = data1[indices]
data[1] = data2[indices]
labels = labels[indices, :]
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

num_validation_samples = int(VALIDATION_SPLIT * data.shape[1])
# An explicit split index also behaves for num_validation_samples == 0, where
# the slice [:-0] would have produced an empty training set.
split_at = data.shape[1] - num_validation_samples

x_train = data[:, :split_at, :]
y_train = labels[:split_at]
x_val = data[:, split_at:, :]
y_val = labels[split_at:]

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of x_val: {}".format(np.shape(x_val)))
print("Shape of y_val: {}".format(np.shape(y_val)))

print("Storing training and test data to hdf5...")

# Each array gets its own dataset name; these are the names the loading cell
# reads back. All four used to be written as 'training_data', which fails on
# the second create_dataset call.
with h5py.File('../data/deep_training_data.hdf5', 'w') as f:
    f.create_dataset('x_train', data=x_train)
    f.create_dataset('y_train', data=y_train)
    f.create_dataset('x_val', data=x_val)
    f.create_dataset('y_val', data=y_val)
print("Done.")

Found 34369 unique tokens.
shape of sequences1: (550152,). (11,)
shape of sequences2: (550152,).


MemoryError: 

In [90]:
# Scratch check: np.shape returns a tuple, so it has a length and can be
# iterated dimension by dimension.
t = np.ones((10,10))
a = np.shape(t)
print(len(a))
for i in a: print(i)

2
10
10


In [None]:
# Reopen the stored train/validation split. The four names are h5py dataset
# handles that read from disk on access, so `f` has to stay open while they
# are in use.
f = h5py.File('../data/deep_training_data.hdf5', 'r')
x_train = f["x_train"]
y_train = f["y_train"]
x_val = f["x_val"]
y_val = f["y_val"]
# Display the shapes of what was just loaded (this cell used to show the shape
# of the unrelated `samples` variable).
x_train.shape, y_train.shape, x_val.shape, y_val.shape

In [None]:
print('Preparing embedding matrix.')

# prepare embedding matrix
# Row i holds the vector of the word whose tokenizer index is i. The word
# indices start at 1, not 0, so one extra row is needed; row 0 is never
# assigned and stays all-zeros.
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
print("shape of embedding matrix: {}".format(np.shape(embedding_matrix)))

for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = vectorspace[word]
    except KeyError:
        # words not found in the pre-trained vectors keep their all-zeros row.
        # Only the lookup miss is ignored; the bare `except` used here before
        # also hid every other error (including KeyboardInterrupt).
        continue
    embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False) #set trainable = True to enable training of the embeddings to the task at hand

The model for encoding the sentences:

In [20]:
# Model 1: each sentence gets its own stack of convolution/pooling layers;
# only the embedding layer is shared between the two inputs.
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# NOTE(review): the pool sizes (5, 5, 35) reduce a 1000-step input to exactly
# one step (1000 -> 996 -> 199 -> 195 -> 39 -> 35 -> 1), which is what the
# stored summary output shows. A 100-step input is already down to 3 steps
# before the third width-5 convolution - confirm MAX_SEQUENCE_LENGTH matches.

#encode first sentence
embedded_sentence1 = embedding_layer(sentence1_input)
x = embedded_sentence1
for pool_length in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_length)(x)
encoded_sentence1 = x

#encode second sentence
embedded_sentence2 = embedding_layer(sentence2_input)
y = embedded_sentence2
for pool_length in (5, 5, 35):
    y = Conv1D(128, 5, activation='relu')(y)
    y = MaxPooling1D(pool_length)(y)
encoded_sentence2 = y

#merge the encoded sentences (First: concatenation)
merged_vector = merge([encoded_sentence1, encoded_sentence2],
                      mode='concat',
                      concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4, activation='softmax')(x)

#compile the model
model1 = Model(input=[sentence1_input, sentence2_input], output=preds)
model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [21]:
# Print the layer-by-layer summary of the two-encoder model.
model1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 300)     2482200     input_11[0][0]                   
                                                                   input_12[0][0]                   
____________________________________________________________________________________________________
convolution1d_10 (Convolution1D) (None, 996, 128)      192128      embedding_1[3][0]       

The same model, but with a single encoder whose weights are shared between both sentences, so that the encoder learns from every sentence in the data:

In [15]:
#MAX_SEQUENCE_LENGTH = 128 #just for testing 
# Shared sentence encoder: built once as its own Model so that the very same
# weights can be applied to both sentences.
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_layer(sentence_input)
x = sentence_embedding
for pool_length in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_length)(x)
encoded_sentence = x

sentence_embedding_model = Model(input=sentence_input, output=encoded_sentence)


# Not sure whether just using `sentence_input` multiple times would work,
# so each sentence gets its own explicit input here.
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')


# apply the one shared encoder to both inputs
sentence1_embedding = sentence_embedding_model(sentence1_input)
sentence2_embedding = sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge([sentence1_embedding, sentence2_embedding],
                      mode='concat',
                      concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4, activation='softmax')(x)

#compile the model
model = Model(input=[sentence1_input, sentence2_input], output=preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [16]:
# Print the summary of the shared encoder first, then of the full model that
# applies it to both sentences.
sentence_embedding_model.summary()
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 300)     2482200     input_4[0][0]                    
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 996, 128)      192128      embedding_1[1][0]                
____________________________________________________________________________________________________
maxpooling1d_4 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_4[0][0]            
___________________________________________________________________________________________

In [17]:
#TODO use generators more wisely, introduce batching, look for fitting sizes
def training_data_generator(x_train, y_train, num_batches, batch_size):
    """Cycle over the first ``num_batches * batch_size`` training samples.

    Yields ``([sentence1_batch, sentence2_batch], label_batch)`` tuples, where
    the two sentence batches are taken from ``x_train[0]`` and ``x_train[1]``.
    After the last batch the generator starts again at the first one; it yields
    nothing at all when ``num_batches * batch_size`` is not positive.
    """
    epoch_span = num_batches * batch_size
    start = 0
    while start < epoch_span:
        stop = start + batch_size
        batch = ([x_train[0, start:stop, :], x_train[1, start:stop, :]], y_train[start:stop])
        # wrap around once the end of the epoch has been reached
        start = stop if stop < epoch_span else 0
        yield batch

def val_data_generator(x_test, y_test, num_batches, batch_size):
    """Cycle over the first ``num_batches * batch_size`` validation samples.

    Yields ``([sentence1_batch, sentence2_batch], label_batch)`` tuples taken
    from ``x_test[0]``, ``x_test[1]`` and ``y_test``. Restarts from the first
    batch after the last one; yields nothing when
    ``num_batches * batch_size`` is not positive.
    """
    limit = num_batches * batch_size
    offset = 0
    while offset < limit:
        window = slice(offset, offset + batch_size)
        inputs = [x_test[0, window, :], x_test[1, window, :]]
        targets = y_test[window]
        offset += batch_size
        if offset >= limit:
            offset = 0  # start over with the first batch
        yield (inputs, targets)
        
batch_size = 128
# Despite their names, these two hold the number of *whole batches* that fit
# into the training / validation labels; they are multiplied by batch_size
# again below to get the sample counts passed to fit_generator.
samples_per_epoch = np.shape(y_train)[0] // batch_size
nb_val_samples = np.shape(y_val)[0] // batch_size

train_batches = training_data_generator(
    x_train, y_train, num_batches=samples_per_epoch, batch_size=batch_size)
val_batches = val_data_generator(
    x_val, y_val, num_batches=nb_val_samples, batch_size=batch_size)

model.fit_generator(
    train_batches,
    nb_epoch=10,
    samples_per_epoch=samples_per_epoch * batch_size,
    validation_data=val_batches,
    nb_val_samples=nb_val_samples * batch_size,
    verbose=2,
)

Epoch 1/5
55s - loss: 1.0847 - acc: 0.4208 - val_loss: 1.0429 - val_acc: 0.4488
Epoch 2/5
55s - loss: 1.0090 - acc: 0.4858 - val_loss: 1.0360 - val_acc: 0.4541
Epoch 3/5
56s - loss: 0.9670 - acc: 0.5192 - val_loss: 1.0444 - val_acc: 0.4660
Epoch 4/5
57s - loss: 0.9214 - acc: 0.5544 - val_loss: 1.0712 - val_acc: 0.4624
Epoch 5/5
56s - loss: 0.8607 - acc: 0.6040 - val_loss: 1.0750 - val_acc: 0.4914


<keras.callbacks.History at 0x7fb477c02a58>

In [13]:
# Sanity check: draw one batch from a fresh generator and print the shape of
# its input part (the pair of sentence batches).
gen = training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size)
print(np.shape(next(gen)[0]))

(2, 128, 1000)


In [36]:
??Model.fit_generator


And there we have overfitting! 
TODO:
- get it working on the complete dataset (not just 20000)
- try different models
- prevent overfitting