# Repeval 2017 Exporations

In [1]:
from gensim.models import KeyedVectors

In [2]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.layers import merge
from keras.utils.np_utils import to_categorical

Using Theano backend.
Using gpu device 0: GeForce GTX 1050 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5110)


First, load pre-trained word embeddings. Here, the ones from Mikolov using the word2vec toolkit

In [6]:
W2V_BINARY_PATH = '../data/GoogleNews-vectors-negative300.bin'
vectorspace = KeyedVectors.load_word2vec_format(W2V_BINARY_PATH, binary=True)

Load the data. Starting here with the SNLI Corpus until further data are available for the repeval2017 task.

In [7]:
data_path = '../data/snli_1.0/'
data_frame = pd.read_csv(data_path + 'snli_1.0_train.txt', sep='\t')
gold_labels = data_frame.gold_label.tolist()
sentences1 = data_frame.sentence1.tolist()
sentences2 = data_frame.sentence2.tolist()

Use only part of the data for testing the model:

In [8]:
#num_samples = len(sentences1)
num_samples = 20000
samples = [(sentences1[i], sentences2[i], gold_labels[i]) for i in range(num_samples)]
print("Using {} samples from the dataset".format(num_samples))

Using 20000 samples from the dataset


## Training a model with some data

Preprocessing the data to include the word embeddings into the model is done after this keras-example:
https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py

In [9]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2

In [10]:
sentences1 = [sentence for sentence, sentence2, label in samples]
sentences2 = [sentence2 for sentence, sentence2, label in samples]
labels = [label for sentence, sentence2, label in samples] 
numeric_labels = []
for label in labels:
    if label == 'neutral':
        numeric_labels.append(0)
    elif label == 'contradiction':
        numeric_labels.append(1)
    elif label == 'entailment':
        numeric_labels.append(2)
    else:
        numeric_labels.append(3)
        
tokenizer = text.Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(sentences1 + sentences2)
sequences1 = tokenizer.texts_to_sequences(sentences1)
sequences2 = tokenizer.texts_to_sequences(sentences2)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data1 = sequence.pad_sequences(sequences1, maxlen=MAX_SEQUENCE_LENGTH)
data2 = sequence.pad_sequences(sequences2, maxlen=MAX_SEQUENCE_LENGTH)

data = np.asarray([data1, data2])

labels = to_categorical(np.asarray(numeric_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[1])
np.random.shuffle(indices)
data = data[:,indices,:]
labels = labels[indices,:]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[1])

x_train = data[:,:-num_validation_samples,:]
y_train = labels[:-num_validation_samples]
x_val = data[:,-num_validation_samples:,:]
y_val = labels[-num_validation_samples:]

print("Shape of x_train: {}".format(np.shape(x_train)))
print("Shape of y_train: {}".format(np.shape(y_train)))
print("Shape of x_val: {}".format(np.shape(x_val)))
print("Shape of y_val: {}".format(np.shape(y_val)))

print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM)) #TODO: is this correct? In the example, the first dimension is
# num_words, this, however, throws an error when populating the embedding matrix (because the word-indices start at 1, not at 0)
print("shape of embedding matrix: {}".format(np.shape(embedding_matrix)))

for word, i in word_index.items():
    #print("{}: {}".format(i,word))
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = None
    
    try:
        embedding_vector = vectorspace[word]
    except:
        pass
    
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False) #set trainable = True to enable training of the embeddings to the task at hand

Found 8273 unique tokens.
Shape of data tensor: (2, 20000, 1000)
Shape of label tensor: (20000, 4)
Shape of x_train: (2, 16000, 1000)
Shape of y_train: (16000, 4)
Shape of x_val: (2, 4000, 1000)
Shape of y_val: (4000, 4)
Preparing embedding matrix.
shape of embedding matrix: (8274, 300)


Todo next:
+ find out how to use RAM more efficiently: this model is eating RAM like a monster
+ build model architecture that uses the same layers for encoding both sentences (starting with the tutorial used above)
+ build model on top to predict entailment, neutral, contradiction (see intro functional, shared layers)
+ test different architectures

The model for encoding the sentences:

In [20]:
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#encode first sentence
embedded_sentence1 = embedding_layer(sentence1_input)
x = Conv1D(128, 5, activation='relu')(embedded_sentence1)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
encoded_sentence1 = MaxPooling1D(35)(x)

#encode second sentence
embedded_sentence2 = embedding_layer(sentence2_input)
y = Conv1D(128, 5, activation='relu')(embedded_sentence2)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
y = MaxPooling1D(5)(y)
y = Conv1D(128, 5, activation='relu')(y)
encoded_sentence2 = MaxPooling1D(35)(y)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [encoded_sentence1, encoded_sentence2], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
model1 = Model(input=[sentence1_input, sentence2_input], output=preds)
model1.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [21]:
model1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 128)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 300)     2482200     input_11[0][0]                   
                                                                   input_12[0][0]                   
____________________________________________________________________________________________________
convolution1d_10 (Convolution1D) (None, 996, 128)      192128      embedding_1[3][0]       

The same but using the same architecture for embedding both sentences, so that it will profit from all examples:

In [17]:
MAX_SEQUENCE_LENGTH = 128 #just for testing 
sentence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

sentence_embedding = embedding_layer(sentence_input)
x = Conv1D(128, 5, activation='relu')(sentence_embedding)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
encoded_sentence = MaxPooling1D(35)(x)

sentence_embedding_model = Model(input=sentence_input, output=encoded_sentence)


#sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
#not sure whether jsut using sentence input multiple times works
# or you need to specify different inputs explicitly
sentence1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
sentence2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')


sentence1_embedding = sentence_embedding_model(sentence1_input)
sentence2_embedding = sentence_embedding_model(sentence2_input)

#merge the encoded sentences (First: concatenation)
merged_vector = merge(inputs = [sentence1_embedding, sentence2_embedding], mode='concat', concat_axis=-1)

#predict the labels
flat = Flatten()(merged_vector)
x = Dense(256, activation='relu')(flat)
preds = Dense(4,activation='softmax')(x)

#compile the model
model = Model(input=[sentence1_input, sentence2_input], output=preds)
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

In [22]:
sentence_embedding_model.summary()
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 128)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 300)     2482200     input_8[0][0]                    
____________________________________________________________________________________________________
convolution1d_7 (Convolution1D)  (None, 996, 128)      192128      embedding_1[2][0]                
____________________________________________________________________________________________________
maxpooling1d_7 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_7[0][0]            
___________________________________________________________________________________________

In [19]:
#TODO use generators more wisely, introduce batching, look for fitting sizes
def training_data_generator(x_train, y_train, num_batches, batch_size):
    #num_batches = int(np.floor(np.shape(y_train)[0] / batch_size))
    #print(num_batches)
    #print(num_batches*batch_size)
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_train[0,i:i+batch_size,:],x_train[1,i:i+batch_size,:]], y_train[i:i+batch_size])
        #print(i, np.shape(gen_output))
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        #print(i)
        yield gen_output
        
#    for i in np.arange(0,num_batches*batch_size, batch_size):
#        gen_output = ([x_train[0,i:i+batch_size,:],x_train[1,i:i+batch_size,:]], y_train[i:i+batch_size])
#        print(i, np.shape(gen_output))
#        yield gen_output

def val_data_generator(x_test, y_test, num_batches, batch_size):
    #num_batches = int(np.floor(np.shape(y_test)[0] / batch_size))
    i=0
    while i < num_batches*batch_size:
        gen_output = ([x_test[0,i:i+batch_size,:],x_test[1,i:i+batch_size,:]], y_test[i:i+batch_size])
        i += batch_size
        if i >= (num_batches*batch_size): i = 0
        #print(i)
        yield gen_output
        
batch_size = 128
samples_per_epoch = int(np.floor(np.shape(y_train)[0] / batch_size)) #num of steps until epoch is finished
nb_val_samples = int(np.floor(np.shape(y_val)[0] / batch_size))

model.fit_generator(training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size), 
          nb_epoch=5,
          samples_per_epoch = samples_per_epoch * batch_size,
          validation_data=val_data_generator(x_val, y_val, num_batches=nb_val_samples, batch_size= batch_size),
          nb_val_samples = nb_val_samples * batch_size,
          verbose=2)

Epoch 1/5


ValueError: Error when checking model input: expected input_9 to have shape (None, 128) but got array with shape (128, 1000)

In [26]:
gen = training_data_generator(x_train, y_train, num_batches=samples_per_epoch, batch_size= batch_size)
print(np.shape(next(gen)[0][0]))

(128, 1000)


In [36]:
??Model.fit_generator


And there we have overfitting! 
TODO:
- get it working on the complete dataset (not just 20000)
- try different models
- prevent overfitting