In [1]:
from __future__ import print_function
from functools import reduce
import json
import os
import re
import tarfile
import tempfile

import numpy as np
np.random.seed(1337)  # for reproducibility

In [2]:
import keras
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import merge, recurrent, Dense, Input, Dropout, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import np_utils

Using TensorFlow backend.


# Data Visualization

In [3]:
import pandas as pd
data=pd.read_json('snli_1.0_train.jsonl', lines=True)


In [4]:
data.head()

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2,[entailment],3416050480.jpg#4,entailment,3416050480.jpg#4r1e,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is outdoors, on a horse.","( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3,[neutral],2267923837.jpg#2,neutral,2267923837.jpg#2r1n,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,They are smiling at their parents,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4,[entailment],2267923837.jpg#2,entailment,2267923837.jpg#2r1e,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,There are children present,( There ( ( are children ) present ) ),(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...


In [5]:
def extract_tokens_from_binary_parse(parse):
    return parse.replace('(', ' ').replace(')', ' ').replace('-LRB-', '(').replace('-RRB-', ')').split()

In [6]:
def yield_examples(fn, skip_no_majority=True, limit=None):
  for i, line in enumerate(open(fn)):
    if limit and i > limit:
      break
    data = json.loads(line)
    label = data['gold_label']
    s1 = ' '.join(extract_tokens_from_binary_parse(data['sentence1_binary_parse']))
    s2 = ' '.join(extract_tokens_from_binary_parse(data['sentence2_binary_parse']))
    if skip_no_majority and label == '-':
      continue
    yield (label, s1, s2)

In [7]:
def get_data(fn, limit=None):
  raw_data = list(yield_examples(fn=fn, limit=limit))
  left = [s1 for _, s1, s2 in raw_data]
  right = [s2 for _, s1, s2 in raw_data]
  print(max(len(x.split()) for x in left))
  print(max(len(x.split()) for x in right))

  LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
  Y = np.array([LABELS[l] for l, s1, s2 in raw_data])
  Y = np_utils.to_categorical(Y, len(LABELS))

  return left, right, Y

In [8]:
training = get_data('snli_1.0_train.jsonl')
validation = get_data('snli_1.0_dev.jsonl')
test = get_data('snli_1.0_test.jsonl')


82
62
59
55
57
30


In [9]:
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(training[0] + training[1])

In [10]:
type(training)

tuple

In [11]:
training

(['A person on a horse jumps over a broken down airplane .',
  'A person on a horse jumps over a broken down airplane .',
  'A person on a horse jumps over a broken down airplane .',
  'Children smiling and waving at camera',
  'Children smiling and waving at camera',
  'Children smiling and waving at camera',
  'A boy is jumping on skateboard in the middle of a red bridge .',
  'A boy is jumping on skateboard in the middle of a red bridge .',
  'A boy is jumping on skateboard in the middle of a red bridge .',
  'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .',
  'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .',
  'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .',
  'Two blond women are hugging one another .',

In [12]:
len(training)

3

In [13]:
training[0]

['A person on a horse jumps over a broken down airplane .',
 'A person on a horse jumps over a broken down airplane .',
 'A person on a horse jumps over a broken down airplane .',
 'Children smiling and waving at camera',
 'Children smiling and waving at camera',
 'Children smiling and waving at camera',
 'A boy is jumping on skateboard in the middle of a red bridge .',
 'A boy is jumping on skateboard in the middle of a red bridge .',
 'A boy is jumping on skateboard in the middle of a red bridge .',
 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .',
 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .',
 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .',
 'Two blond women are hugging one another .',
 'Two blond 

In [14]:
training[1]

['A person is training his horse for a competition .',
 'A person is at a diner , ordering an omelette .',
 'A person is outdoors , on a horse .',
 'They are smiling at their parents',
 'There are children present',
 'The kids are frowning',
 'The boy skates down the sidewalk .',
 'The boy does a skateboarding trick .',
 'The boy is wearing safety equipment .',
 'An older man drinks his juice as he waits for his daughter to get off work .',
 'A boy flips a burger .',
 'An elderly man sits in a small shop .',
 'Some women are hugging on vacation .',
 'The women are sleeping .',
 'There are women showing affection .',
 'The people are eating omelettes .',
 'The people are sitting at desks in school .',
 'The diners are at a restaurant .',
 'A man is drinking juice .',
 'Two women are at a restaurant drinking wine .',
 'A man in a restaurant is waiting for his meal to arrive .',
 'A blond man getting a drink of water from a fountain in the park .',
 'A blond man wearing a brown shirt is r

In [15]:
training[2]

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

In [16]:
len(training[0]),len(training[1]), len(training[2])

(549367, 549367, 549367)

In [17]:
# Lowest index from the tokenizer is 1 - we need to include 0 in our vocab count
VOCAB = len(tokenizer.word_counts) + 1
LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

In [18]:
type(VOCAB)

int

In [19]:
VOCAB

42391

In [20]:
RNN = None
LAYERS = 1
USE_GLOVE = True
TRAIN_EMBED = False
EMBED_HIDDEN_SIZE = 300
SENT_HIDDEN_SIZE = 300
BATCH_SIZE = 512
PATIENCE = 4 # 8
MAX_EPOCHS = 42
MAX_LEN = 42
DP = 0.2
L2 = 4e-6
ACTIVATION = 'relu'
OPTIMIZER = 'rmsprop'
print('RNN / Embed / Sent = {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE))
print('GloVe / Trainable Word Embeddings = {}, {}'.format(USE_GLOVE, TRAIN_EMBED))

RNN / Embed / Sent = None, 300, 300
GloVe / Trainable Word Embeddings = True, False


In [21]:
to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X), maxlen=MAX_LEN)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

In [22]:
training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

In [23]:
training[0]

array([[  0,   0,   0, ...,  40, 822,   1],
       [  0,   0,   0, ...,  40, 822,   1],
       [  0,   0,   0, ...,  40, 822,   1],
       ...,
       [  0,   0,   0, ...,  34,  51,   1],
       [  0,   0,   0, ...,  34,  51,   1],
       [  0,   0,   0, ...,  34,  51,   1]], dtype=int32)

In [24]:
len(training[0]), training[0].shape

(549367, (549367, 42))

In [25]:
print('Build model...')
print('Vocab size =', VOCAB)

Build model...
Vocab size = 42391


In [26]:
for i in range(15):
    print(training[1][i])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    3   45    5 1175   21  193   38    2  456    1]
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     3    45     5    17     2
  2441    15  2384    30 26609     1]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3  45   5
 151  15   8   2 193   1]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 335  10 164  17  52 986]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   

In [27]:
GLOVE_STORE = 'precomputed_glove.weights'

if USE_GLOVE:
    if not os.path.exists(GLOVE_STORE + '.npy'):
        print('Computing GloVe')
  
        embeddings_index = {}
        f = open('glove.840B.300d.txt')
    
        for line in f:
            values = line.split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        # prepare embedding matrix
        embedding_matrix = np.zeros((VOCAB, EMBED_HIDDEN_SIZE))
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:

            # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
            else:
                print('Missing from GloVe: {}'.format(word))
  
        np.save(GLOVE_STORE, embedding_matrix)

    print('Loading GloVe')
    embedding_matrix = np.load(GLOVE_STORE + '.npy')

    print('Total number of null word embeddings:')
    print(np.sum(np.sum(embedding_matrix, axis=1) == 0))

    embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, weights=[embedding_matrix], input_length=MAX_LEN, trainable=TRAIN_EMBED)
else:
    embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, input_length=MAX_LEN)

Loading GloVe
Total number of null word embeddings:
4043


In [28]:
rnn_kwargs = dict(output_dim=SENT_HIDDEN_SIZE, dropout_W=DP, dropout_U=DP)
SumEmbeddings = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1), output_shape=(SENT_HIDDEN_SIZE, ))


In [29]:
translate = TimeDistributed(Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))

In [30]:
premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

In [31]:
prem = embed(premise)
hypo = embed(hypothesis)

In [32]:
prem = translate(prem)
hypo = translate(hypo)

In [33]:
if RNN and LAYERS > 1:
    for l in range(LAYERS - 1):
        rnn = RNN(return_sequences=True, **rnn_kwargs)
        prem = BatchNormalization()(rnn(prem))
        hypo = BatchNormalization()(rnn(hypo))
rnn = SumEmbeddings if not RNN else RNN(return_sequences=False, **rnn_kwargs)
prem = rnn(prem)
hypo = rnn(hypo)
prem = BatchNormalization()(prem)
hypo = BatchNormalization()(hypo)


In [34]:
type(prem), type(hypo)

(tensorflow.python.framework.ops.Tensor,
 tensorflow.python.framework.ops.Tensor)

In [35]:
prem

<tf.Tensor 'batch_normalization_1/cond/Merge:0' shape=(?, 300) dtype=float32>

In [36]:
premise

<tf.Tensor 'input_1:0' shape=(?, 42) dtype=int32>

In [39]:
joint = tf.keras.layers.concatenate([prem,hypo])
joint = Dropout(DP)(joint)

for i in range(3):
    joint = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION, W_regularizer=l2(L2) if L2 else None)(joint)
    joint = Dropout(DP)(joint)
    joint = BatchNormalization()(joint)

  """


In [40]:
pred = Dense(len(LABELS), activation='softmax')(joint)

In [41]:
model = Model(input=[premise, hypothesis], output=pred)
model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy', metrics=['accuracy'])

  """Entry point for launching an IPython kernel.


In [42]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 42)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 42)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 42, 300)      12717300    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 42, 300)      90300       embedding_1[0][0]                
          

In [43]:
print('Training')
_, tmpfn = tempfile.mkstemp()

Training


In [44]:
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [EarlyStopping(patience=PATIENCE), ModelCheckpoint(tmpfn, save_best_only=True, save_weights_only=True)]
model.fit([training[0], training[1]], training[2], batch_size=BATCH_SIZE, nb_epoch=MAX_EPOCHS, validation_data=([validation[0], validation[1]], validation[2]), callbacks=callbacks)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 549367 samples, validate on 9842 samples
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
  8704/549367 [..............................] - ETA: 7:28 - loss: 0.6111 - acc: 0.7526

KeyboardInterrupt: 

In [45]:
model.load_weights(tmpfn)

loss, acc = model.evaluate([test[0], test[1]], test[2], batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Test loss / test accuracy = 0.6165 / 0.7559
