In [31]:
import spacy
import numpy as np

In [24]:
# Load the spaCy package named 'en_vectors_web_lg'; the resulting `nlp`
# object is used below to tokenize sentences and look up word vectors.
nlp = spacy.load('en_vectors_web_lg')

In [83]:
import ujson as json
from keras.utils import to_categorical

# Maps each SNLI gold label string to the integer class id used for training.
LABELS = dict(entailment=0, contradiction=1, neutral=2)
def read_snli(path):
    """Read an SNLI JSON-lines file into parallel lists plus one-hot labels.

    Each non-blank line must be a JSON object with the keys ``gold_label``,
    ``sentence1`` and ``sentence2``. Examples whose gold label is ``'-'``
    are skipped.

    Args:
        path: Path to the ``.jsonl`` file.

    Returns:
        A tuple ``(texts1, texts2, labels)`` where ``texts1`` and ``texts2``
        are lists of the ``sentence1`` / ``sentence2`` strings and ``labels``
        is the one-hot matrix produced by ``to_categorical`` with one column
        per entry in ``LABELS``.

    Raises:
        KeyError: If a gold label other than ``'-'`` is not in ``LABELS``.
    """
    texts1 = []
    texts2 = []
    labels = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file_:
        for line in file_:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            eg = json.loads(line)
            label = eg['gold_label']
            if label == '-':  # per Parikh, ignore - SNLI entries
                continue
            texts1.append(eg['sentence1'])
            texts2.append(eg['sentence2'])
            labels.append(LABELS[label])
    # Pin the number of classes: otherwise to_categorical infers it from the
    # largest label present, so a file lacking the highest class id would
    # yield a matrix with too few columns.
    return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'),
                                          num_classes=len(LABELS))

In [279]:
# Training split: sentence1 texts, sentence2 texts, one-hot labels.
t1, t2, c = read_snli('snli/snli_1.0_train.jsonl')

In [286]:
def create_dataset(nlp, texts, hypotheses, num_oov, max_length):
    """Convert sentences to padded id matrices and build the embedding table.

    Ids ``0 .. num_oov - 1`` are reserved for out-of-vocabulary buckets: a
    token without a vector gets id ``token.rank % num_oov``. Every distinct
    token that has a vector gets the next free id starting at ``num_oov``.
    Tokens that have a vector whose norm is zero are dropped. Sentences are
    truncated to ``max_length`` kept tokens and right-padded with id 0.
    Note that id 0 is therefore shared by padding and by OOV bucket 0.

    Args:
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``has_vector``, ``vector``, ``vector_norm``, ``rank`` and ``text``;
            must also expose ``nlp.vocab.vectors_length``.
        texts: Sequence of premise strings.
        hypotheses: Sequence of hypothesis strings.
        num_oov: Number of random OOV bucket vectors. Must be positive if any
            token lacks a vector (otherwise the modulo raises
            ``ZeroDivisionError``).
        max_length: Number of id slots per sentence.

    Returns:
        ``(embeddings, text_ids, hypothesis_ids, vocab)`` where ``embeddings``
        has shape ``(num_oov + number_of_known_tokens, vectors_length)`` with
        unit-L2-norm rows, the two id matrices are int32 arrays with
        ``max_length`` columns, and ``vocab`` maps each non-OOV id to the text
        of the token that first received it.
    """
    sents = list(texts) + list(hypotheses)
    num_texts = len(texts)
    vector_dim = nlp.vocab.vectors_length

    # Random vectors for the OOV buckets, scaled to unit L2 norm so they live
    # on the same scale as the real vectors below (which are divided by
    # token.vector_norm). Dividing by the row *sum* would be unstable: the sum
    # of normal samples can be arbitrarily close to zero or negative.
    oov = np.random.normal(size=(num_oov, vector_dim))
    oov = oov / np.linalg.norm(oov, axis=1, keepdims=True)

    rank_vector_map = {}  # token.rank -> embedding row id
    vectors = []          # normalized vectors, in id order starting at num_oov
    vocab = {}            # embedding row id -> token text
    vector_id = num_oov   # next free id; the first num_oov ids are OOV buckets

    # Integer ids: these feed an int32 Input / Embedding lookup.
    sents_as_ids = np.zeros((len(sents), max_length), dtype='int32')

    for row, sent in enumerate(sents):
        word_ids = []
        for token in nlp(sent):
            # Stop once the sentence is full. Counting kept ids (rather than
            # the raw token index) means skipped tokens do not use up slots
            # and nothing beyond the truncation point enters the vocabulary.
            if len(word_ids) >= max_length:
                break

            if token.has_vector and token.vector_norm == 0:
                continue

            known_id = rank_vector_map.get(token.rank)
            if known_id is not None:
                word_ids.append(known_id)
            elif token.has_vector:
                rank_vector_map[token.rank] = vector_id
                vocab[vector_id] = token.text
                word_ids.append(vector_id)
                vectors.append(token.vector / token.vector_norm)
                vector_id += 1
            else:
                word_ids.append(token.rank % num_oov)

        sents_as_ids[row, :len(word_ids)] = word_ids

    embeddings = np.zeros((num_oov + len(vectors), vector_dim))
    embeddings[:num_oov] = oov
    if vectors:
        # Guarded: an empty list converts to shape (0,), which cannot be
        # assigned into a (0, vector_dim) slice.
        embeddings[num_oov:] = np.asarray(vectors)

    return embeddings, sents_as_ids[:num_texts], sents_as_ids[num_texts:], vocab
                

In [285]:
# e: embedding matrix, t / h: padded id arrays for the sentence1 / sentence2
# texts, v: id -> token text.
e, t, h, v = create_dataset(nlp, t1, t2, num_oov=100, max_length=50)

In [289]:
# Number of encoded sentence1 rows, vocabulary size, embedding matrix shape.
len(t), len(v), e.shape

(549367, 35125, (35225, 300))

In [46]:
from keras import layers, Model

In [290]:
def embed(words, vectors, max_length, reduced_dim):
    """Look up frozen word vectors for `words` and project them linearly.

    Args:
        words: Keras tensor of token ids the Embedding layer is applied to.
        vectors: 2-D array used as the (non-trainable) embedding weights.
        max_length: Passed to the Embedding layer as ``input_length``.
        reduced_dim: Output width of the per-timestep linear projection.

    Returns:
        The output tensor of the time-distributed projection.
    """
    num_rows = vectors.shape[0]
    num_cols = vectors.shape[1]

    lookup_layer = layers.Embedding(
        num_rows,
        num_cols,
        input_length=max_length,
        weights=[vectors],
        trainable=False,
    )

    # New layers are created on every call, so the text and the hypotheses
    # each get their own projection matrix.
    projection = layers.Dense(
        reduced_dim,
        activation=None,
        use_bias=False,
        name='project',
    )
    per_token_projection = layers.TimeDistributed(projection)

    return per_token_projection(lookup_layer(words))


def build_model(vectors, max_length, num_hidden, num_classes, reduced_dim):
    """Build and compile a two-input feed-forward classifier.

    Both inputs are embedded and projected via ``embed``, concatenated,
    flattened, and passed through Dense -> Dropout -> Dense -> softmax.

    Args:
        vectors: Embedding matrix handed to ``embed``.
        max_length: Number of token ids per input sentence.
        num_hidden: Width of the two hidden Dense layers.
        num_classes: Number of output classes (softmax width).
        reduced_dim: Projection width handed to ``embed``.

    Returns:
        A compiled Keras ``Model`` taking ``[words1, words2]``, using the
        Adam optimizer and categorical cross-entropy loss.
    """
    input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
    input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')

    out1 = embed(input1, vectors, max_length, reduced_dim)
    out2 = embed(input2, vectors, max_length, reduced_dim)

    concat = layers.concatenate([out1, out2])
    concat = layers.Flatten()(concat)

    # Hidden widths come from num_hidden rather than a hard-coded constant.
    out = layers.Dense(num_hidden, activation='relu')(concat)
    # Dropout must consume the first hidden layer's output; feeding it
    # `concat` would disconnect that Dense layer from the model entirely.
    out = layers.Dropout(0.2)(out)
    out = layers.Dense(num_hidden, activation='relu')(out)
    out = layers.Dense(num_classes, activation='softmax')(out)

    model = Model([input1, input2], out)

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
    
    
    

In [291]:
# Build the classifier on the embedding matrix `e` and print its layer table.
m = build_model(
    e,
    max_length=50,
    num_hidden=200,
    num_classes=3,
    reduced_dim=200,
)
m.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words1 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
words2 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_51 (Embedding)        (None, 50, 300)      10567500    words1[0][0]                     
__________________________________________________________________________________________________
embedding_52 (Embedding)        (None, 50, 300)      10567500    words2[0][0]                     
__________________________________________________________________________________________________
time_distr

In [None]:
# Train on the encoded sentence pairs, holding out 20% for validation.
m.fit(
    [t, h],
    c,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_split=.2,
)

Train on 439493 samples, validate on 109874 samples
Epoch 1/10

In [179]:
# Sum of `t` along axis 1 (one value per row).
s = np.sum(t, 1)

In [237]:
# Indices of rows of `e` whose entries sum to exactly zero.
[row for row, total in enumerate(np.sum(e, axis=1)) if total == 0]


[]

In [266]:
# Show the sentence pair at index 186.
t1[186], t2[186]

('Number 916 is hoping that he is going to win the race.',
 'A person is betting that he will win  the race.')

In [265]:
# Load the dev split. NOTE(review): this rebinds t1, t2 and c, which are also
# assigned from the training file in an earlier cell.
t1, t2, c = read_snli('snli/snli_1.0_dev.jsonl')

In [267]:
# Print each token of the example sentence together with its vector.
# The loop variable is named `token` rather than `t` so that it does not
# overwrite the notebook-level array `t` that is passed to m.fit.
for token in nlp(t2[186]):
    print(token.text, token.vector)

A [ 4.3798e-02  2.4779e-02 -2.0937e-01  4.9745e-01  3.6019e-01 -3.7503e-01
 -5.2078e-02 -6.0555e-01  3.6744e-02  2.2085e+00 -2.3389e-01 -6.8360e-02
 -2.2355e-01 -5.3989e-02 -1.5198e-01 -1.7319e-01  5.3355e-02  1.6485e+00
 -4.7991e-02 -8.5311e-02 -1.5712e-01 -6.4425e-01 -3.9819e-01  2.7800e-01
  1.5364e-01  3.1678e-02  5.5414e-02  1.5939e-02  3.1851e-01 -5.8979e-02
  3.8584e-02  1.0770e-01  1.0410e-01 -7.7346e-02  3.7396e-01 -2.1482e-01
  3.8320e-01 -2.7737e-01 -1.8352e-01 -8.3838e-01  3.4124e-01  5.8164e-01
  1.8543e-01 -3.1028e-01  1.7666e-01 -6.9421e-02 -3.4422e-01 -1.3665e-01
 -1.0823e-01  2.3637e-01 -3.2923e-01  6.1348e-01  1.9720e-01  8.7123e-02
  1.0785e-01  3.0730e-01  1.3757e-01  3.0809e-01  2.4331e-01 -2.9422e-01
 -9.8214e-03  5.5675e-01 -4.8880e-02  9.9468e-02  3.0543e-01 -3.7597e-01
 -1.9525e-01  4.6246e-02 -3.6675e-02  3.4023e-01  1.4905e-01  9.7800e-02
 -2.6664e-01  5.6834e-02 -4.3201e-02 -2.3338e-01  1.3111e-01 -3.5742e-01
 -3.6070e-01  3.0997e-01 -1.9727e-01 -1.4320e-01 