#### Set the file names and paths

In [1]:
# Embeddings: archive and member file names for the GloVe vectors.
# NOTE(review): both names are reassigned in the embedding-loading cell further
# down ("entailment/data/glove.6B.zip" and "glove.6B.100d.txt"), so the 50d value
# set here is not the one the notebook ends up loading.
glove_zip_file = "glove.6B.zip"
glove_vectors_file = "glove.6B.50d.txt"

# Data files: SNLI archive and member file names.
# NOTE(review): the data-loader cell defines its own zip_file_name /
# output_file_name; no visible cell reads these three variables.
snli_zip_file = "snli_1.0.zip"
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt"

#### A data loader from zip file

In [3]:
import os
import zipfile
import io
import numpy as np
from tqdm import tqdm
# Relative path of the SNLI zip archive that load_data() opens.
zip_file_name = 'entailment/data/snli_1.0.zip'
# Name of the member inside the archive that load_data() searches for and reads.
output_file_name = 'snli_1.0/snli_1.0_train.txt'

def load_data(num_samples=10, zip_path=None, member_name=None, show_progress=True):
    """Read (gold_label, sentence1, sentence2) records from a TSV file inside a zip.

    The first line of the file is treated as a header and is used to locate the
    three wanted columns. Rows whose gold label is '-' are skipped.

    Args:
        num_samples: maximum number of records to return. Any negative value
            (conventionally -1) means "read the whole file".
        zip_path: path of the zip archive; defaults to the module-level
            ``zip_file_name``.
        member_name: name of the archive member to read; defaults to the
            module-level ``output_file_name``. An exact match is preferred,
            otherwise the first member whose name contains it is used.
        show_progress: wrap the line iterator in ``tqdm`` when True.

    Returns:
        A list of 1-D numpy string arrays ``[gold_label, sentence1, sentence2]``.
        The list is empty when the member is not present in the archive.

    Raises:
        ValueError: if the header line lacks one of the required columns.
    """
    if zip_path is None:
        zip_path = zip_file_name
    if member_name is None:
        member_name = output_file_name

    columns = ['gold_label', 'sentence1', 'sentence2']
    data = []
    with zipfile.ZipFile(zip_path) as z:
        matches = [info for info in z.infolist() if member_name in info.filename]
        if not matches:
            return data
        exact = [info for info in matches if info.filename == member_name]
        target = exact[0] if exact else matches[0]

        # read the file (only once, even if several member names match)
        print("Reading lines from file {}".format(member_name))
        with io.TextIOWrapper(z.open(target), encoding="utf-8") as f:
            lines = tqdm(f) if show_progress else f
            idx = None  # column positions, resolved from the header line
            for line in lines:
                # strip the line terminator so the last column carries no '\n'
                terms = line.rstrip('\r\n').split('\t')
                if idx is None:
                    missing = [name for name in columns if name not in terms]
                    if missing:
                        raise ValueError(
                            "columns {} not found in header of {}".format(missing, member_name))
                    idx = [terms.index(name) for name in columns]
                elif terms[idx[0]] != '-':
                    # do not include the '-' label
                    data.append(np.array([terms[i] for i in idx]))
                if num_samples > -1 and len(data) >= num_samples:
                    break
    # Always hand back the collected records, including when the whole file was
    # consumed (num_samples=-1 or fewer rows than requested).
    return data

In [7]:
# Load the first 5000 usable records and convert each one into a plain
# [gold_label, sentence1, sentence2] list.
data = [[label, first, second] for label, first, second in load_data(5000)]
print("Read {} records".format(len(data)))
data[:3]

5006it [00:00, 37071.21it/s]

Reading lines from file snli_1.0/snli_1.0_train.txt
Read 5000 records





[['neutral',
  'A person on a horse jumps over a broken down airplane.',
  'A person is training his horse for a competition.'],
 ['contradiction',
  'A person on a horse jumps over a broken down airplane.',
  'A person is at a diner, ordering an omelette.'],
 ['entailment',
  'A person on a horse jumps over a broken down airplane.',
  'A person is outdoors, on a horse.']]

AWS d-type instances have issues with LSTM layers, so we need to change the GPU memory settings below.

In [8]:
import tensorflow as tf
from tensorflow import keras

# Enable on-demand GPU memory allocation. Iterating over the device list (instead
# of indexing [0]) keeps this cell runnable on CPU-only machines, where the list
# is empty, and applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)
print(tf.__version__)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2.5.0


#### Pre-process the texts

In [9]:
import unicodedata
import re
def unicode_to_ascii(s):
    """Strip combining marks from `s`.

    The string is NFD-decomposed and every character in Unicode category 'Mn'
    is dropped. Characters without a decomposition are left untouched, so the
    result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def preprocess(w):
    """Normalise one sentence and wrap it in '<start>' / '<end>' markers.

    The sentence is lower-cased, stripped and passed through unicode_to_ascii,
    then three substitutions are applied in order:
      1. put a space on both sides of each of ? . ! , ¿
         (eg: "he is a boy." => "he is a boy .")
         Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
      2. collapse runs of spaces / double quotes into a single space
      3. replace every run of characters other than letters and ? . ! , ¿
         with a single space (digits and apostrophes are removed too)
    """
    text = unicode_to_ascii(w.lower().strip())
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # start / end tokens delimit the sentence for the downstream tokenizer
    return '<start> ' + text.strip() + ' <end>'
def build_tokenizer(data):
    """Fit one Keras Tokenizer on both sentences of every record.

    Each record's second and third entries are preprocessed and joined with a
    single space, so the two sentences of a pair share one vocabulary. The
    tokenizer applies no character filtering and maps unknown words to '<OOV>'.
    """
    all_texts = []
    for record in data:
        all_texts.append(preprocess(record[1]) + " " + preprocess(record[2]))
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
    lang_tokenizer.fit_on_texts(all_texts)
    return lang_tokenizer

#### The tokenizers

In [28]:
lang_tokenizer = build_tokenizer(data)

# Encode each side with the shared vocabulary and pad with trailing zeros
# ('post'); each side is padded independently.
hyp_tokens = tf.keras.preprocessing.sequence.pad_sequences(
    lang_tokenizer.texts_to_sequences([preprocess(d[1]) for d in data]),
    padding='post')
evi_tokens = tf.keras.preprocessing.sequence.pad_sequences(
    lang_tokenizer.texts_to_sequences([preprocess(d[2]) for d in data]),
    padding='post')
print("Shape of hypothesis = {} and evidence = {}".format(hyp_tokens.shape, evi_tokens.shape))

Shape of hypothesis = (5000, 61) and evidence = (5000, 29)


##### One-hot encode the labels

In [11]:
from sklearn import preprocessing

# The gold label is the first entry of every record.
labels = [record[0] for record in data]

# Map the label strings to integer class ids.
le = preprocessing.LabelEncoder()
labels_enc = le.fit(labels).transform(labels)

# One-hot encode: row i of the 3x3 identity matrix is the encoding of class i.
train_labels = np.eye(3)[labels_enc]
print("A peek a the reshaped labels:")
print(train_labels[:5])
print("The datatypes of the training dataset, features={}, labels={}".format(type(labels_enc), type(train_labels)))

A peek a the reshaped labels:
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
The datatypes of the training dataset, features=<class 'numpy.ndarray'>, labels=<class 'numpy.ndarray'>


#### Prepare the embeddings
Load the GloVe embeddings from the zip archive into a word-to-vector dictionary.

In [13]:
glove_zip_file = "entailment/data/glove.6B.zip"
glove_vectors_file = "glove.6B.100d.txt"

# word -> float32 coefficient vector, filled from the selected GloVe file
embeddings_index = {}
with zipfile.ZipFile(glove_zip_file) as z:
    for info in z.infolist():
        # list every member so the available dimensions are visible in the output
        print(info.filename)
        if glove_vectors_file not in info.filename:
            continue
        # read the file
        print("Reading lines from file {}".format(glove_vectors_file))
        with io.TextIOWrapper(z.open(glove_vectors_file), encoding="utf-8") as f:
            # each line is "<word> <coef> <coef> ..." separated by whitespace
            for terms in map(str.split, f):
                embeddings_index[terms[0]] = np.asarray(terms[1:], dtype='float32')

glove.6B.50d.txt
glove.6B.100d.txt
Reading lines from file glove.6B.100d.txt
glove.6B.200d.txt
glove.6B.300d.txt


In [15]:
## 'the' is a very common word, so its vector is used to read off the
## embedding dimension (kept in `max_length`)
max_length = embeddings_index["the"].shape[0]

# One row per tokenizer index plus one extra row for index 0.
matrix_shape = (len(lang_tokenizer.word_index) + 1, max_length)
embedding_matrix = np.zeros(matrix_shape)
for word, i in lang_tokenizer.word_index.items():
    # words not found in embedding index keep their all-zeros row.
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

#### Build the inputs to the network

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the pairs for validation. The three arrays are split with the
# same shuffled indices, so each hypothesis stays aligned with its evidence and
# label. A fixed random_state makes the split reproducible across kernel restarts.
(input_tensor_train_h, input_tensor_val_h,
 input_tensor_train_e, input_tensor_val_e,
 target_tensor_train, target_tensor_val) = train_test_split(
    hyp_tokens, evi_tokens, train_labels, test_size=0.3, random_state=42)

In [25]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64

# Training pipeline: ((hypothesis, evidence), label) elements, shuffled and
# grouped into fixed-size batches (the incomplete last batch is dropped).
train_dataset = (
    tf.data.Dataset.zip((
        tf.data.Dataset.from_tensor_slices((input_tensor_train_h, input_tensor_train_e)),
        tf.data.Dataset.from_tensor_slices(target_tensor_train),
    ))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Validation pipeline: same element structure, batched without shuffling.
val_dataset = (
    tf.data.Dataset.zip((
        tf.data.Dataset.from_tensor_slices((input_tensor_val_h, input_tensor_val_e)),
        tf.data.Dataset.from_tensor_slices(target_tensor_val),
    ))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [27]:
# Show the nested structure, shape and dtype of one batch of the training dataset.
train_dataset.element_spec

((TensorSpec(shape=(64, 61), dtype=tf.int32, name=None),
  TensorSpec(shape=(64, 29), dtype=tf.int32, name=None)),
 TensorSpec(shape=(64, 3), dtype=tf.float64, name=None))

#### Build the model network

In [30]:
vocab_size = len(lang_tokenizer.word_index)
# the LSTM width is kept equal to the embedding width so the two can be compared
dim = embedding_matrix.shape[1]


def _frozen_glove_embedding():
    """Create a non-trainable Embedding layer initialised from `embedding_matrix`."""
    return tf.keras.layers.Embedding(
        input_dim=vocab_size + 1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False,
        # Use masking to handle the variable sequence lengths
        mask_zero=True)


def _bilstm_encoder():
    """Create a bidirectional recurrent encoder built from an LSTMCell.

    RNN(LSTMCell) is used rather than layers.LSTM because, per the original
    note, the latter throws an error on AWS d-type instances (it works on
    p-type instances).
    """
    return tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim)))


# One embedding layer per input branch, both starting from the same matrix.
embedding_hyp_layer = _frozen_glove_embedding()
embedding_evi_layer = _frozen_glove_embedding()

inp1 = keras.Input(shape=(None,))
inp2 = keras.Input(shape=(None,))
x_hyp = embedding_hyp_layer(inp1)
x_evi = embedding_evi_layer(inp2)

# Encode each branch separately, then join the two encodings feature-wise.
hyp_lstm = _bilstm_encoder()(x_hyp)
hyp_evi = _bilstm_encoder()(x_evi)
w = keras.layers.concatenate([hyp_lstm, hyp_evi], axis=1)

# Small classification head ending in a 3-way softmax.
x3 = tf.keras.layers.Dense(16, activation='relu')(w)
x4 = tf.keras.layers.Dropout(0.1)(x3)
output = tf.keras.layers.Dense(3, activation='softmax')(x4)

model = keras.Model(inputs=[inp1, inp2], outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    401300      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 100)    401300      input_4[0][0]                    
____________________________________________________________________________________________

In [31]:
# Train for one epoch and evaluate on the held-out validation batches, so that
# val_loss / val_accuracy are reported next to the training metrics.
model.fit(train_dataset, validation_data=val_dataset, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f79f5ee2810>

#### Try a simple model for looking under the hood

In [34]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64

# Single-input pipeline: (hypothesis, label) elements, shuffled and grouped into
# fixed-size batches (the incomplete last batch is dropped).
train_ds = (
    tf.data.Dataset.zip((
        tf.data.Dataset.from_tensor_slices(input_tensor_train_h),
        tf.data.Dataset.from_tensor_slices(target_tensor_train),
    ))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
train_ds

<BatchDataset shapes: ((64, 61), (64, 3)), types: (tf.int32, tf.float64)>

In [43]:
vocab_size = len(lang_tokenizer.word_index)
# the LSTM width is kept equal to the embedding width so the two can be compared
dim = embedding_matrix.shape[1]

# Frozen embedding initialised from the pre-built matrix.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size + 1,
    output_dim=dim,
    weights=[embedding_matrix],
    trainable=False,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)

inp1 = keras.Input(shape=(None,))
x = embedding(inp1)
lstm_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim))
)(x)

# Classification head: Dense(16, relu) -> Dropout(0.1) -> Dense(3, softmax),
# applied in sequence to the encoder output.
classifier_head = [
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(3, activation='softmax'),
]
output = lstm_layer
for head_layer in classifier_head:
    output = head_layer(output)

model = keras.Model(inputs=[inp1], outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_9 (Embedding)      (None, None, 100)         401300    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_14 (Dense)             (None, 16)                3216      
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 3)                 51        
Total params: 565,367
Trainable params: 164,067
Non-trainable params: 401,300
_______________________________________________

In [44]:
# Train the single-input model for one epoch on train_ds, which is built from
# input_tensor_train_h and target_tensor_train only.
model.fit(train_ds, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f79ec5187d0>