In [2]:
ls working/data/training

paper_dev.ns.pages.p5.jsonl            paper_test_pipeline.ps.pages.p5.jsonl
paper_test.ns.pages.p5.jsonl           train.ns.pages.p5.jsonl
paper_test_pipeline.ns.pages.p5.jsonl  train.pages.p5.jsonl


In [5]:
!head -2 working/data/training/paper_test_pipeline.ps.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1, [["Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture ."], []]], [-1, null, "Grease_gun_-LRB-tool-RRB-", -2, [[], []]], [-1, null, "Nasal_sebum", -2, [[], []]], [-1, null, "Grease", -2, [[], []]], [-1, null, "Thermal_interface_material", -2, [[], []]]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Emblem_of_the_Ukrainian_Soviet_Socialist_Republic", -2, [["The coat of arms of the Ukrainian Soviet Socialist Republic was adopted on March 14 , 1919 by the government of the Ukrain

In [15]:
class LabelSchema:
    def __init__(self,labels):
        self.labels = {self.preprocess(val):idx for idx,val in enumerate(labels)}
        self.idx = {idx:self.preprocess(val) for idx,val in enumerate(labels)}

    def get_id(self,label):
        if self.preprocess(label) in self.labels:
            return self.labels[self.preprocess(label)]
        return None

    def preprocess(self,item):
        return item.lower()

class FEVERLabelSchema(LabelSchema):
    def __init__(self):
        super().__init__(["supports", "refutes", "not enough info"])

def nltk_tokenizer(text):
    return " ".join(word_tokenize(text))

class test_line_formatter():    
    def format(self, lines):
        formatted = []
        for line in tqdm(lines):
            fl = self.format_line(line)
            if fl is not None:
                if isinstance(fl,list):
                    formatted.extend(fl)
                else:
                    formatted.append(fl)
        return formatted

    def format_line(self, line):
        label_schema = FEVERLabelSchema()
        # get the label, i.e. SUPPORTS etc.
        annotation = line["label"]
        if annotation is None:
            annotation = line["verifiable"]
        pages = []
        lines = []
        # did we get the closest sentences to the claim text? is this the sentence or the line number from the doc text?
        if 'predicted_sentences' in line:
            pages.extend([(ev[0], ev[1]) for ev in line["predicted_sentences"]])
        elif 'predicted_pages' in line:
            pages.extend([(ev[0], -1) for ev in line["predicted_pages"]])
        else:
            # only if evidence[0] is > -1, we have relevant predicted lines in evidence[4]
            for evidence_group in line["evidence"]:
                pages.extend([(ev[2], ev[3]) for ev in evidence_group])
            for evidence_group in line["evidence"]:
                for ev in evidence_group:
                    if ev[0] > -1:
                        lines.extend(ev[4][0])
        return {"claim": line["claim"], "evidence": pages, "lines": lines, "label": label_schema.get_id(annotation),
                "label_text": annotation}

In [16]:
class Reader:
    def __init__(self,encoding="utf-8"):
        self.enc = encoding

    def read(self,file):
        with open(file,"r",encoding = self.enc) as f:
            return self.process(f)

    def process(self,f):
        pass

class JSONLineReader(Reader):
    def process(self,fp):
        data = []
        for line in tqdm(fp.readlines()):
            data.append(json.loads(line.strip()))
        return data

In [9]:
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_test'
working_dir = 'working/data/'
k = 5
training_data_file = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
data = jlr.read(training_data_file)

100%|██████████| 2039/2039 [00:00<00:00, 43717.47it/s]


In [13]:
data[:2]

[{'id': 113501,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Grease had bad reviews.',
  'evidence': [[[133128,
     None,
     'Grease_gun_-LRB-tool-RRB-',
     -1,
     [['Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .'],
      []]],
    [-1, None, 'Grease_gun_-LRB-tool-RRB-', -2, [[], []]],
    [-1, None, 'Nasal_sebum', -2, [[], []]],
    [-1, None, 'Grease', -2, [[], []]],
    [-1, None, 'Thermal_interface_material', -2, [[], []]]]]},
 {'id': 163803,
  'verifiable': 'VERIFIABLE',
  'label': 'SUPPORTS',
  'claim': 'Ukrainian Soviet Socialist Republic was a founding participant of the UN.',
  'evidence': [[[296950,
     288668,
     'Emblem_of_the_Ukrainian_Soviet_Socialist_Republic',
     -2,
     [['The coat of arms of the Ukrainian Soviet 

In [17]:
formatter = test_line_formatter()
formatted_test_data = formatter.format(data)

100%|██████████| 2039/2039 [00:00<00:00, 48182.69it/s]


In [19]:
formatted_test_data[:3]

[{'claim': 'Grease had bad reviews.',
  'evidence': [('Grease_gun_-LRB-tool-RRB-', -1),
   ('Grease_gun_-LRB-tool-RRB-', -2),
   ('Nasal_sebum', -2),
   ('Grease', -2),
   ('Thermal_interface_material', -2)],
  'lines': ['Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .'],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Ukrainian Soviet Socialist Republic was a founding participant of the UN.',
  'evidence': [('Emblem_of_the_Ukrainian_Soviet_Socialist_Republic', -2),
   ('Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic', -2),
   ('Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic', -2),
   ('Ukrainian_Republic', -2),
   ('List_of_Presidents_of_Ukraine', -2),
   ('United_Nations_General_Assembly_Resolution_377', -2),
   ('United_Nations_General

In [20]:
[d for d in formatted_test_data if d['label_text'] == 'NOT ENOUGH INFO'][:5]

[{'claim': 'Grease had bad reviews.',
  'evidence': [('Grease_gun_-LRB-tool-RRB-', -1),
   ('Grease_gun_-LRB-tool-RRB-', -2),
   ('Nasal_sebum', -2),
   ('Grease', -2),
   ('Thermal_interface_material', -2)],
  'lines': ['Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .'],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Legendary Entertainment is the owner of Wanda Cinemas.',
  'evidence': [('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -2),
   ('Wanda_Group', -2),
   ('Legendary_Entertainment', -2),
   ('List_of_shows_produced_by_Legendary_Television', -2)],
  'lines': ['It is a part of the Dalian Wanda Group .',
   'As of 2014 Wang Jianlin is the head of the com

In [21]:
import unicodedata
import re
def unicode_to_ascii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
def preprocess(w):
        w = unicode_to_ascii(w.lower().strip())
        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)
        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
        w = w.strip()
        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        w = '[START] ' + w + ' [END]'
        return w

In [25]:
for data in formatted_test_data[:2]:
    claim = preprocess(data["claim"])
    lines = data["lines"]
    parts = [claim, " ".join(lines)]
    print (" ".join(parts))
    print("***********")

[START] grease had bad reviews . [END] Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .
***********
[START] ukrainian soviet socialist republic was a founding participant of the un . [END] The coat of arms of the Ukrainian Soviet Socialist Republic was adopted on March 14 , 1919 by the government of the Ukrainian Soviet Socialist Republic and subsequently modified on November 7 , 1928 , January 30 , 1937 and November 21 , 1949 . The rising sun stands for the future of the Soviet Ukrainian nation , the star as well as the hammer and sickle for the victory of communism and the `` world-wide socialist community of states '' . The name of the Ukrainian SSR is shown only in Ukrainian , and reads '' Українська PCP '' . The banner bears the Soviet Union state motto -LRB- `` Workers o

In [32]:
import numpy as np
import tensorflow as tf
def get_test_data_generator():
    for data in formatted_test_data:
        claim = preprocess(data["claim"])
        lines = data["lines"]
        yield claim, " ".join(lines)
        
def get_test_dataset():
    generator = lambda: get_test_data_generator()
    return tf.data.Dataset.from_generator(
            generator, output_signature=(
            tf.TensorSpec(shape=(2, ), dtype=tf.string)))

In [27]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [28]:
import tensorflow_text as text
bert_tokenizer_params=dict(lower_case=True)
vocab_file_out = 'working/data/fever_vocab.txt'
pt_tokenizer = text.BertTokenizer(vocab_file_out, **bert_tokenizer_params)

### One hot encode the labels

In [29]:
from sklearn import preprocessing
labels = [d['label_text'] for d in formatted_test_data]
le = preprocessing.LabelEncoder()
le.fit(labels)
labels_enc = le.transform(labels)

In [30]:
test_labels = np.zeros(shape=(len(labels_enc),3))
for idx, val in enumerate(labels_enc):
    test_labels[idx][val]=1
print("A peek a the reshaped labels:")
print(test_labels[:5])
print("The datatypes of the training dataset, features={}, labels={}".format(type(labels_enc), type(test_labels)))

A peek a the reshaped labels:
[[1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]
The datatypes of the training dataset, features=<class 'numpy.ndarray'>, labels=<class 'numpy.ndarray'>


In [31]:
lbls = tf.reshape(tf.convert_to_tensor(test_labels, dtype=tf.int32), (test_labels.shape))
lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
lbls_ds

<TensorSliceDataset shapes: (3,), types: tf.int32>

In [33]:
test_ds = get_test_dataset()

In [35]:
for h,e in test_ds.take(2):
    print(h)
    print(".....\n\n")
    print(e)
    print("********\n\n")

tf.Tensor(b'[START] grease had bad reviews . [END]', shape=(), dtype=string)
.....


tf.Tensor(b'Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .', shape=(), dtype=string)
********


tf.Tensor(b'[START] ukrainian soviet socialist republic was a founding participant of the un . [END]', shape=(), dtype=string)
.....


tf.Tensor(b"The coat of arms of the Ukrainian Soviet Socialist Republic was adopted on March 14 , 1919 by the government of the Ukrainian Soviet Socialist Republic and subsequently modified on November 7 , 1928 , January 30 , 1937 and November 21 , 1949 . The rising sun stands for the future of the Soviet Ukrainian nation , the star as well as the hammer and sickle for the victory of communism and the `` world-wide socialist community of states '' . The name of the

In [36]:
test_ds_enc_labls = tf.data.Dataset.zip((test_ds, lbls_ds))
print(test_ds_enc_labls.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Tokenize the test data and prepare the tensors for training

In [37]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 32000
def tokenize_and_pad(text, max_len):
    segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
    inp = segment.to_tensor(shape=[None, max_len])
    return inp[0]

h = test_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
e = test_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = test_ds_enc_labls.map(lambda x, y: y)
print(h)
print(e)
f = tf.data.Dataset.zip((h,e))
d = tf.data.Dataset.zip((f,l))
dataset_test = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_test)
print(dataset_test.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


#### Load the prefilled embedding matrix from glove 300d

In [38]:
npzfile = np.load("working/data/embedding_mappings_300d.npz")
npzfile.files

['arr_0']

In [39]:
embedding_matrix = npzfile['arr_0']

#### Build the network

In [40]:

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.regularizers import *

vocab_size= 8000
dim = 300
inp1 = keras.Input(shape=(None, ), name = "hypothesis")
inp2 = keras.Input(shape=(None, ), name = "evidence")

embedding_hyp_layer = Embedding(
        input_dim=vocab_size+1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False)
embedding_evi_layer = Embedding(
        input_dim=vocab_size+1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False)


x_hyp = embedding_hyp_layer(inp1)
x_hyp = tf.keras.layers.Dropout(0.5)(x_hyp)

x_evi = embedding_evi_layer(inp2)
x_evi = tf.keras.layers.Dropout(0.5)(x_evi)


lstm_layer1 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True))(x_hyp)

lstm_layer2 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True))(x_evi)



F_p, F_h = lstm_layer1, lstm_layer2
Eph = keras.layers.Dot(axes=(2, 2))([F_h, F_p])  # [batch_size, Hsize, Psize]
Eh = Lambda(lambda x: keras.activations.softmax(x))(Eph)  # [batch_size, Hsize, Psize]
Ep = keras.layers.Permute((2, 1))(Eph)  # [batch_size, Psize, Hsize)
Ep = Lambda(lambda x: keras.activations.softmax(x))(Ep)  # [batch_size, Psize, Hsize]
    
    
    
# 4, Normalize score matrix, encoder premesis and get alignment
PremAlign = keras.layers.Dot((2, 1))([Ep, lstm_layer2]) # [-1, Psize, dim]
HypoAlign = keras.layers.Dot((2, 1))([Eh, lstm_layer1]) # [-1, Hsize, dim]
mm_1 = keras.layers.Multiply()([lstm_layer1, PremAlign])
mm_2 = keras.layers.Multiply()([lstm_layer2, HypoAlign])
sb_1 = keras.layers.Subtract()([lstm_layer1, PremAlign])
sb_2 = keras.layers.Subtract()([lstm_layer2, HypoAlign])
    

# concat [a_, a~, a_ * a~, a_ - a~], isto za b_, b~
PremAlign = keras.layers.Concatenate()([lstm_layer1, PremAlign, sb_1, mm_1,])  # [batch_size, Psize, 2*unit]
HypoAlign = keras.layers.Concatenate()([lstm_layer2, HypoAlign, sb_2, mm_2])  # [batch_size, Hsize, 2*unit]


# ff layer w/RELU activation
Compresser = tf.keras.layers.TimeDistributed(Dense(300,
                                   kernel_regularizer=l2(0.0),
                                   bias_regularizer=l2(0.0),
                                   activation='relu'),
                             name='Compresser')

PremAlign = Compresser(PremAlign)
HypoAlign = Compresser(HypoAlign)
    

Decoder = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True), name='finaldecoder')


PremAlign = Dropout(0.5)(PremAlign)
HypoAlign = Dropout(0.5)(HypoAlign)
final_p = Decoder(PremAlign)
final_h = Decoder(HypoAlign)


AveragePooling = tf.keras.layers.GlobalAveragePooling1D()
MaxPooling = tf.keras.layers.GlobalMaxPooling1D()

# AveragePooling = Lambda(lambda x: K.mean(x, axis=1)) # outs [-1, dim]
# MaxPooling = Lambda(lambda x: K.max(x, axis=1)) # outs [-1, dim]
avg_p = AveragePooling(final_p)
avg_h = AveragePooling(final_h)
max_p = MaxPooling(final_p)
max_h = MaxPooling(final_h)
# concat of avg and max pooling for hypothesis and premise
Final = keras.layers.Concatenate()([avg_p, max_p, avg_h, max_h])
# dropout layer
Final = Dropout(0.5)(Final)
# ff layer w/tanh activation
Final = Dense(100,
              kernel_regularizer=l2(0.0),
              bias_regularizer=l2(0.0),
              name='dense300_',
              activation='tanh')(Final)

# last dropout factor
factor = 1
# if self.LastDropoutHalf:
#     factor = 2
Final = Dropout(0.5 / factor)(Final)

# softmax classifier
Final = Dense(3,
              activation='softmax',
              name='judge300_')(Final)
model = tf.keras.Model(inputs=[inp1, inp2], outputs=Final)

LearningRate = 4e-4
GradientClipping = 10.0

# Optimizer = keras.optimizers.Adam(lr = LearningRate,
#             clipnorm = GradientClipping)

model.compile(loss='categorical_crossentropy',
          optimizer='adam',
          metrics=['accuracy'])

model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hypothesis (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    2400300     hypothesis[0][0]                 
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2400300     evidence[0][0]                   
______________________________________________________________________________________________

In [41]:
checkpoint_filepath = 'tmp/attention_esim/checkpoint_fever_rte_esim'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

#### Check the test accuracy

In [42]:
model.load_weights(checkpoint_filepath)
model.evaluate(dataset_test)



[1.803281307220459, 0.5569556355476379]