#### Evaluate the model on dev dataset

The test dataset will use predicted pages and predicted sentences. 

The predictions are generated via a seperate process in the our pipeline which must be executed before this step.


In [None]:
import importlib

In [2]:
from mda.src.readers.DatasetReader import DatasetReader

We need the label encoder, we will generate them from the training data

In [3]:
infile = 'working/data/training/train.ns.pages.p5.jsonl'
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=None, database_path='/local/fever-common/data/fever/fever.db')
raw, data = dsreader.read()

100%|██████████| 145449/145449 [00:02<00:00, 48520.31it/s]
100%|██████████| 145449/145449 [00:01<00:00, 77898.07it/s] 


In [4]:
ds_train = dsreader.get_dataset()

In [5]:
## save the label encoder
import pickle
with open('working/data/training/label_encoder_train.pkl', 'wb') as f:
    pickle.dump(dsreader.labelencoder, f)

### Load dev data
Use the saved label encodings from training

In [7]:
infile = 'working/data/dev/paper_dev_pipeline.ps.pages.p5.jsonl'
label_checkpoint_file = 'working/data/training/label_encoder_train.pkl'
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=label_checkpoint_file, database_path='/local/fever-common/data/fever/fever.db', type='test')
raw_test, test_data = dsreader.read()

100%|██████████| 9999/9999 [00:00<00:00, 12199.55it/s]
100%|██████████| 9999/9999 [00:00<00:00, 48049.33it/s]


In [8]:
ds_test = dsreader.get_dataset()
print(ds_test.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Load the BERT tokenizer

The FEVER vocab file is build using tokens that were concatenations of the train and the dev dataset

In [10]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow_text as text
bert_tokenizer_params=dict(lower_case=True)
vocab_file_out = 'working/data/fever_vocab.txt'
pt_tokenizer = text.BertTokenizer(vocab_file_out, **bert_tokenizer_params)

#### Prepare the tensor dataset for evaluation

In [12]:
import tensorflow as tf
import numpy as np

In [13]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 32000
def tokenize_and_pad(text, max_len):
    segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
    inp = segment.to_tensor(shape=[None, max_len])
    return inp[0]

h = ds_test.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
e = ds_test.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = ds_test.map(lambda x, y: y)
print(h)
print(e)
f = tf.data.Dataset.zip((h,e))
d = tf.data.Dataset.zip((f,l))
# do not shuffle
dataset_test = d.batch(BATCH_SIZE, drop_remainder=True)
print(dataset_test)
print(dataset_test.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


#### Load the prefilled embedding matrix from glove 300d

In [14]:
npzfile = np.load("working/data/embedding_mappings_300d.npz")
npzfile.files

['arr_0']

In [15]:
embedding_matrix = npzfile['arr_0']

#### Build the network

In [16]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.regularizers import *

vocab_size= 8000
dim = 300
inp1 = keras.Input(shape=(None, ), name = "hypothesis")
inp2 = keras.Input(shape=(None, ), name = "evidence")

embedding_hyp_layer = Embedding(
        input_dim=vocab_size+1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False)
embedding_evi_layer = Embedding(
        input_dim=vocab_size+1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False)


x_hyp = embedding_hyp_layer(inp1)
x_hyp = tf.keras.layers.Dropout(0.5)(x_hyp)

x_evi = embedding_evi_layer(inp2)
x_evi = tf.keras.layers.Dropout(0.5)(x_evi)


lstm_layer1 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True))(x_hyp)

lstm_layer2 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True))(x_evi)


F_p, F_h = lstm_layer1, lstm_layer2
Eph = keras.layers.Dot(axes=(2, 2))([F_h, F_p])  # [batch_size, Hsize, Psize]
Eh = Lambda(lambda x: keras.activations.softmax(x))(Eph)  # [batch_size, Hsize, Psize]
Ep = keras.layers.Permute((2, 1))(Eph)  # [batch_size, Psize, Hsize)
Ep = Lambda(lambda x: keras.activations.softmax(x))(Ep)  # [batch_size, Psize, Hsize]

# 4, Normalize score matrix, encoder premesis and get alignment
PremAlign = keras.layers.Dot((2, 1))([Ep, lstm_layer2]) # [-1, Psize, dim]
HypoAlign = keras.layers.Dot((2, 1))([Eh, lstm_layer1]) # [-1, Hsize, dim]
mm_1 = keras.layers.Multiply()([lstm_layer1, PremAlign])
mm_2 = keras.layers.Multiply()([lstm_layer2, HypoAlign])
sb_1 = keras.layers.Subtract()([lstm_layer1, PremAlign])
sb_2 = keras.layers.Subtract()([lstm_layer2, HypoAlign])
    

# concat [a_, a~, a_ * a~, a_ - a~], isto za b_, b~
PremAlign = keras.layers.Concatenate()([lstm_layer1, PremAlign, sb_1, mm_1,])  # [batch_size, Psize, 2*unit]
HypoAlign = keras.layers.Concatenate()([lstm_layer2, HypoAlign, sb_2, mm_2])  # [batch_size, Hsize, 2*unit]


# ff layer w/RELU activation
Compresser = tf.keras.layers.TimeDistributed(Dense(300,
                                   kernel_regularizer=l2(0.0),
                                   bias_regularizer=l2(0.0),
                                   activation='relu'),
                             name='Compresser')

PremAlign = Compresser(PremAlign)
HypoAlign = Compresser(HypoAlign)
    

Decoder = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True), name='finaldecoder')


PremAlign = Dropout(0.5)(PremAlign)
HypoAlign = Dropout(0.5)(HypoAlign)
final_p = Decoder(PremAlign)
final_h = Decoder(HypoAlign)


AveragePooling = tf.keras.layers.GlobalAveragePooling1D()
MaxPooling = tf.keras.layers.GlobalMaxPooling1D()

# AveragePooling = Lambda(lambda x: K.mean(x, axis=1)) # outs [-1, dim]
# MaxPooling = Lambda(lambda x: K.max(x, axis=1)) # outs [-1, dim]
avg_p = AveragePooling(final_p)
avg_h = AveragePooling(final_h)
max_p = MaxPooling(final_p)
max_h = MaxPooling(final_h)
# concat of avg and max pooling for hypothesis and premise
Final = keras.layers.Concatenate()([avg_p, max_p, avg_h, max_h])
# dropout layer
Final = Dropout(0.5)(Final)
# ff layer w/tanh activation
Final = Dense(100,
              kernel_regularizer=l2(0.0),
              bias_regularizer=l2(0.0),
              name='dense300_',
              activation='tanh')(Final)

# last dropout factor
factor = 1
# if self.LastDropoutHalf:
#     factor = 2
Final = Dropout(0.5 / factor)(Final)

# softmax classifier
Final = Dense(3,
              activation='softmax',
              name='judge300_')(Final)
model = tf.keras.Model(inputs=[inp1, inp2], outputs=Final)

LearningRate = 4e-4
GradientClipping = 10.0

# Optimizer = keras.optimizers.Adam(lr = LearningRate,
#             clipnorm = GradientClipping)

model.compile(loss='categorical_crossentropy',
          optimizer='adam',
          metrics=['accuracy'])

model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hypothesis (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    2400300     hypothesis[0][0]                 
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2400300     evidence[0][0]                   
______________________________________________________________________________________________

#### Evaluate model accuracy on DEV dataset

In [17]:
checkpoint_filepath = 'tmp/attention_esim/checkpoint_fever_rte_esim'
model.load_weights(checkpoint_filepath)
model.evaluate(dataset_test)



[1.902360200881958, 0.5639022588729858]

#### Calculate the FEVER score

- Strictly correct: when all the evidences predicted are correct and the predicted label is correct
- Correct: when only the predicted label is correct 

#### Compute the precision and recall

In [19]:
y_pred_proba = model.predict(dataset_test)

In [21]:
y_pred = np.argmax(y_pred_proba, axis = 1)

In [22]:
outfile = 'working/data/dev_y_preds.npz'
np.savez(outfile, y_pred)

In [23]:
ds_y = dataset_test.map(lambda f, l: l)
y_test_onehot = []
for d in ds_y.batch(1):
    for d1 in d:
        y_test_onehot.append(d1.numpy())

In [24]:
y_test = np.array([np.argmax(a, axis=1) for a in y_test_onehot]).flatten()

In [25]:
outfile = 'working/data/dev_y_tests.npz'
np.savez(outfile, y_test)

In [26]:
[d['label_text'] for d in test_data[:4]]

['NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'NOT ENOUGH INFO']

In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
#['NOT ENOUGH INFO', 'REFUTES', 'SUPPORTS'] == [0, 1, 2]

              precision    recall  f1-score   support

           0       0.47      0.87      0.61      3325
           1       0.81      0.30      0.44      3331
           2       0.65      0.52      0.58      3328

    accuracy                           0.56      9984
   macro avg       0.65      0.56      0.54      9984
weighted avg       0.65      0.56      0.54      9984



### In dataset type B

We have predicted pages and predicted sentences per page for each claim. We need the predictions for those pages and sentences to compute the FEVER score.

In [28]:
raw_test[:1][0]['evidence']

[[[108548,
   None,
   'Colin_Kaepernick',
   -1,
   [["Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion ."],
    []]],
  [-1,
   None,
   'Colin_Kaepernick',
   -2,
   [["He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .",
     'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .',
     'In 2016 , Kaepernick gained national attention when he began protesting by not standing while the United States national anthem was being performed before the start of games , motivated by what he viewed as the oppression of non-white races in the U.S. His actions prompted a wide variety of responses , including additional athletes in the NFL and other U.S.

### In dataset type A

We also need the original annotated pages and the sentences from the original file.

In [30]:
#!head -2 working/data/training/paper_test.ns.pages.p5.jsonl
#re-read the original data, for the annotated evidences
import json
from tqdm import tqdm
class Reader:
    def __init__(self,encoding="utf-8"):
        self.enc = encoding
    def read(self,file):
        with open(file,"r",encoding = self.enc) as f:
            return self.process(f)
    def process(self,f):
        pass

class JSONLineReader(Reader):
    def process(self,fp):
        data = []
        for line in tqdm(fp.readlines()):
            data.append(json.loads(line.strip()))
        return data
    
jlr = JSONLineReader()
split = 'paper_dev'
working_dir = 'working/data/'
k = 5
test_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
data_orig = jlr.read(test_data_file)
orig_evidences = [d['evidence'] for d in data_orig[:len(y_test)]]

100%|██████████| 9999/9999 [00:00<00:00, 12474.35it/s]


Generate the final predictions, for the label, the predicted pages and the sentences.

In [31]:
split = 'paper_dev_predicted'
k = 5
with open(working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
    print("Saving to training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k))
    for rec, orig, true_label, predicted_label in tqdm(zip(raw_test[:len(y_test)], orig_evidences, y_test, y_pred)):
        evs = []
        for evidence_group in rec['evidence']:
            for evidence in evidence_group:
                #print(evidence)
                if evidence[0] > -1:
                    ev = [evidence[0], evidence[1], evidence[2], evidence[4][1]]
                    evs.append(ev)

        out = {'true_label': str(true_label), 'predicted_label': str(predicted_label), 'orig': orig, 'pred': evs}

        f_out.write(json.dumps(out) + "\n")

1400it [00:00, 13901.25it/s]

Saving to training/paper_dev_predicted_pipeline.ps.pages.p5.jsonl


9984it [00:00, 18979.29it/s]


Load the predictions from the file to use in fever scorer

In [32]:
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_dev_predicted'
working_dir = 'working/data/'
k = 5
test_data_file = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
predicted_results = jlr.read(test_data_file)

100%|██████████| 9984/9984 [00:00<00:00, 43543.06it/s]


In [33]:
def fever_score(predicted_results):
    strictly_correct = 0
    correct = 0
    cnt = 0
    for d in tqdm(predicted_results):
        true_label = d['true_label']
        predicted_label = d['predicted_label']
        true_evidence = d['orig']
        predicted_evidence = d['pred']
        te = {}
        pe = {}
        #is correct?
        if (true_label == predicted_label):
            correct += 1
            # is strictly correct?
            if (true_label != '0') and (true_label==predicted_label):
                for eg in true_evidence:
                    for e in eg:
                        if e[2] in te:
                            te[e[2]].append(e[3])
                        else:
                            te[e[2]]= [e[3]]    

                for e in predicted_evidence:
                    if e[2] in pe:
                        pe[e[2]].append(e[3])
                    else:
                        pe[e[2]]= [e[3]]

                # for each annotated evidence, see if we predicted the evidences
                # did we correctly predict all pages?
                all_pages = all([k1 in pe.keys() for k1 in te.keys()])
                if all_pages:
                    #for the pages we predicted, did we predict all the sentences?
                    for k in te.keys():
                        if k in pe: # the page is predicted
                            true_sents = np.unique(te[k])
                            pre_sents = np.unique(pe[k][0])
                            #if all the true sentences were predicted
                            match = all([actual_sent in pre_sents for actual_sent in true_sents])
                            #if match and (len(true_sents) == len(pre_sents)):
                            #we are predicting 5 lines per page, so the count may not match with the true evidence lines
                            if match:
                                strictly_correct += 1
            elif (true_label == '0') and (true_label == predicted_label): # not enough info
                    strictly_correct += 1
    noevscore = np.round(correct/len(predicted_results)*100,2)
    score = np.round(strictly_correct/len(predicted_results)*100,2)
    print("noevscore={}, score={}".format(noevscore, score))
    return noevscore, score

In [34]:
fever_score(predicted_results)

100%|██████████| 9984/9984 [00:00<00:00, 49454.78it/s]

noevscore=56.39, score=35.96





(56.39, 35.96)

#### From original FEVER paper
The classification
accuracy is <b>32.57%</b>. Ignoring the requirement for
correct evidence (NoScoreEv) the accuracy is
<b>52.09%</b>.