#### Evaluate the model on test dataset

The test dataset will use predicted pages and predicted sentences. 

The predictions are generated by a separate process in our pipeline, which must be executed before this step.


#### Structure of the test dataset

format: 
- id: id of the claim
- label: the text label of the example (e.g. SUPPORTS, REFUTES or NOT ENOUGH INFO)
- claim: the claim text
- evidence: array of evidence groups
- evidence group: [evidence id, N/A, Document Id, evidence tag, [array of closest sentences/lines, array of those line ids in the page]]

We will need to read this data, format it, and extract the evidence and the sentences.

In [2]:
from mda.src.dataset.DatasetReader import DatasetReader

#### Load training data

In [3]:
# Training split; no label checkpoint is supplied to the reader here.
infile = 'working/data/training/train.ns.pages.p5.jsonl'
dsreader = DatasetReader(
    in_file=infile,
    label_checkpoint_file=None,
    database_path='data/data/fever/fever.db',
)
# read() yields two values: the raw records and the formatted data.
raw, data = dsreader.read()

100%|██████████| 145449/145449 [00:01<00:00, 84174.33it/s]
100%|██████████| 145449/145449 [00:01<00:00, 143689.73it/s]


In [5]:
# Build the dataset object for the training split from the reader created above.
ds_train = dsreader.get_dataset()

In [6]:
# Persist the training reader's label encoder to disk as a pickle file.
import pickle
with open('working/data/training/label_encoder_train.pkl', 'wb') as encoder_file:
    pickle.dump(dsreader.labelencoder, encoder_file)

### Load test data
Use the saved label encodings from training

In [4]:
# Test split, read with the label encoder checkpoint saved from training.
infile = 'working/data/training/paper_test_pipeline.ps.pages.p5.jsonl'
label_checkpoint_file = 'working/data/training/label_encoder_train.pkl'
dsreader = DatasetReader(
    in_file=infile,
    label_checkpoint_file=label_checkpoint_file,
    database_path='data/data/fever/fever.db',
    type='test',
)
# read() yields the raw records and the formatted data, in that order.
raw_test, test_data = dsreader.read()

100%|██████████| 9999/9999 [00:00<00:00, 17446.99it/s]
100%|██████████| 9999/9999 [00:00<00:00, 137812.57it/s]


In [5]:
# Build the test dataset and show the structure (shapes and dtypes) of its elements.
ds_test = dsreader.get_dataset()
print(ds_test.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Load the BERT tokenizer

The FEVER vocab file is built from the tokens of the concatenated train and dev datasets.

In [10]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow_text as text
# Vocabulary file and keyword options handed to the BERT tokenizer.
vocab_file_out = 'working/data/fever_vocab.txt'
bert_tokenizer_params = {'lower_case': True}
pt_tokenizer = text.BertTokenizer(vocab_file_out, **bert_tokenizer_params)

#### Prepare the tensor dataset for evaluation

In [11]:
import tensorflow as tf
import numpy as np

In [12]:
# Constants for building the evaluation dataset.
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 32000  # not referenced in this cell


def tokenize_and_pad(text, max_len):
    """Tokenize `text` and return row 0 of a dense tensor whose second dimension is `max_len`.

    Note: inside this function the parameter `text` shadows the
    `tensorflow_text` module alias of the same name.
    """
    token_ids = pt_tokenizer.tokenize(text)
    flat_ids = token_ids.merge_dims(1, -1)
    dense_ids = flat_ids.to_tensor(shape=[None, max_len])
    return dense_ids[0]


# Element 0 of each input pair feeds `h`, element 1 feeds `e`; `l` keeps the labels.
h = ds_test.map(lambda pair, label: tokenize_and_pad(pair[0], MAX_SEQ_LEN))
e = ds_test.map(lambda pair, label: tokenize_and_pad(pair[1], MAX_SEQ_LEN))
l = ds_test.map(lambda pair, label: label)
print(h)
print(e)
# Re-assemble as ((h, e), label) elements.
f = tf.data.Dataset.zip((h, e))
d = tf.data.Dataset.zip((f, l))
# No shuffling: element order must stay aligned with the raw test records.
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
dataset_test = d.batch(BATCH_SIZE, drop_remainder=True)
print(dataset_test)
print(dataset_test.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


In [13]:
ls working/data

claim_texts.jsonl  embedding_mappings_300d.npz    test_y_preds.npz  [0m[01;34mtraining[0m/
[01;34mdev[0m/               fever_vocab.txt                test_y_tests.npz
dev_labels.npz     matching_page_sentences.jsonl  train_labels.npz
dev_x.npz          [01;34mout[0m/                           train_x.npz


#### Load the prefilled embedding matrix from glove 300d

In [13]:
# Open the .npz archive with the embedding mappings and list the arrays stored in it.
npzfile = np.load("working/data/embedding_mappings_300d.npz")
npzfile.files

['arr_0']

In [14]:
# 'arr_0' is the only array in the archive (see the listing above).
embedding_matrix = npzfile['arr_0']

#### Build the network

In [15]:
from mda.src.model.esim import esim

In [17]:
# Configure the ESIM wrapper with the pre-filled embedding matrix, then build the model.
esim_model = esim(
    embedding_matrix=embedding_matrix,
    vocab_size=8000,
    embedding_dim=300,
    alignment_dense_dim=300,
    final_dense_dim=100,
)
model = esim_model.build_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hypothesis (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    2400300     hypothesis[0][0]                 
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2400300     evidence[0][0]                   
______________________________________________________________________________________________

#### Check the test accuracy

In [18]:
# Restore the weights from the checkpoint, then evaluate on the batched test dataset.
checkpoint_filepath = 'tmp/attention_esim/checkpoint_fever_rte_esim'
model.load_weights(checkpoint_filepath)
# The displayed result has two values — presumably [loss, accuracy]; confirm
# against how the model is compiled.
model.evaluate(dataset_test)



[1.7892426252365112, 0.5669070482254028]

#### Calculate the FEVER score

- Strictly correct: when all the evidences predicted are correct and the predicted label is correct
- Correct: when only the predicted label is correct 

#### Compute the precision and recall

In [19]:
# Run the model over the batched test dataset; the next cell takes the argmax over axis 1.
y_pred_proba = model.predict(dataset_test)

In [20]:
# Predicted class id per example: index of the largest value along axis 1.
y_pred = y_pred_proba.argmax(axis=1)

In [21]:
# Save the predicted class ids; passed positionally, so np.savez stores them under 'arr_0'.
outfile = 'working/data/test_y_preds.npz'
np.savez(outfile, y_pred)

In [22]:
# Recover the integer class ids from the one-hot labels of the evaluation batches.
# Comprehensions are used so that no loop variable overwrites notebook-level
# names such as `d` (the zipped dataset from the preparation cell).
ds_y = dataset_test.map(lambda features, labels: labels)
# One array of one-hot labels per batch, in dataset order.
y_test_onehot = [labels.numpy() for labels in ds_y]
# Concatenating the per-batch argmax results also works when the last batch is
# smaller than the others; an empty dataset yields an empty integer array.
y_test = (
    np.concatenate([np.argmax(a, axis=1) for a in y_test_onehot])
    if y_test_onehot
    else np.array([], dtype=np.int64)
)

In [23]:
# Save the true class ids; passed positionally, so np.savez stores them under 'arr_0'.
outfile = 'working/data/test_y_tests.npz'
np.savez(outfile, y_test)

In [24]:
# Peek at the text labels of the first four formatted test records.
[d['label_text'] for d in test_data[:4]]

['NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'REFUTES']

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
# Class ids in the report, per the original author's note (confirm against the
# saved label encoder): ['NOT ENOUGH INFO', 'REFUTES', 'SUPPORTS'] == [0, 1, 2]

              precision    recall  f1-score   support

           0       0.49      0.84      0.62      3328
           1       0.74      0.32      0.45      3328
           2       0.64      0.54      0.59      3328

    accuracy                           0.57      9984
   macro avg       0.62      0.57      0.55      9984
weighted avg       0.62      0.57      0.55      9984



In [241]:
# Print the first line of the test input file to inspect the record layout.
!head -1 working/data/training/paper_test_pipeline.ps.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1, [["Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture ."], []]], [-1, null, "Grease_gun_-LRB-tool-RRB-", -2, [[], []]], [-1, null, "Nasal_sebum", -2, [[], []]], [-1, null, "Grease", -2, [[], []]], [-1, null, "Thermal_interface_material", -2, [[], []]]]]}


In [246]:
ls /local/fever-common/data/fever-data/

paper_dev.jsonl   shared_task_dev.jsonl   train.jsonl
paper_test.jsonl  shared_task_test.jsonl


For <b>fever score</b>, we will need to compare the labels and the evidences.

First we need to extract the true labels and evidences from the original training dataset.

In the original dataset, we will need to sample data for the NEI class just like we did for our original training

Note: the dataset root path should match what's in /local/fever-common/ in the 'fever-common' container


In [22]:
from mda.src.dataset.DatasetGenerator import DatasetGenerator

In [26]:
# Generator configured with the dataset root, the output directory and the database file.
ds_generator = DatasetGenerator(
    dataset_root='data/data/',
    out_dir='working/data/out/',
    database_path='data/data/fever/fever.db',
)

In [27]:
# Generate NEI evidence for the 'paper_test' split. The second argument (5)
# presumably selects the number of pages per claim (the output file is named
# "...p5.jsonl") — TODO confirm.
ds_generator.generate_nei_evidences('paper_test', 5)

  0%|          | 7/9999 [00:00<02:49, 58.97it/s]

Creating directory working/data/out/
Writing data to working/data/out//paper_test.ns.pages.p5.jsonl


100%|██████████| 9999/9999 [02:47<00:00, 59.85it/s] 


### In dataset type B

We have predicted pages and predicted sentences per page for each claim. We need the predictions for those pages and sentences to compute the FEVER score.

In [26]:
# Evidence groups of the first raw test record.
raw_test[0]['evidence']

[[[133128,
   None,
   'Grease_gun_-LRB-tool-RRB-',
   -1,
   [['Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .'],
    []]],
  [-1, None, 'Grease_gun_-LRB-tool-RRB-', -2, [[], []]],
  [-1, None, 'Nasal_sebum', -2, [[], []]],
  [-1, None, 'Grease', -2, [[], []]],
  [-1, None, 'Thermal_interface_material', -2, [[], []]]]]

### In dataset type A

We also need the original annotated pages and the sentences from the original file.

In [73]:
from tqdm import tqdm
import json
from mda.src.utils.readers import JSONLineReader
# NOTE(review): star import — the names this brings into scope are not visible
# in this notebook; consider importing them explicitly.
from mda.src.utils.eval import *
# Root directory (with trailing slash) for the data files read and written below.
working_dir = 'working/data/'

Read the original / gold evidences from the test file

In [80]:
# Locate and read the gold-evidence file for the chosen split.
split = 'paper_test'
k = 5
test_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
jlr = JSONLineReader()
data_orig = jlr.read(test_data_file)
# Keep one evidence entry per evaluated example; this relies on the file order
# matching the order of y_test.
orig_evidences = [record['evidence'] for record in data_orig[:len(y_test)]]

100%|██████████| 9999/9999 [00:00<00:00, 179961.06it/s]


Generate the final predictions, for the label, the predicted pages and the sentences.

In [81]:
# Write one JSON line per evaluated example: true label, predicted label,
# gold evidence and the predicted evidence taken from the raw test record.
split = 'paper_test_predicted'
k = 5
with open(working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
    print("Saving to training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k))
    for rec, orig, true_label, predicted_label in tqdm(zip(raw_test[:len(y_test)], orig_evidences, y_test, y_pred)):
        # Keep only evidence entries whose id (field 0) is non-negative, storing
        # [id, field 1, page, line ids] where the line ids come from field 4, item 1.
        evs = [
            [evidence[0], evidence[1], evidence[2], evidence[4][1]]
            for evidence_group in rec['evidence']
            for evidence in evidence_group
            if evidence[0] > -1
        ]
        out = {
            'true_label': str(true_label),
            'predicted_label': str(predicted_label),
            'orig': orig,
            'pred': evs,
        }
        f_out.write(json.dumps(out) + "\n")

9984it [00:00, 80206.42it/s]

Saving to training/paper_test_predicted_pipeline.ps.pages.p5.jsonl





In [82]:
# Print the first line of the predictions file written by the previous cell.
!head -1 working/data/training/paper_test_predicted_pipeline.ps.pages.p5.jsonl

{"true_label": "0", "predicted_label": "0", "orig": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1]]], "pred": [[133128, null, "Grease_gun_-LRB-tool-RRB-", []]]}


Load the predictions (we just made) from the file to use in fever scorer

In [84]:
def read_jsonl_data(filename):
    """Read `filename` with a fresh JSONLineReader and return the result of its `read` call."""
    return JSONLineReader().read(filename)

# Load the predictions file for the 'paper_test_predicted' split.
split = 'paper_test_predicted'
k = 5
filename = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
predicted_results = read_jsonl_data(filename)

100%|██████████| 9984/9984 [00:00<00:00, 15031.09it/s]


In [32]:
def fever_score(predicted_results):
    """Compute label accuracy and the strict, evidence-aware score, as percentages.

    Each record is a mapping with these keys:
      - 'true_label' / 'predicted_label': labels as strings. '0' is the class
        that is scored without looking at evidence.
      - 'orig': gold evidence groups; each item is indexed as
        [.., .., page, line].
      - 'pred': predicted evidence items, each indexed as
        [.., .., page, lines], where `lines` is a list of line ids (a single
        id is also accepted).

    A record is "correct" when the two labels are equal. It is "strictly
    correct" when it is correct and either the label is '0', or there is at
    least one gold page, every gold page was predicted, and every gold line
    of each page is among the lines predicted for that page. Each record is
    counted at most once, and all predicted entries for a page are considered.

    NOTE(review): this requires *all* gold pages to be covered. The official
    FEVER scorer accepts a claim when any one complete gold evidence group is
    covered — confirm which rule is intended here.

    Args:
        predicted_results: sized iterable of records as described above.

    Returns:
        (noevscore, score): label accuracy and strict score, both rounded to
        two decimals; (0.0, 0.0) when `predicted_results` is empty.
    """
    total = len(predicted_results)
    correct = 0
    strictly_correct = 0
    for record in tqdm(predicted_results):
        true_label = record['true_label']
        if true_label != record['predicted_label']:
            continue
        correct += 1

        # Label '0' needs no evidence check.
        if true_label == '0':
            strictly_correct += 1
            continue

        # page -> set of gold line ids, merged over all gold evidence groups
        gold_lines = {}
        for evidence_group in record['orig']:
            for item in evidence_group:
                gold_lines.setdefault(item[2], set()).add(item[3])

        # page -> set of predicted line ids, merged over every predicted entry
        predicted_lines = {}
        for item in record['pred']:
            lines = item[3]
            if not isinstance(lines, (list, tuple)):
                lines = [lines]
            predicted_lines.setdefault(item[2], set()).update(lines)

        # Count the record once, and only if every gold page is fully covered.
        if gold_lines and all(
            page in predicted_lines and lines <= predicted_lines[page]
            for page, lines in gold_lines.items()
        ):
            strictly_correct += 1

    if total == 0:
        # Avoid dividing by zero on empty input.
        noevscore = score = 0.0
    else:
        noevscore = np.round(correct / total * 100, 2)
        score = np.round(strictly_correct / total * 100, 2)
    print("noevscore={}, score={}".format(noevscore, score))
    return noevscore, score

In [85]:
# NOTE(review): `compute_fever_score` is not defined in this notebook; presumably
# it is provided by `from mda.src.utils.eval import *` — confirm. The `fever_score`
# function defined in the cell above is not the one called here.
compute_fever_score(predicted_results)

100%|██████████| 9984/9984 [00:00<00:00, 193731.93it/s]

noevscore=56.69, score=35.43





(56.69, 35.43)

#### From original FEVER paper
Finally, we predict entailment
using the Decomposable Attention model trained
with the NEARESTP strategy. The classification
accuracy is <b>31.87%</b>. Ignoring the requirement for
correct evidence (NoScoreEv) the accuracy is
<b>50.91%</b>.