#### Evaluate the model on dev dataset

The test dataset will use predicted pages and predicted sentences. 

The predictions are generated by a separate process in our pipeline, which must be executed before this step.


In [1]:
from mda.src.dataset.DatasetReader import DatasetReader

We need the label encoder; we will generate it from the training data.

In [2]:
# Read the training split with no label checkpoint (label_checkpoint_file=None);
# this reader's label encoder is the one pickled a few cells below and reused
# for the dev split.
infile = 'working/data/training/train.ns.pages.p5.jsonl'
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=None, database_path='data/data/fever/fever.db')
# read() returns two values; only the reader object itself is used again in the
# cells shown in this notebook.
raw, data = dsreader.read()

100%|██████████| 145449/145449 [00:01<00:00, 83508.15it/s] 
100%|██████████| 145449/145449 [00:01<00:00, 141248.98it/s]


In [3]:
# Build the dataset object for the training split from the reader.
ds_train = dsreader.get_dataset()

In [4]:
## Persist the training reader's label encoder so the dev reader can reload it
## through its label_checkpoint_file argument (same path is used there).
import pickle
with open('working/data/training/label_encoder_train.pkl', 'wb') as f:
    pickle.dump(dsreader.labelencoder, f)

### Load dev data
Use the saved label encodings from training

In [5]:
# Dev split produced by the retrieval pipeline (predicted pages / sentences).
infile = 'working/data/training/paper_dev_pipeline.ps.pages.p5.jsonl'
# Label encoder pickled from the training reader above.
label_checkpoint_file = 'working/data/training/label_encoder_train.pkl'
# NOTE: this rebinds `dsreader`; from here on it refers to the dev reader.
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=label_checkpoint_file, database_path='data/data/fever/fever.db', type='test')
# raw_test is reused later as the source of the predicted evidences;
# test_data is only inspected for its 'label_text' field.
raw_test, test_data = dsreader.read()

100%|██████████| 9999/9999 [00:00<00:00, 17616.52it/s]
100%|██████████| 9999/9999 [00:00<00:00, 133225.90it/s]


In [6]:
# Dataset for the dev split. Per the printed spec each element is a pair:
# a (2,) string tensor and a (3,) int32 label tensor.
ds_test = dsreader.get_dataset()
print(ds_test.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Load the BERT tokenizer

The FEVER vocab file is built from the tokens of the concatenated train and dev datasets.

In [7]:
# bert_vocab is imported but not called in the cells shown here; the vocab file
# below is loaded as-is rather than regenerated.
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow_text as text
# Tokenizer options: lower-case the input text.
bert_tokenizer_params=dict(lower_case=True)
# Path of the previously built FEVER vocabulary file.
vocab_file_out = 'working/data/fever_vocab.txt'
# Tokenizer shared by every cell below (used inside tokenize_and_pad).
pt_tokenizer = text.BertTokenizer(vocab_file_out, **bert_tokenizer_params)

#### Prepare the tensor dataset for evaluation

In [8]:
import tensorflow as tf
import numpy as np

In [9]:
# Number of examples per evaluation batch.
BATCH_SIZE = 64
# Fixed token length every hypothesis / evidence sequence is converted to.
MAX_SEQ_LEN = 60
# Not referenced in the cells shown here (the dev dataset is deliberately not shuffled).
BUFFER_SIZE = 32000
def tokenize_and_pad(text, max_len):
    """Tokenize ``text`` with the notebook-level ``pt_tokenizer`` and return a fixed-width id row.

    Args:
        text: input passed straight to ``pt_tokenizer.tokenize``.
        max_len: size of the last axis requested from ``to_tensor``.

    Returns:
        Row 0 of the dense tensor built from the tokenizer output; in this
        notebook that is a ``(max_len,)`` int64 vector per example.
    """
    token_ids = pt_tokenizer.tokenize(text)
    # Collapse every axis from 1 onwards into a single axis of ids.
    flat_ids = token_ids.merge_dims(1, -1)
    # Requesting an explicit last dimension fixes the output width at max_len.
    dense_ids = flat_ids.to_tensor(shape=[None, max_len])
    return dense_ids[0]

# Each ds_test element is (x, y) with x a 2-string tensor and y a 3-way label.
# x[0] feeds `h` and x[1] feeds `e` (presumably hypothesis / evidence, matching
# the model's input layer names - confirm against DatasetReader).
h = ds_test.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
e = ds_test.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
# Labels only, kept in the same order as h and e.
l = ds_test.map(lambda x, y: y)
print(h)
print(e)
# Re-assemble as ((h, e), label), the structure shown in the printed element_spec.
f = tf.data.Dataset.zip((h,e))
d = tf.data.Dataset.zip((f,l))
# do not shuffle: later cells align predictions with raw_test by position.
# NOTE(review): drop_remainder=True discards the final partial batch, so only
# 9984 of the 9999 dev examples are evaluated (see the support column in the
# classification report). The y_test extraction below relies on equal-sized
# batches, so the two would have to be changed together.
dataset_test = d.batch(BATCH_SIZE, drop_remainder=True)
print(dataset_test)
print(dataset_test.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


#### Load the prefilled embedding matrix from glove 300d

In [10]:
# Open the saved embedding archive and list the arrays it contains.
npzfile = np.load("working/data/embedding_mappings_300d.npz")
npzfile.files

['arr_0']

In [11]:
# 'arr_0' is the only array in the archive (see the listing above).
embedding_matrix = npzfile['arr_0']

#### Build the network

In [12]:
from mda.src.model.esim import esim

In [13]:
# Build the ESIM network around the pre-filled embedding matrix.
# The summary below reports 2,400,300 parameters per embedding layer, i.e.
# 8001 x 300 - one row more than vocab_size (presumably a reserved index;
# confirm in the esim implementation).
esim_model = esim(embedding_matrix=embedding_matrix, vocab_size = 8000, embedding_dim=300, alignment_dense_dim=300, final_dense_dim=100)
model = esim_model.build_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hypothesis (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    2400300     hypothesis[0][0]                 
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2400300     evidence[0][0]                   
______________________________________________________________________________________________

#### Evaluate model accuracy on DEV dataset

In [14]:
# Restore previously trained weights from this checkpoint path; it must exist
# before the cell is run.
checkpoint_filepath = 'tmp/attention_esim/checkpoint_fever_rte_esim'
model.load_weights(checkpoint_filepath)
# Returns a two-element list; the second value (~0.55) agrees with the accuracy
# in the classification report further down, so the first is presumably the loss.
model.evaluate(dataset_test)



[1.9557477235794067, 0.5500801205635071]

#### Calculate the FEVER score

- Strictly correct: when all the evidences predicted are correct and the predicted label is correct
- Correct: when only the predicted label is correct 

#### Compute the precision and recall

In [15]:
# Per-class scores for every example in the batched dev dataset, in dataset order.
y_pred_proba = model.predict(dataset_test)

In [16]:
# Predicted class id = index of the highest score in each row.
y_pred = np.argmax(y_pred_proba, axis = 1)

In [17]:
# Save the predicted class ids; passed positionally, so np.savez stores the
# array under its default key 'arr_0'.
outfile = 'working/data/dev_y_preds.npz'
np.savez(outfile, y_pred)

In [18]:
# Pull the true labels back out of the batched dataset, in the same order the
# model consumed them.
ds_y = dataset_test.map(lambda features, labels: labels)
# dataset_test is already batched, so every element here is one batch of
# 3-way label rows.
y_test_onehot = [label_batch.numpy() for label_batch in ds_y]
# Row-wise argmax turns each label row into a class id; flatten joins the batches.
y_test = np.array([np.argmax(batch_rows, axis=1) for batch_rows in y_test_onehot]).flatten()

In [19]:
# Save the true class ids next to the predictions (np.savez default key 'arr_0').
# Note that `outfile` is rebound here.
outfile = 'working/data/dev_y_tests.npz'
np.savez(outfile, y_test)

In [20]:
# Peek at the label text of the first four dev records.
[d['label_text'] for d in test_data[:4]]

['NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'NOT ENOUGH INFO']

In [21]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 over the evaluated dev examples.
print(classification_report(y_test, y_pred))
# Class ids in the report map to label text as:
#   0 = NOT ENOUGH INFO, 1 = REFUTES, 2 = SUPPORTS

              precision    recall  f1-score   support

           0       0.46      0.88      0.60      3325
           1       0.78      0.30      0.43      3331
           2       0.66      0.48      0.56      3328

    accuracy                           0.55      9984
   macro avg       0.63      0.55      0.53      9984
weighted avg       0.63      0.55      0.53      9984



For <b>fever score</b>, we will need to compare the labels and the evidences.

First we need to extract the true labels and evidences from the original training dataset.

In the original dataset, we will need to sample data for the NEI class just like we did for our original training

Note: the dataset root path should match what's in /local/fever-common/ in the 'fever-common' container

We should have already generated the 'ns' file (i.e. **paper_dev.ns.pages.p5.jsonl**) during our training. We do not need to re-generate this file; the step below is included only for the sake of completeness. 

In [22]:
from mda.src.dataset.DatasetGenerator import DatasetGenerator

In [26]:
# Generator configured with the dataset root, an output directory and the FEVER database.
ds_generator = DatasetGenerator(dataset_root='data/data/',out_dir='working/data/out/', database_path='data/data/fever/fever.db')

In [27]:
# Optional step (see the note above): regenerate the 'ns' file for paper_dev.
# Per the log line below it is written under out_dir as
# paper_dev.ns.pages.p5.jsonl - a different folder from the training/ path the
# gold evidences are read from later.
ds_generator.generate_nei_evidences('paper_dev', 5)

  0%|          | 2/9999 [00:00<08:35, 19.41it/s]

Writing data to working/data/out//paper_dev.ns.pages.p5.jsonl


100%|██████████| 9999/9999 [02:20<00:00, 71.38it/s] 


We need the gold evidences and the predicted evidences along with the label predictions to compute the fever score.

In [25]:
from tqdm import tqdm
import json
from mda.src.utils.readers import JSONLineReader
# NOTE(review): star import; compute_fever_score used at the end of the notebook
# presumably comes from here - an explicit import would make that visible.
from mda.src.utils.eval import *
# Base directory for every file read or written in the cells below.
working_dir = 'working/data/'

Read the original / gold evidences from the dev file. The predicted evidences are already available in the **raw_test** dataset we read from the **paper_dev_pipeline.ps.pages.p5.jsonl** file.

In [26]:
jlr = JSONLineReader()
split = 'paper_dev'
k = 5
# Gold file: working/data/training/paper_dev.ns.pages.p5.jsonl
test_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
data_orig = jlr.read(test_data_file)
# Keep only as many gold records as were actually scored (the last partial
# batch was dropped). Assumes this file lists claims in the same order as
# raw_test - TODO confirm.
orig_evidences = [d['evidence'] for d in data_orig[:len(y_test)]]

100%|██████████| 9999/9999 [00:00<00:00, 180632.30it/s]


Generate the final predictions, for the label, the predicted pages and the sentences.

In [30]:
split = 'paper_dev_predicted'
k = 5
# One JSON line per scored claim: true / predicted class id (as strings), the
# gold evidence, and the flattened predicted evidence.
with open(working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
    print("Saving to training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k))
    # raw_test is cut to len(y_test) so all four sequences line up by position.
    scored = zip(raw_test[:len(y_test)], orig_evidences, y_test, y_pred)
    for rec, gold_evidence, gold_label, model_label in tqdm(scored):
        # Flatten the evidence groups, skipping entries whose first field is
        # not greater than -1, and keep fields 0-2 plus element 1 of field 4.
        kept = [
            [item[0], item[1], item[2], item[4][1]]
            for group in rec['evidence']
            for item in group
            if item[0] > -1
        ]

        line = {
            'true_label': str(gold_label),
            'predicted_label': str(model_label),
            'orig': gold_evidence,
            'pred': kept,
        }

        f_out.write(json.dumps(line) + "\n")

9984it [00:00, 78166.40it/s]

Saving to training/paper_dev_predicted_pipeline.ps.pages.p5.jsonl





Load the predictions from the file to use in fever scorer

In [31]:
def read_jsonl_data(filename):
    """Read ``filename`` with a fresh ``JSONLineReader`` and return the result of its ``read`` call."""
    return JSONLineReader().read(filename)
# Re-read the predictions file written by the previous cell (same split / k).
split = 'paper_dev_predicted'
k = 5
filename = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
predicted_results = read_jsonl_data(filename)

100%|██████████| 9984/9984 [00:00<00:00, 146605.42it/s]


In [32]:
# Returns a pair, printed below as (noevscore, score): the first value matches
# label-only accuracy; the second is the FEVER score, which per the definition
# above also requires the predicted evidence to be correct.
compute_fever_score(predicted_results)

100%|██████████| 9984/9984 [00:00<00:00, 125284.76it/s]

noevscore=55.01, score=36.0





(55.01, 36.0)

#### From original FEVER paper
The classification
accuracy is <b>32.57%</b>. Ignoring the requirement for
correct evidence (NoScoreEv) the accuracy is
<b>52.09%</b>.