In [1]:
ls working/data/training

label_encoder_train.npz
label_encoder_train.pkl
paper_dev.ns.pages.p5.jsonl
paper_dev_pipeline.ps.pages.p5.jsonl
paper_test.ns.pages.p5.jsonl
paper_test_pipeline.ns.pages.p5.jsonl
paper_test_pipeline.ps.pages.p5.jsonl
paper_test_predicted_pipeline.ps.pages.p5.jsonl
train.ns.pages.p5.jsonl
train.pages.p5.jsonl


In [2]:
!wc -l working/data/training/paper_dev_pipeline.ps.pages.p5.jsonl

5700 working/data/training/paper_dev_pipeline.ps.pages.p5.jsonl


In [3]:
!head -2 working/data/training/paper_dev_pipeline.ps.pages.p5.jsonl

{"id": 91198, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.", "evidence": [[[108548, null, "Colin_Kaepernick", -1, [["Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion ."], []]], [-1, null, "Colin_Kaepernick", -2, [["He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .", "Colin Rand Kaepernick -LRB- -LSB- ` k\u00e6p\u0259rn\u026ak -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .", "In 2016 , Kaepernick gained national attention when he began protesting by not standing while the United States national anthem was being performed before the start of games , m

In [4]:
class LabelSchema:
    def __init__(self,labels):
        self.labels = {self.preprocess(val):idx for idx,val in enumerate(labels)}
        self.idx = {idx:self.preprocess(val) for idx,val in enumerate(labels)}

    def get_id(self,label):
        if self.preprocess(label) in self.labels:
            return self.labels[self.preprocess(label)]
        return None

    def preprocess(self,item):
        return item.lower()

class FEVERLabelSchema(LabelSchema):
    def __init__(self):
        super().__init__(["supports", "refutes", "not enough info"])

def nltk_tokenizer(text):
    return " ".join(word_tokenize(text))

class test_line_formatter():    
    def format(self, lines):
        formatted = []
        for line in tqdm(lines):
            fl = self.format_line(line)
            if fl is not None:
                if isinstance(fl,list):
                    formatted.extend(fl)
                else:
                    formatted.append(fl)
        return formatted

    def format_line(self, line):
        label_schema = FEVERLabelSchema()
        # get the label, i.e. SUPPORTS etc.
        annotation = line["label"]
        if annotation is None:
            annotation = line["verifiable"]
        pages = []
        lines = []
        # did we get the closest sentences to the claim text? is this the sentence or the line number from the doc text?
        if 'predicted_sentences' in line:
            pages.extend([(ev[0], ev[1]) for ev in line["predicted_sentences"]])
        elif 'predicted_pages' in line:
            pages.extend([(ev[0], -1) for ev in line["predicted_pages"]])
        else:
            # only if evidence[0] is > -1, we have relevant predicted lines in evidence[4]
            for evidence_group in line["evidence"]:
                pages.extend([(ev[2], ev[3]) for ev in evidence_group])
            for evidence_group in line["evidence"]:
                for ev in evidence_group:
                    if ev[0] > -1:
                        lines.extend(ev[4][0])
        return {"claim": line["claim"], "evidence": pages, "lines": lines, "label": label_schema.get_id(annotation),
                "label_text": annotation}

In [5]:
class Reader:
    def __init__(self,encoding="utf-8"):
        self.enc = encoding

    def read(self,file):
        with open(file,"r",encoding = self.enc) as f:
            return self.process(f)

    def process(self,f):
        pass

class JSONLineReader(Reader):
    def process(self,fp):
        data = []
        for line in tqdm(fp.readlines()):
            data.append(json.loads(line.strip()))
        return data

In [6]:
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_dev'
working_dir = 'working/data/'
k = 5
test_data_file = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
data = jlr.read(test_data_file)

100%|██████████| 5700/5700 [00:00<00:00, 34753.01it/s]


In [7]:
data[:2]

[{'id': 91198,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
  'evidence': [[[108548,
     None,
     'Colin_Kaepernick',
     -1,
     [["Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion ."],
      []]],
    [-1,
     None,
     'Colin_Kaepernick',
     -2,
     [["He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .",
       'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .',
       'In 2016 , Kaepernick gained national attention when he began protesting by not standing while the United States nationa

In [8]:
formatter = test_line_formatter()
formatted_test_data = formatter.format(data)

100%|██████████| 5700/5700 [00:00<00:00, 59288.15it/s]


In [9]:
formatted_test_data[:3]

[{'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
  'evidence': [('Colin_Kaepernick', -1),
   ('Colin_Kaepernick', -2),
   ('Pistol_offense', -2),
   ('2016_San_Francisco_49ers_season', -2),
   ('2014_San_Francisco_49ers_season', -2)],
  'lines': ["Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion ."],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Tilda Swinton is a vegan.',
  'evidence': [('Swinton_-LRB-surname-RRB-', -1),
   ('Swinton_-LRB-surname-RRB-', -2),
   ('Tilda', -2),
   ('We_Need_to_Talk_About_Kevin_-LRB-film-RRB-', -2),
   ('Snowpiercer', -2)],
  'lines': ['Category : Scottish surnames'],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Fox 2000 Pictures released the film Soul Food.',
  'evidence': [('Soul_Food', -2),
   ('Soul_Food_-LRB-film-RRB-', -2),
   

In [50]:
[d for d in formatted_test_data if d['label_text'] == 'NOT ENOUGH INFO'][:5]

[{'claim': 'Grease had bad reviews.',
  'evidence': [('Grease_gun_-LRB-tool-RRB-', -1),
   ('Grease_gun_-LRB-tool-RRB-', -2),
   ('Nasal_sebum', -2),
   ('Grease', -2),
   ('Thermal_interface_material', -2)],
  'lines': ['Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .'],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Legendary Entertainment is the owner of Wanda Cinemas.',
  'evidence': [('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -1),
   ('Wanda_Cinemas', -2),
   ('Wanda_Group', -2),
   ('Legendary_Entertainment', -2),
   ('List_of_shows_produced_by_Legendary_Television', -2)],
  'lines': ['It is a part of the Dalian Wanda Group .',
   'As of 2014 Wang Jianlin is the head of the com

In [10]:
import unicodedata
import re
def unicode_to_ascii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
def preprocess(w):
        w = unicode_to_ascii(w.lower().strip())
        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)
        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
        w = w.strip()
        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        w = '[START] ' + w + ' [END]'
        return w

In [25]:
for data in formatted_test_data[:2]:
    claim = preprocess(data["claim"])
    lines = data["lines"]
    parts = [claim, " ".join(lines)]
    print (" ".join(parts))
    print("***********")

[START] grease had bad reviews . [END] Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .
***********
[START] ukrainian soviet socialist republic was a founding participant of the un . [END] The coat of arms of the Ukrainian Soviet Socialist Republic was adopted on March 14 , 1919 by the government of the Ukrainian Soviet Socialist Republic and subsequently modified on November 7 , 1928 , January 30 , 1937 and November 21 , 1949 . The rising sun stands for the future of the Soviet Ukrainian nation , the star as well as the hammer and sickle for the victory of communism and the `` world-wide socialist community of states '' . The name of the Ukrainian SSR is shown only in Ukrainian , and reads '' Українська PCP '' . The banner bears the Soviet Union state motto -LRB- `` Workers o

In [11]:
import numpy as np
import tensorflow as tf
def get_test_data_generator():
    for data in formatted_test_data:
        claim = preprocess(data["claim"])
        lines = data["lines"]
        yield claim, " ".join(lines)
        
def get_test_dataset():
    generator = lambda: get_test_data_generator()
    return tf.data.Dataset.from_generator(
            generator, output_signature=(
            tf.TensorSpec(shape=(2, ), dtype=tf.string)))

In [54]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [12]:
import tensorflow_text as text
bert_tokenizer_params=dict(lower_case=True)
vocab_file_out = 'working/data/fever_vocab.txt'
pt_tokenizer = text.BertTokenizer(vocab_file_out, **bert_tokenizer_params)

### One hot encode the labels

In [13]:
import pickle
with open('working/data/training/label_encoder_train.pkl', 'rb') as f:
    le = pickle.load(f)
print(le)
labels = [d['label_text'] for d in formatted_test_data]
labels_enc = le.transform(labels)
labels_enc

LabelEncoder()


array([0, 0, 2, ..., 2, 2, 1])

In [56]:
# from sklearn import preprocessing
# labels = [d['label_text'] for d in formatted_test_data]
# le = preprocessing.LabelEncoder()
# le.fit(labels)
# labels_enc = le.transform(labels)

In [14]:
test_labels = np.zeros(shape=(len(labels_enc),3))
for idx, val in enumerate(labels_enc):
    test_labels[idx][val]=1
print("A peek a the reshaped labels:")
print(test_labels[:5])
print("The datatypes of the training dataset, features={}, labels={}".format(type(labels_enc), type(test_labels)))

A peek a the reshaped labels:
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
The datatypes of the training dataset, features=<class 'numpy.ndarray'>, labels=<class 'numpy.ndarray'>


In [15]:
lbls = tf.reshape(tf.convert_to_tensor(test_labels, dtype=tf.int32), (test_labels.shape))
lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
lbls_ds

<TensorSliceDataset shapes: (3,), types: tf.int32>

In [16]:
test_ds = get_test_dataset()

In [60]:
for h,e in test_ds.take(2):
    print(h)
    print(".....\n\n")
    print(e)
    print("********\n\n")

tf.Tensor(b'[START] grease had bad reviews . [END]', shape=(), dtype=string)
.....


tf.Tensor(b'Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .', shape=(), dtype=string)
********


tf.Tensor(b'[START] ukrainian soviet socialist republic was a founding participant of the un . [END]', shape=(), dtype=string)
.....


tf.Tensor(b"The coat of arms of the Ukrainian Soviet Socialist Republic was adopted on March 14 , 1919 by the government of the Ukrainian Soviet Socialist Republic and subsequently modified on November 7 , 1928 , January 30 , 1937 and November 21 , 1949 . The rising sun stands for the future of the Soviet Ukrainian nation , the star as well as the hammer and sickle for the victory of communism and the `` world-wide socialist community of states '' . The name of the

In [17]:
test_ds_enc_labls = tf.data.Dataset.zip((test_ds, lbls_ds))
print(test_ds_enc_labls.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Tokenize the test data and prepare the tensors for training

In [18]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 32000
def tokenize_and_pad(text, max_len):
    segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
    inp = segment.to_tensor(shape=[None, max_len])
    return inp[0]

h = test_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
e = test_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = test_ds_enc_labls.map(lambda x, y: y)
print(h)
print(e)
f = tf.data.Dataset.zip((h,e))
d = tf.data.Dataset.zip((f,l))
# do not shuffle
dataset_test = d.batch(BATCH_SIZE, drop_remainder=True)
print(dataset_test)
print(dataset_test.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


#### Load the prefilled embedding matrix from glove 300d

In [19]:
npzfile = np.load("working/data/embedding_mappings_300d.npz")
npzfile.files

['arr_0']

In [20]:
embedding_matrix = npzfile['arr_0']

#### Build the network

In [21]:

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.regularizers import *

vocab_size= 8000
dim = 300
inp1 = keras.Input(shape=(None, ), name = "hypothesis")
inp2 = keras.Input(shape=(None, ), name = "evidence")

embedding_hyp_layer = Embedding(
        input_dim=vocab_size+1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False)
embedding_evi_layer = Embedding(
        input_dim=vocab_size+1,
        output_dim=dim,
        weights=[embedding_matrix],
        trainable=False)


x_hyp = embedding_hyp_layer(inp1)
x_hyp = tf.keras.layers.Dropout(0.5)(x_hyp)

x_evi = embedding_evi_layer(inp2)
x_evi = tf.keras.layers.Dropout(0.5)(x_evi)


lstm_layer1 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True))(x_hyp)

lstm_layer2 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True))(x_evi)



F_p, F_h = lstm_layer1, lstm_layer2
Eph = keras.layers.Dot(axes=(2, 2))([F_h, F_p])  # [batch_size, Hsize, Psize]
Eh = Lambda(lambda x: keras.activations.softmax(x))(Eph)  # [batch_size, Hsize, Psize]
Ep = keras.layers.Permute((2, 1))(Eph)  # [batch_size, Psize, Hsize)
Ep = Lambda(lambda x: keras.activations.softmax(x))(Ep)  # [batch_size, Psize, Hsize]
    
    
    
# 4, Normalize score matrix, encoder premesis and get alignment
PremAlign = keras.layers.Dot((2, 1))([Ep, lstm_layer2]) # [-1, Psize, dim]
HypoAlign = keras.layers.Dot((2, 1))([Eh, lstm_layer1]) # [-1, Hsize, dim]
mm_1 = keras.layers.Multiply()([lstm_layer1, PremAlign])
mm_2 = keras.layers.Multiply()([lstm_layer2, HypoAlign])
sb_1 = keras.layers.Subtract()([lstm_layer1, PremAlign])
sb_2 = keras.layers.Subtract()([lstm_layer2, HypoAlign])
    

# concat [a_, a~, a_ * a~, a_ - a~], isto za b_, b~
PremAlign = keras.layers.Concatenate()([lstm_layer1, PremAlign, sb_1, mm_1,])  # [batch_size, Psize, 2*unit]
HypoAlign = keras.layers.Concatenate()([lstm_layer2, HypoAlign, sb_2, mm_2])  # [batch_size, Hsize, 2*unit]


# ff layer w/RELU activation
Compresser = tf.keras.layers.TimeDistributed(Dense(300,
                                   kernel_regularizer=l2(0.0),
                                   bias_regularizer=l2(0.0),
                                   activation='relu'),
                             name='Compresser')

PremAlign = Compresser(PremAlign)
HypoAlign = Compresser(HypoAlign)
    

Decoder = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim), return_sequences=True), name='finaldecoder')


PremAlign = Dropout(0.5)(PremAlign)
HypoAlign = Dropout(0.5)(HypoAlign)
final_p = Decoder(PremAlign)
final_h = Decoder(HypoAlign)


AveragePooling = tf.keras.layers.GlobalAveragePooling1D()
MaxPooling = tf.keras.layers.GlobalMaxPooling1D()

# AveragePooling = Lambda(lambda x: K.mean(x, axis=1)) # outs [-1, dim]
# MaxPooling = Lambda(lambda x: K.max(x, axis=1)) # outs [-1, dim]
avg_p = AveragePooling(final_p)
avg_h = AveragePooling(final_h)
max_p = MaxPooling(final_p)
max_h = MaxPooling(final_h)
# concat of avg and max pooling for hypothesis and premise
Final = keras.layers.Concatenate()([avg_p, max_p, avg_h, max_h])
# dropout layer
Final = Dropout(0.5)(Final)
# ff layer w/tanh activation
Final = Dense(100,
              kernel_regularizer=l2(0.0),
              bias_regularizer=l2(0.0),
              name='dense300_',
              activation='tanh')(Final)

# last dropout factor
factor = 1
# if self.LastDropoutHalf:
#     factor = 2
Final = Dropout(0.5 / factor)(Final)

# softmax classifier
Final = Dense(3,
              activation='softmax',
              name='judge300_')(Final)
model = tf.keras.Model(inputs=[inp1, inp2], outputs=Final)

LearningRate = 4e-4
GradientClipping = 10.0

# Optimizer = keras.optimizers.Adam(lr = LearningRate,
#             clipnorm = GradientClipping)

model.compile(loss='categorical_crossentropy',
          optimizer='adam',
          metrics=['accuracy'])

model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hypothesis (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    2400300     hypothesis[0][0]                 
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2400300     evidence[0][0]                   
______________________________________________________________________________________________

In [24]:
# checkpoint_filepath = 'tmp/attention_esim/checkpoint_fever_rte_esim'
# model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_filepath,
#     save_weights_only=True,
#     monitor='val_accuracy',
#     mode='max',
#     save_best_only=True)

# stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

#### Check the test accuracy

In [22]:
checkpoint_filepath = 'tmp/attention_esim/checkpoint_fever_rte_esim'
model.load_weights(checkpoint_filepath)
model.evaluate(dataset_test)



[1.9135409593582153, 0.5647823214530945]

#### Calculate the FEVER score

- Strictly correct: when all the evidences predicted are correct and the predicted label is correct
- Correct: when only the predicted label is correct 

In [23]:
y_pred_proba = model.predict(dataset_test)

In [24]:
y_pred = np.argmax(y_pred_proba, axis = 1)

In [25]:
y_pred[:5]

array([0, 0, 0, 0, 2])

In [26]:
ds_y = dataset_test.map(lambda f, l: l)
y_test_onehot = []
for d in ds_y.batch(1):
    for d1 in d:
        y_test_onehot.append(d1.numpy())

In [27]:
y_test = np.array([np.argmax(a, axis=1) for a in y_test_onehot]).flatten()

In [28]:
y_test[:4]

array([0, 0, 2, 0])

In [176]:
[d['label_text'] for d in formatted_test_data[:4]]

['NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'REFUTES']

In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
#['NOT ENOUGH INFO', 'REFUTES', 'SUPPORTS'] == [0, 1, 2]

              precision    recall  f1-score   support

           0       0.47      0.87      0.61      1863
           1       0.82      0.31      0.45      1902
           2       0.66      0.52      0.58      1931

    accuracy                           0.56      5696
   macro avg       0.65      0.57      0.55      5696
weighted avg       0.65      0.56      0.55      5696



In [4]:
!head -3 working/data/training/paper_test_pipeline.ps.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1, [["Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture ."], []]], [-1, null, "Grease_gun_-LRB-tool-RRB-", -2, [[], []]], [-1, null, "Nasal_sebum", -2, [[], []]], [-1, null, "Grease", -2, [[], []]], [-1, null, "Thermal_interface_material", -2, [[], []]]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Emblem_of_the_Ukrainian_Soviet_Socialist_Republic", -2, [["The coat of arms of the Ukrainian Soviet Socialist Republic was adopted on March 14 , 1919 by the government of the Ukrain

In [3]:
!head working/data/training/paper_test.ns.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Ukrainian_Soviet_Socialist_Republic", 7]], [[298602, 290067, "Ukrainian_Soviet_Socialist_Republic", 7], [298602, 290067, "United_Nations", 0]], [[300696, 291816, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344347, 327887, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344994, 328433, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344997, 328435, "Ukrainian_Soviet_Socialist_Republic", 7]]]}
{"id": 70041, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "2 Hearts is a musical composition by Minogue.", "evidence": [[[225394, 230056, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]], [[317953, 306972, "2_Hearts_-LRB-Kylie_Minogue_song

In [130]:
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_dev'
working_dir = 'working/data/'
k = 5
test_data_file = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
data = jlr.read(test_data_file)

100%|██████████| 9999/9999 [00:00<00:00, 22138.21it/s]


In [132]:
data[:1]

[{'id': 113501,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Grease had bad reviews.',
  'evidence': [[[133128,
     None,
     'Grease_gun_-LRB-tool-RRB-',
     -1,
     [['Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .'],
      []]],
    [-1, None, 'Grease_gun_-LRB-tool-RRB-', -2, [[], []]],
    [-1, None, 'Nasal_sebum', -2, [[], []]],
    [-1, None, 'Grease', -2, [[], []]],
    [-1, None, 'Thermal_interface_material', -2, [[], []]]]]}]

In [137]:
len(data[:9984])

9984

In [135]:
len(y_pred)

9984

In [178]:
len(y_test)

9984

In [146]:
formatted_test_data[2:4]

[{'claim': '2 Hearts is a musical composition by Minogue.',
  'evidence': [('2_Hearts_-LRB-Kylie_Minogue_song-RRB-', -2),
   ('Kylie_Minogue_singles_discography', -2),
   ('Kylie_Minogue', -2),
   ('X_-LRB-Kylie_Minogue_album-RRB-', -2),
   ('2_Hearts_-LRB-Kylie_Minogue_song-RRB-', -2)],
  'lines': ["`` 2 Hearts '' is a song recorded by Australian singer Kylie Minogue for her tenth studio album , X -LRB- 2007 -RRB- .",
   "The music video for `` 2 Hearts '' was directed by Dawn Shadforth and filmed at Shepperton Studios in London , England .",
   "Upon its release , `` 2 Hearts '' received mixed reviews from music critics .",
   "The song was praised for its departure of musical content and the song 's strength , however the song received criticism for the song 's production and felt it did n't live up to expectations .",
   "The song was Minogue 's first commercial single since she was diagnosed with breast cancer in May 2005 .",
   "It produced five singles , including the Australian

In [133]:
ls working/data/training

label_encoder_train.npz       paper_test_pipeline.ns.pages.p5.jsonl
label_encoder_train.pkl       paper_test_pipeline.ps.pages.p5.jsonl
paper_dev.ns.pages.p5.jsonl   train.ns.pages.p5.jsonl
paper_test.ns.pages.p5.jsonl  train.pages.p5.jsonl


In [30]:
#!head -2 working/data/training/paper_test.ns.pages.p5.jsonl
#re-read the original data, for the annotated evidences
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_dev'
working_dir = 'working/data/'
k = 5
test_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
data_orig = jlr.read(test_data_file)
orig_evidences = [d['evidence'] for d in data_orig[:len(y_test)]]

100%|██████████| 9999/9999 [00:00<00:00, 178577.16it/s]


In [31]:
split = 'paper_dev_predicted'
k = 5
# cnt = 0
with open(working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
    print("Saving to training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k))
    for rec, orig, true_label, predicted_label in tqdm(zip(data[:len(y_test)], orig_evidences, y_test, y_pred)):
        o = {}
        o['id'] = rec['id']
        o['label'] = rec['label']
        evs = []
        for evidence_group in rec['evidence']:
            for evidence in evidence_group:
                if evidence[0] > -1:
                    ev = [evidence[0], evidence[1], evidence[2], evidence[4][1]]
                    evs.append(ev)
            #o['evidence'] = d['evidence']
        #print(o)
        out = {'true_label': str(true_label), 'predicted_label': str(predicted_label), 'orig': orig, 'pred': evs}
#         cnt += 1
#         if cnt > 9:
#             break
        f_out.write(json.dumps(out) + "\n")

5696it [00:00, 79683.93it/s]

Saving to training/paper_dev_predicted_pipeline.ps.pages.p5.jsonl





In [32]:
!wc -l working/data/training/paper_dev_predicted_pipeline.ps.pages.p5.jsonl

5696 working/data/training/paper_dev_predicted_pipeline.ps.pages.p5.jsonl


In [33]:
!head working/data/training/paper_dev_predicted_pipeline.ps.pages.p5.jsonl

{"true_label": "0", "predicted_label": "0", "orig": [[[108548, null, "Colin_Kaepernick", -1]]], "pred": [[108548, null, "Colin_Kaepernick", []]]}
{"true_label": "0", "predicted_label": "0", "orig": [[[227768, null, "Swinton_-LRB-surname-RRB-", -1]]], "pred": [[227768, null, "Swinton_-LRB-surname-RRB-", []]]}
{"true_label": "2", "predicted_label": "0", "orig": [[[289914, 283015, "Soul_Food_-LRB-film-RRB-", 0]], [[291259, 284217, "Soul_Food_-LRB-film-RRB-", 0]], [[293412, 285960, "Soul_Food_-LRB-film-RRB-", 0]], [[337212, 322620, "Soul_Food_-LRB-film-RRB-", 0]], [[337214, 322622, "Soul_Food_-LRB-film-RRB-", 0]]], "pred": [[289914, 283015, "Soul_Food", [3, 2]], [291259, 284217, "Soul_Food_-LRB-film-RRB-", [0, 4]], [293412, 285960, "Ramona_and_Beezus", [4, 1]], [337212, 322620, "Maxine_Chadway", [1]], [337214, 322622, "John_C._Kilkenny", [1, 12, 0, 5]]]}
{"true_label": "0", "predicted_label": "0", "orig": [[[191656, null, "List_of_Ace_titles_in_numeric_series", -1], [191657, null, "List_of

In [34]:
#!head -2 working/data/training/paper_test.ns.pages.p5.jsonl
#re-read the original data, for the annotated evidences
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_dev_predicted'
working_dir = 'working/data/'
k = 5
#working/data/training/paper_test_predicted_pipeline.ps.pages.p5.jsonl
test_data_file = working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split, k)
predicted_results = jlr.read(test_data_file)

100%|██████████| 5696/5696 [00:00<00:00, 138252.69it/s]


In [35]:
predicted_results[:2]

[{'true_label': '0',
  'predicted_label': '0',
  'orig': [[[108548, None, 'Colin_Kaepernick', -1]]],
  'pred': [[108548, None, 'Colin_Kaepernick', []]]},
 {'true_label': '0',
  'predicted_label': '0',
  'orig': [[[227768, None, 'Swinton_-LRB-surname-RRB-', -1]]],
  'pred': [[227768, None, 'Swinton_-LRB-surname-RRB-', []]]}]

In [36]:
len(predicted_results)

5696

In [38]:
strictly_correct = 0
correct = 0
cnt = 0
for d in tqdm(predicted_results):
    true_label = d['true_label']
    predicted_label = d['predicted_label']
    true_evidence = d['orig']
    predicted_evidence = d['pred']
    te = {}
    pe = {}
    #is correct?
    if (true_label == predicted_label):
        correct += 1
        # is strictly correct?
        if (true_label != '0') and (true_label==predicted_label):
            for eg in true_evidence:
                for e in eg:
                    if e[2] in te:
                        te[e[2]].append(e[3])
                    else:
                        te[e[2]]= [e[3]]    

            for e in predicted_evidence:
                if e[2] in pe:
                    pe[e[2]].append(e[3])
                else:
                    pe[e[2]]= [e[3]]

            # for each annotated evidence, see if we predicted the evidences
            # did we correctly predict all pages?
            all_pages = all([k1 in pe.keys() for k1 in te.keys()])
            if all_pages:
                #for the pages we predicted, did we predict all the sentences?
                for k in te.keys():
                    if k in pe: # the page is predicted
                        true_sents = np.unique(te[k])
                        pre_sents = np.unique(pe[k][0])
                        #if all the true sentences were predicted
                        match = all([actual_sent in pre_sents for actual_sent in true_sents])
                        #if match and (len(true_sents) == len(pre_sents)):
                        #we are predicting 5 lines per page, so the count may not match with the true evidence lines
                        if match:
                            strictly_correct += 1
        elif (true_label == '0') and (true_label == predicted_label): # not enough info
                strictly_correct += 1
noevscore = np.round(correct/len(predicted_results)*100,2)
score = np.round(strictly_correct/len(predicted_results)*100,2)
print("noevscore={}, score={}".format(noevscore, score))

100%|██████████| 5696/5696 [00:00<00:00, 177515.57it/s]

noevscore=56.48, score=35.55





#### From original FEVER paper
The classification
accuracy is <b>32.57%</b>. Ignoring the requirement for
correct evidence (NoScoreEv) the accuracy is
<b>52.09%</b>.