In [1]:
import json
import numpy as np
from transformers import AutoTokenizer, BertForSequenceClassification, AlbertForSequenceClassification, AdamW

import torch
from torch.utils.data import DataLoader

from fever.scorer import fever_score
from prettytable import PrettyTable
from collections import Counter

import time

# doc retrieval

In [2]:
#load training data with claim, label, and predicted sentences from the wiki articles  
# data preprocessed by UKPLab: https://github.com/UKPLab/fever-2018-team-athene

all_train_list = []
with open('data/all_train.json', 'r') as all_train:
    for example in all_train:
        instance = json.loads(example.strip())
        all_train_list.append(instance)

In [6]:
all_train_list[8880]

{'id': 111134,
 'claim': 'Chinese people are people associated with Ireland.',
 'evidence': [['People',
   0,
   'A people is a plurality of persons considered as a whole , as is the case with an ethnic group or nation .',
   0],
  ['People',
   1,
   'Collectively , for example , the contemporary Frisians and Danes are two related Germanic peoples , while various Middle Eastern ethnic groups are often linguistically categorized as Semitic peoples .',
   0],
  ['Ireland',
   0,
   'Ireland LRB LSB ˈaɪərlənd RSB Éire LSB ˈeːɾʲə RSB ; Ulster Scots : Airlann LSB scoˈɑːrlən RSB RRB is an island in the North Atlantic .',
   0],
  ['Ireland',
   1,
   "It is separated from Great Britain to its east by the North Channel , the Irish Sea , and St George 's Channel .",
   0],
  ['Ireland',
   2,
   'Ireland is the second largest island of the British Isles , the third largest in Europe , and the twentieth largest on Earth .',
   0],
  ['Ireland',
   5,
   'Politically , Ireland is divided betwee

In [7]:
all_dev_list = []
with open('data/all_dev.json', 'r') as all_dev:
    for example in all_dev:
        instance = json.loads(example.strip())
        all_dev_list.append(instance)

In [25]:
len(all_dev_list)

19998

In [43]:
all_train_mini = all_train_list[:5]
all_dev_mini = all_dev_list[:2]

In [45]:
len(all_dev_mini[1]['evidence'])

29

## sentence retrieval

### prep the data

In [53]:
#train_list = all_train_list
#dev_list = all_dev_list

train_list = all_train_mini
dev_list = all_dev_mini

In [10]:
train_list[0]
#train_list[0]['evidence'][9]

{'id': 75397,
 'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'evidence': [['Waldau', 0, 'Waldau may refer to :', 0],
  ['Waldau', 3, 'Waldau LRB surname RRB', 0],
  ['Waldau',
   5,
   'Waldau LRB Bern RRB , a psychiatric clinic in Berne , Switzerland',
   0],
  ['Waldau',
   7,
   'Waldau LRB Burgenlandkreis RRB , a village and a former municipality in Saxony Anhalt , Germany',
   0],
  ['Waldau',
   9,
   'Waldau LRB Kassel RRB , in Northern Hesse , near the border with Lower Saxony and Thuringia , Germany',
   0],
  ['Waldau',
   11,
   'Waldau LRB Titisee Neustadt RRB , a village in Baden Württemberg , Germany',
   0],
  ['Waldau',
   13,
   'Waldau LRB Victoria RRB , a suburb of Melbourne , Australia',
   0],
  ['Waldau',
   15,
   'Waldau LRB Vojvodina RRB , the German name for the village of Sonta , Serbia',
   0],
  ['Nikolaj_Coster-Waldau',
   0,
   'Nikolaj Coster Waldau LRB LSB neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ RSB ; born 27 July 1970 RRB is a Danish

In [11]:
sent1_list_train = [] #sent1: evidence candidate
sent2_list_train = [] #sent2: claim
label_list_train = [] #label: 1 if is evidence, else 0

for data in train_list:
    sent2 = data['claim']
    for evidence in data['evidence']:
        sent1 = evidence[2]
        sent1_list_train.append(sent1)
        sent2_list_train.append(sent2)
        label = evidence[3]
        if label == 0:
            label_list_train.append(0)
        else:
            label_list_train.append(1)

In [None]:
#sent1_list_train

In [None]:
#sent2_list_train

In [None]:
#label_list_train

In [12]:
sent1_list_dev = [] #sent1: evidence candidate
sent2_list_dev = [] #sent2: claim
label_list_dev = [] #label: 1 if is evidence, else 0

for data in dev_list:
    sent2 = data['claim']
    for evidence in data['evidence']:
        sent1 = evidence[2]
        sent1_list_dev.append(sent1)
        sent2_list_dev.append(sent2)
        label = evidence[3]
        if label == 0:
            label_list_dev.append(0)
        else:
            label_list_dev.append(1)

In [None]:
#sent2_list_dev

In [13]:
#tokenizer 
model_name = 'bert-base-uncased'
#model_name = 'albert-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [54]:
train_encodings = tokenizer(sent1_list_train, sent2_list_train, padding=True, truncation=True, return_tensors="pt")
dev_encodings = tokenizer(sent1_list_dev, sent2_list_dev, padding=True, truncation=True, return_tensors="pt")

In [15]:
class FeverDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [16]:
train_dataset = FeverDataset(train_encodings, label_list_train)
dev_dataset = FeverDataset(dev_encodings, label_list_dev)

In [17]:
train_dataset[13]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'input_ids': tensor([  101,  2002,  2059,  2209,  6317,  2198,  7598,  1999,  1996,  2460,
          2973,  4419,  2547,  2186,  2047,  7598,  1048, 15185,  2263, 25269,
          2497,  1010,  2004,  2092,  2004,  6037,  2004,  3581, 12694,  1999,
          1996,  2268,  4419,  2547,  2143,  7484,  3012,  1010,  2761,  3832,
          2004,  1037,  4405,  1012,   102, 24794,  3501,  3465,  2121,  1011,
         24547,  2850,  2226,  2499,  2007,  1996,  4419,  5062,  2194,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

### train

In [18]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
#model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
optim = AdamW(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [19]:
start = time.time()
print ('start training...................................................')

for epoch in range(1):
    print("epoch: ", epoch)
    for batch in train_loader:
        #print("batch")
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, 
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        
end = time.time()
time_to_train = end-start
print('training time: ', time_to_train, 'seconds')

start training...................................................
epoch:  0


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


training time:  247.60690212249756 seconds


### pred the top 5 sentences for each claim

In [20]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [55]:
#inputs = train_encodings
inputs = dev_encodings

In [56]:
outputs = model(**inputs)

In [57]:
classification_logits = outputs.logits.detach().numpy()

In [58]:
classification_logits.shape

(28, 2)

In [None]:
pred_sent_train = []

n = 0
for claim in train_list:
    sent_top5 = {'id':claim['id'], 'label':claim['label'], "claim":claim['claim'], 'pred.sent':[]}
    sent_with_Score = []
    for i in range(len(claim['evidence'])):
        score = classification_logits[n][1]
        sent_with_Score.append([score,claim['evidence'][i]])
        n = n+1
        sent_top5['pred.sent'] = sorted(sent_with_Score, key=lambda t: t[0], reverse=True)[:5]
    pred_sent_dev.append(sent_top5)

In [None]:
pred_sent_dev = []

n = 0
for claim in dev_list:
    sent_top5 = {'id':claim['id'], 'label':claim['label'], "claim":claim['claim'], 'pred.sent':[]}
    sent_with_Score = []
    for i in range(len(claim['evidence'])):
        score = classification_logits[n][1]
        sent_with_Score.append([score,claim['evidence'][i]])
        n = n+1
        sent_top5['pred.sent'] = sorted(sent_with_Score, key=lambda t: t[0], reverse=True)[:5]
    pred_sent_dev.append(sent_top5)

In [52]:
pred_sent_dev

[{'id': 91198,
  'label': 91198,
  'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
  'pred.sent': [[0.009892965,
    ['National_League_-LRB-English_football-RRB-',
     2,
     'Between 1986 and 2015 , the league was known as the Football Conference .',
     0]],
   [-0.0075979084,
    ['National_League_-LRB-English_football-RRB-',
     9,
     'The National League North and National League South form the sixth tier of English football .',
     0]],
   [-0.019916832,
    ['National_League_-LRB-English_football-RRB-',
     7,
     'The professional clubs are usually clubs which have been in the English Football League LRB EFL RRB in the past , as opposed to those who have always been non League .',
     0]],
   [-0.022101209,
    ['National_League_-LRB-English_football-RRB-',
     3,
     'As part of a sponsorship deal with car leasing company Vanarama , the league is known as the Vanarama National League .',
     

In [61]:
#with open('output_sent-retrieval/bert/sentences.predicted.train.jsonl', 'w') as fp:
#with open('output_sent-retrieval/albert/sentences.predicted.train.jsonl', 'w') as fp:
with open('output_sent-retrieval/bert/sentences.predicted.dev.jsonl', 'w') as fp:
#with open('output_sent-retrieval/albert/sentences.predicted.dev.jsonl', 'w') as fp:
    
    #json.dump(str(pred_sent_train), fp)
    json.dump(str(pred_sent_dev), fp)

# claim verification

In [63]:
# load the top 5 sent for each claim

sent_train_list = list()
with open('output_sent-retrieval/bert/sentences.predicted.train.jsonl', 'r') as sent_train:
#with open('output_sent-retrieval/albert/sentences.predicted.train.jsonl', 'r') as sent_train:

    for example in sent_train:
        instance = json.loads(example.strip())
        sent_train_list.append(instance)

In [None]:
#sent_train_list[0]

In [64]:
sent_dev_list = list()
with open('output_sent-retrieval/bert/sentences.predicted.dev.jsonl', 'r') as sent:
#with open('output_sent-retrieval/albert/sentences.predicted.dev.jsonl', 'r') as sent:

    for example in sent:
        instance = json.loads(example.strip())
        sent_dev_list.append(instance)

In [65]:
sent_dev_list[0]

{'id': 91198,
 'verifiable': 'NOT VERIFIABLE',
 'label': 'NOT ENOUGH INFO',
 'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
 'evidence': [[[108548, None, None, None]]],
 'predicted_sentences': [[0.7208946,
   ['Colin_Kaepernick',
    6,
    "He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens ."]],
  [0.06825966,
   ['Colin_Kaepernick',
    0,
    'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .']],
  [0.06412292,
   ['Colin_Kaepernick',
    8,
    'In the following seasons , Kaepernick lost and won back his starting job , with the 49ers missing the playoffs for three years consecutively .']],
  [0.06392619,
   ['Colin_Kaepernick',
    5,
    "Kaepernick began his professional career 

In [66]:
sent_train_mini = sent_train_list[:5]
sent_dev_mini = sent_dev_list[:1]

In [67]:
#train_list = all_train_list
#dev_list = all_dev_list

train_list = sent_train_mini
dev_list = sent_dev_mini

In [68]:
sent1_list_train = []
sent2_list_train = []
label_list_train = []

for data in train_list:
    label = data['label']
    sent2 = data['claim']
    for evidence in data['predicted_sentences']:
        sent1 = evidence[1][2]
        sent1_list_train.append(sent1)
        sent2_list_train.append(sent2)
        if label == "NOT ENOUGH INFO" :
            label_list_train.append(0)
        if label == "REFUTES":
            label_list_train.append(1)
        if label == "SUPPORTS":
            label_list_train.append(2)


In [69]:
#sent1_list_train

In [70]:
sent1_list_dev = []
sent2_list_dev = []
label_list_dev = []

for data in dev_list:
    label = data['label']
    sent2 = data['claim']
    for evidence in data['predicted_sentences']:
        sent1 = evidence[1][2]
        sent1_list_dev.append(sent1)
        sent2_list_dev.append(sent2)
        if label == "NOT ENOUGH INFO" :
            label_list_dev.append(0)
        if label == "REFUTES":
            label_list_dev.append(1)
        if label == "SUPPORTS":
            label_list_dev.append(2)

In [71]:
sent1_list_dev

["He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .",
 'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .',
 'In the following seasons , Kaepernick lost and won back his starting job , with the 49ers missing the playoffs for three years consecutively .',
 "Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion .",
 'The National Football League -LRB- NFL -RRB- is a professional American football league consisting of 32 teams , divided equally between the National Football Conference -LRB- NFC -RRB- and the American Football Conference -LRB- AFC -RRB- .']

In [72]:
#tokenizer 
model_name = 'bert-base-uncased'
#model_name = 'albert-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [73]:
train_encodings = tokenizer(sent1_list_train, sent2_list_train, padding=True, truncation=True, return_tensors="pt")
dev_encodings = tokenizer(sent1_list_dev, sent2_list_dev, padding=True, truncation=True, return_tensors="pt")

### train

In [74]:
train_dataset = FeverDataset(train_encodings, label_list_train)
dev_dataset = FeverDataset(dev_encodings, label_list_dev)

In [76]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
#model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.to(device)
model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [77]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

optim = AdamW(model.parameters(), lr=2e-5)

In [78]:
start = time.time()
print ('start training...................................................')

for epoch in range(1):
    print("epoch")
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, 
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        
end = time.time()
time_to_train = end-start
print('training time: ', time_to_train, 'seconds')

start training...................................................
epoch


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


training time:  19.757014751434326 seconds


In [79]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [80]:
#inputs = train_encodings
inputs = dev_encodings

In [81]:
outputs = model(**inputs)

In [82]:
#train_classification_logits = outputs.logits.detach().numpy()
dev_classification_logits = outputs.logits.detach().numpy()

#train_classification_logits
dev_classification_logits

array([[ 0.50237155, -0.06594548,  0.03114632],
       [ 0.5220732 , -0.03692114,  0.04443705],
       [ 0.5190381 , -0.04177365,  0.03736544],
       [ 0.52316904, -0.04150811,  0.04119575],
       [ 0.1020293 , -0.24614841, -0.22621267]], dtype=float32)

In [83]:
#predictions = np.argmax(train_classification_logits, axis=1)
predictions = np.argmax(dev_classification_logits, axis=1)

predictions

array([0, 0, 0, 0, 0])

In [84]:
#train_list
dev_list

[{'id': 91198,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
  'evidence': [[[108548, None, None, None]]],
  'predicted_sentences': [[0.7208946,
    ['Colin_Kaepernick',
     6,
     "He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens ."]],
   [0.06825966,
    ['Colin_Kaepernick',
     0,
     'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .']],
   [0.06412292,
    ['Colin_Kaepernick',
     8,
     'In the following seasons , Kaepernick lost and won back his starting job , with the 49ers missing the playoffs for three years consecutively .']],
   [0.06392619,
    ['Colin_Kaepernick',
     5,
     "Kaepernick began his

In [None]:
claim_veri_train = []

n = 0
for claim in train_list:
    claim_veri_pred = {'id':claim['id'], 'pred_label':'REFUTES', "claim":claim['claim']}
    
    for i in range(len(claim['predicted_sentences'])):
        top5_predictions = []
        if predictions[n] == 0:
            pred_label = "NOT ENOUGH INFO"
        if predictions[n] == 1:
            pred_label = "REFUTES"
        if predictions[n] == 2:
            pred_label = "SUPPORTS"
        
        top5_predictions.append(pred_label)
        counter =  Counter(top5_predictions)
        pred_label = counter.most_common(1)[0][0] 
        
        claim_veri_pred['pred_label'] = pred_label
        
        n = n+1
        
    claim_veri_train.append(claim_veri_pred)

In [102]:
claim_veri_dev = []

n = 0
for claim in dev_list:
    claim_veri_pred = {'id':claim['id'], 'pred_label':'REFUTES', "claim":claim['claim']}
    
    for i in range(len(claim['predicted_sentences'])):
        top5_predictions = []
        if predictions[n] == 0:
            pred_label = "NOT ENOUGH INFO"
        if predictions[n] == 1:
            pred_label = "REFUTES"
        if predictions[n] == 2:
            pred_label = "SUPPORTS"
        
        top5_predictions.append(pred_label)
        counter =  Counter(top5_predictions)
        pred_label = counter.most_common(1)[0][0] 
        
        claim_veri_pred['pred_label'] = pred_label
        
        n = n+1
        
    claim_veri_dev.append(claim_veri_pred)


In [103]:
#claim_veri_train
claim_veri_dev

[{'id': 91198,
  'pred_label': 'NOT ENOUGH INFO',
  'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.'}]

In [104]:
#with open('output_claim-verification/bert/claims.predicted.train.jsonl.jsonl', 'w') as fp:
#with open('output_claim-verification/albert/claims.predicted.train.jsonl.jsonl', 'w') as fp:
with open('output_claim-verification/bert/claims.predicted.dev.jsonl.jsonl', 'w') as fp:
#with open('output_claim-verification/albert/claims.predicted.dev.jsonl.jsonl', 'w') as fp:

    #json.dump(str(claim_veri_train), fp)
    json.dump(str(claim_veri_dev), fp)

# compare the dev results

In [105]:
actual_output = []

with open('actual_output/shared_task_dev.jsonl') as output:
    
    for step, line in enumerate(output):
        instance = json.loads(line.strip())
        actual_output.append(instance)

In [106]:
dev_bert = []

with open('output_claim-verification/bert/claims.predicted.dev.jsonl') as output:
    
    for step, line in enumerate(output):
        instance = json.loads(line.strip())
        dev_bert.append(instance)

In [107]:
dev_albert = []

with open('output_claim-verification/albert/claims.predicted.dev.jsonl') as output:
    
    for step, line in enumerate(output):
        instance = json.loads(line.strip())
        dev_albert.append(instance)

In [108]:
tab = PrettyTable()
tab.field_names = ["Model Name", "FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall", "Evidence F1"]

In [109]:
score,acc,precision,recall,f1 = fever_score(dev_bert, actual_output)

In [110]:
tab.add_row(('BERT', round(score,4),round(acc,4),round(precision,4),round(recall,4),round(f1,4)))

In [111]:
score,acc,precision,recall,f1 = fever_score(dev_albert, actual_output)

In [112]:
tab.add_row(('ALBERT', round(score,4),round(acc,4),round(precision,4),round(recall,4),round(f1,4)))

In [113]:
print(tab)

+------------+-------------+----------------+--------------------+-----------------+-------------+
| Model Name | FEVER Score | Label Accuracy | Evidence Precision | Evidence Recall | Evidence F1 |
+------------+-------------+----------------+--------------------+-----------------+-------------+
|    BERT    |    0.6898   |     0.7407     |       0.8931       |      0.7045     |    0.7877   |
|   ALBERT   |    0.6054   |     0.6564     |       0.859        |      0.6072     |    0.7115   |
+------------+-------------+----------------+--------------------+-----------------+-------------+
