In [99]:
import copy
import json
import time

import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification, AlbertForSequenceClassification, AdamW

from fever.scorer import fever_score
from prettytable import PrettyTable

# doc retrieval

In [107]:
# Load the training data: one JSON object per line, each holding a claim, its
# label, and the candidate sentences taken from the wiki articles.
# Data preprocessed by UKPLab: https://github.com/UKPLab/fever-2018-team-athene

# Read as UTF-8 explicitly: the sentences contain non-ASCII text (e.g. IPA
# transcriptions), so relying on the platform default encoding can fail.
all_train_list = []
with open('data/all_train.json', 'r', encoding='utf-8') as all_train:
    for example in all_train:
        example = example.strip()
        if not example:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        all_train_list.append(json.loads(example))

In [110]:
#all_train_list[0]

In [4]:
# Load the dev data: one JSON object per line.
# The file is opened as UTF-8 explicitly rather than with the platform default
# encoding, and blank lines are skipped instead of crashing json.loads.
with open('data/all_dev.json', 'r', encoding='utf-8') as all_dev:
    all_dev_list = [json.loads(line) for line in all_dev if line.strip()]

In [111]:
#all_dev_list[0]

In [6]:
# Tiny slices (first 5 training claims, first dev claim) used to smoke-test the
# pipeline quickly; slicing copies the list but shares the example dicts.
all_train_mini = all_train_list[:5]
all_dev_mini = all_dev_list[:1]

## sentence retrieval

### prepare the data

In [7]:
# Dataset switch for the sentence-retrieval stage.
# Uncomment these two lines (and comment out the mini ones) for a full run:
#train_list = all_train_list
#dev_list = all_dev_list

# Default: the mini slices, so every cell below runs quickly.
train_list = all_train_mini
dev_list = all_dev_mini

In [8]:
# Peek at one candidate record of the first training claim.
# The cells below read index 2 as the sentence text and index 3 as the
# is-evidence flag; indices 0 and 1 look like page title and sentence number.
train_list[0]['evidence'][9]

['Nikolaj_Coster-Waldau',
 1,
 'He graduated from Danish National School of Theatre in Copenhagen in 1993 .',
 0]

In [9]:
# One row per (candidate sentence, claim) pair across all training claims:
# (candidate sentence text, claim text, binary label), where the label is 0
# when the candidate's flag at index 3 equals 0 and 1 otherwise.
train_pairs = [
    (candidate[2], example['claim'], 0 if candidate[3] == 0 else 1)
    for example in train_list
    for candidate in example['evidence']
]

sent1_list_train = [sentence for sentence, _, _ in train_pairs]  # sent1: evidence candidate
sent2_list_train = [claim for _, claim, _ in train_pairs]        # sent2: claim
label_list_train = [flag for _, _, flag in train_pairs]          # label: 1 if is evidence, else 0

In [112]:
#sent1_list_train

In [113]:
#sent2_list_train

In [114]:
#label_list_train

In [13]:
# Same flattening as for the training split, applied to the dev claims:
# every candidate sentence is paired with its claim and a 0/1 evidence label.
dev_pairs = [
    (candidate[2], example['claim'], 0 if candidate[3] == 0 else 1)
    for example in dev_list
    for candidate in example['evidence']
]

sent1_list_dev = [sentence for sentence, _, _ in dev_pairs]  # sent1: evidence candidate
sent2_list_dev = [claim for _, claim, _ in dev_pairs]        # sent2: claim
label_list_dev = [flag for _, _, flag in dev_pairs]          # label: 1 if is evidence, else 0

In [None]:
#sent2_list_dev

In [14]:
# Tokenizer for the sentence-retrieval model.
# model_name is reused below when loading the classifier weights, so the
# tokenizer and the model always come from the same checkpoint.
model_name = 'bert-base-uncased'
# Alternative checkpoint (also switch the model class in the training section):
#model_name = 'albert-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
# Encode each (candidate sentence, claim) pair as one two-segment input and
# return PyTorch tensors. padding/truncation give every row the same length
# (presumably padded to the longest pair and cut at the model's maximum
# length — confirm against the tokenizer documentation).
train_encodings = tokenizer(sent1_list_train, sent2_list_train, padding=True, truncation=True, return_tensors="pt")
dev_encodings = tokenizer(sent1_list_dev, sent2_list_dev, padding=True, truncation=True, return_tensors="pt")

In [16]:
class FeverDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to a container
            indexable by example position. Values may be tensors (tokenizer
            called with ``return_tensors="pt"``) or nested Python lists.
        labels: sequence with one label per example; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` works but emits a UserWarning on every
        item; ``clone().detach()`` is the recommended copy for tensors. Anything
        else (lists, numbers) still goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus ``'labels'``."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

In [17]:
# Wrap the sentence-retrieval encodings and their 0/1 evidence labels.
train_dataset = FeverDataset(train_encodings, label_list_train)
dev_dataset = FeverDataset(dev_encodings, label_list_dev)

In [18]:
# Sanity check: display one encoded training item (a dict of tensors).
train_dataset[13]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'input_ids': tensor([  101,  2002,  2059,  2209,  6317,  2198,  7598,  1999,  1996,  2460,
          2973,  4419,  2547,  2186,  2047,  7598,  1048, 15185,  2263, 25269,
          2497,  1010,  2004,  2092,  2004,  6037,  2004,  3581, 12694,  1999,
          1996,  2268,  4419,  2547,  2143,  7484,  3012,  1010,  2761,  3832,
          2004,  1037,  4405,  1012,   102, 24794,  3501,  3465,  2121,  1011,
         24547,  2850,  2226,  2499,  2007,  1996,  4419,  5062,  2194,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

### train

In [19]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Two output classes: candidate is evidence (1) or not (0).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Alternative architecture (pair with model_name = 'albert-base-v2'):
#model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Last expression of the cell, so the module repr is displayed as output.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [20]:
# Put the model in training mode before fine-tuning; the call returns the
# module, which the notebook displays.
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [21]:
# Shuffled mini-batches of 32 (sentence, claim) pairs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# NOTE(review): AdamW is imported from transformers here; later transformers
# releases reportedly deprecate it in favour of torch.optim.AdamW — confirm
# against the installed version before upgrading.
optim = AdamW(model.parameters(), lr=2e-5)

In [22]:
# Fine-tune the sentence-retrieval classifier and report wall-clock time.
start = time.time()
print('start training...................................................')

num_epochs = 1
for epoch in range(num_epochs):
    print("epoch")
    for batch in train_loader:
        print("batch")
        # Move every tensor of the batch to the training device in one pass.
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}

        optim.zero_grad()
        outputs = model(
            on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
            token_type_ids=on_device['token_type_ids'],
            labels=on_device['labels'],
        )
        # With labels supplied, the first element of the output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

end = time.time()
time_to_train = end - start
print('training time: ', time_to_train, 'seconds')

start training...................................................
epoch
batch


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
training time:  304.0648319721222 seconds


### predict the top 5 sentences for each claim

In [23]:
# Switch to evaluation mode before scoring (the model contains dropout layers,
# which should be inactive at inference); the returned module is displayed.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [24]:
# Score the training pairs themselves: these are the same encodings the
# classifier was just fine-tuned on.
inputs = train_encodings

In [25]:
# Inference only: disable autograd so no graph is kept for the forward pass,
# and move the encodings to the device the model lives on (the tokenizer
# output stays on the CPU, which would fail against a CUDA model).
with torch.no_grad():
    outputs = model(**{key: val.to(device) for key, val in inputs.items()})

In [26]:
# One row of logits per (sentence, claim) pair. .cpu() makes the conversion
# work when the logits live on a GPU (a CUDA tensor cannot be turned into a
# NumPy array directly); it is a no-op for CPU tensors.
classification_logits = outputs.logits.detach().cpu().numpy()

In [115]:
#classification_logits

In [31]:
# Logit of class 1 ("is evidence") for the first pair; used below as the
# ranking score.
classification_logits[0][1]

-0.4141359

In [42]:
# Work on a deep copy. Plain assignment would only alias `train_list`, so
# writing scores into index 3 would overwrite the gold evidence flags of the
# training data (and of the lists it was sliced from), and re-running the
# label-building cell would then produce wrong labels.
retrieval_output = copy.deepcopy(train_list)

# classification_logits is flat, in the same (claim, candidate) order in which
# the pairs were built, so a single running index walks it.
n = 0
for example in retrieval_output:
    for candidate in example['evidence']:
        # Replace the flag at index 3 with the class-1 ("is evidence") logit.
        candidate[3] = classification_logits[n][1]
        n += 1

# Every logit row must have been consumed, otherwise scores and candidates are misaligned.
assert n == len(classification_logits), "number of logits does not match number of candidates"

In [49]:
# Candidate sentences (now carrying scores at index 3) of the first claim.
test = retrieval_output[0]['evidence']

In [51]:
# Five highest-scoring candidates for that claim (score is stored at index 3).
sorted(test, key=lambda t: t[3], reverse=True)[:5]

[['Nikolaj_Coster-Waldau',
  0,
  'Nikolaj Coster Waldau LRB LSB neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ RSB ; born 27 July 1970 RRB is a Danish actor , producer and screenwriter .',
  -0.3710099],
 ['Nikolaj_Coster-Waldau',
  2,
  "Coster Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch LRB 1994 RRB .",
  -0.38138622],
 ['Fox_Broadcasting_Company',
  14,
  'Fox is a member of the North American Broadcasters Association and the National Association of Broadcasters .',
  -0.3940376],
 ['Fox_Broadcasting_Company',
  0,
  'The Fox Broadcasting Company LRB often shortened to Fox and stylized as FOX RRB is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .',
  -0.40916896],
 ['Waldau_-LRB-surname-RRB-', 0, 'Waldau is a surname .', -0.41178426]]

# claim verification

In [57]:
# Load the top 5 predicted sentences for each training claim (one JSON object
# per line). Opened as UTF-8 explicitly because the sentences contain
# non-ASCII characters; blank lines are skipped instead of crashing json.loads.

sent_train_path = 'output_sent-retrieval/bert/sentences.predicted.train.jsonl'
#sent_train_path = 'output_sent-retrieval/albert/sentences.predicted.train.jsonl'

sent_train_list = list()
with open(sent_train_path, 'r', encoding='utf-8') as sent_train:
    for example in sent_train:
        if not example.strip():
            continue
        instance = json.loads(example.strip())
        sent_train_list.append(instance)

In [116]:
#sent_train_list[0]

In [60]:
# Load the top predicted sentences for each dev claim (one JSON object per
# line). Opened as UTF-8 explicitly because the sentences contain non-ASCII
# characters; blank lines are skipped instead of crashing json.loads.
sent_dev_path = 'output_sent-retrieval/bert/sentences.predicted.dev.jsonl'
#sent_dev_path = 'output_sent-retrieval/albert/sentences.predicted.dev.jsonl'

sent_dev_list = list()
with open(sent_dev_path, 'r', encoding='utf-8') as sent_dev:
    for example in sent_dev:
        if not example.strip():
            continue
        instance = json.loads(example.strip())
        sent_dev_list.append(instance)

In [61]:
# Inspect the first dev record: each entry of 'predicted_sentences' is
# [score, [page, sentence index, sentence text]].
sent_dev_list[0]

{'id': 91198,
 'verifiable': 'NOT VERIFIABLE',
 'label': 'NOT ENOUGH INFO',
 'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
 'evidence': [[[108548, None, None, None]]],
 'predicted_sentences': [[0.7208946,
   ['Colin_Kaepernick',
    6,
    "He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens ."]],
  [0.06825966,
   ['Colin_Kaepernick',
    0,
    'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .']],
  [0.06412292,
   ['Colin_Kaepernick',
    8,
    'In the following seasons , Kaepernick lost and won back his starting job , with the 49ers missing the playoffs for three years consecutively .']],
  [0.06392619,
   ['Colin_Kaepernick',
    5,
    "Kaepernick began his professional career 

In [62]:
# Tiny slices (first 5 training claims, first dev claim) for a quick
# end-to-end check of the claim-verification stage.
sent_train_mini = sent_train_list[:5]
sent_dev_mini = sent_dev_list[:1]

In [63]:
# Dataset switch for the claim-verification stage. For a full run use the
# predicted-sentence lists loaded above (the loops below read
# 'predicted_sentences', which is the field shown in those records):
#train_list = sent_train_list
#dev_list = sent_dev_list

# Default: the mini slices. Note this rebinds train_list/dev_list from the
# sentence-retrieval stage.
train_list = sent_train_mini
dev_list = sent_dev_mini

In [79]:
# Class ids used by the 3-way claim-verification classifier.
label_to_id = {"NOT ENOUGH INFO": 0, "REFUTES": 1, "SUPPORTS": 2}

sent1_list_train = []  # predicted evidence sentence
sent2_list_train = []  # claim
label_list_train = []  # claim-level verdict id, repeated for every sentence

for data in train_list:
    label = data['label']
    # Fail loudly (KeyError) on an unexpected label. Silently appending nothing
    # would leave label_list_train shorter than the sentence lists and shift
    # every later label onto the wrong pair.
    label_id = label_to_id[label]
    sent2 = data['claim']
    for evidence in data['predicted_sentences']:
        # evidence is [score, [page, sentence index, sentence text]].
        sent1 = evidence[1][2]
        sent1_list_train.append(sent1)
        sent2_list_train.append(sent2)
        label_list_train.append(label_id)


In [117]:
#sent1_list_train

In [81]:
# Class ids used by the 3-way claim-verification classifier.
label_to_id = {"NOT ENOUGH INFO": 0, "REFUTES": 1, "SUPPORTS": 2}

sent1_list_dev = []  # predicted evidence sentence
sent2_list_dev = []  # claim
label_list_dev = []  # claim-level verdict id, repeated for every sentence

for data in dev_list:
    label = data['label']
    # Fail loudly (KeyError) on an unexpected label. Silently appending nothing
    # would leave label_list_dev shorter than the sentence lists and shift
    # every later label onto the wrong pair.
    label_id = label_to_id[label]
    sent2 = data['claim']
    for evidence in data['predicted_sentences']:
        # evidence is [score, [page, sentence index, sentence text]].
        sent1 = evidence[1][2]
        sent1_list_dev.append(sent1)
        sent2_list_dev.append(sent2)
        label_list_dev.append(label_id)

In [82]:
# Display the predicted evidence sentences collected for the dev claim(s).
sent1_list_dev

["He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .",
 'Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .',
 'In the following seasons , Kaepernick lost and won back his starting job , with the 49ers missing the playoffs for three years consecutively .',
 "Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion .",
 'The National Football League -LRB- NFL -RRB- is a professional American football league consisting of 32 teams , divided equally between the National Football Conference -LRB- NFC -RRB- and the American Football Conference -LRB- AFC -RRB- .']

In [83]:
# Tokenizer for the claim-verification model (same checkpoint choice as the
# sentence-retrieval stage; repeated so this section can be run on its own).
model_name = 'bert-base-uncased'
# Alternative checkpoint (also switch the model class below):
#model_name = 'albert-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [84]:
# Encode each (predicted sentence, claim) pair as one two-segment input,
# returned as PyTorch tensors. This rebinds train_encodings/dev_encodings
# from the sentence-retrieval stage.
train_encodings = tokenizer(sent1_list_train, sent2_list_train, padding=True, truncation=True, return_tensors="pt")
dev_encodings = tokenizer(sent1_list_dev, sent2_list_dev, padding=True, truncation=True, return_tensors="pt")

### train

In [85]:
# Wrap the claim-verification encodings with their 3-class verdict labels.
train_dataset = FeverDataset(train_encodings, label_list_train)
dev_dataset = FeverDataset(dev_encodings, label_list_dev)

In [86]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Fresh classifier with three output classes:
# 0 = NOT ENOUGH INFO, 1 = REFUTES, 2 = SUPPORTS.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Alternative architecture (pair with model_name = 'albert-base-v2'):
#model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Last expression of the cell, so the module repr is displayed as output.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [87]:
# Put the claim-verification model in training mode; the returned module is
# displayed by the notebook.
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [88]:
# Shuffled mini-batches of 32 (sentence, claim) pairs for claim verification.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# New optimizer bound to the parameters of the 3-class model created above.
optim = AdamW(model.parameters(), lr=2e-5)

In [89]:
# Fine-tune the claim-verification classifier and report wall-clock time.
start = time.time()
print('start training...................................................')

for epoch in range(1):
    print("epoch")
    for batch in train_loader:
        print("batch")
        optim.zero_grad()

        # Transfer the whole batch to the training device, then unpack it.
        moved = {field: values.to(device) for field, values in batch.items()}
        input_ids = moved['input_ids']
        attention_mask = moved['attention_mask']
        token_type_ids = moved['token_type_ids']
        labels = moved['labels']

        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        # With labels supplied, the first element of the output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

end = time.time()
time_to_train = end - start
print('training time: ', time_to_train, 'seconds')

start training...................................................
epoch
batch


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


training time:  26.732743978500366 seconds


In [90]:
# Switch to evaluation mode before predicting on the dev pairs; the returned
# module is displayed by the notebook.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [91]:
# Predict on the encoded dev (sentence, claim) pairs.
inputs = dev_encodings

In [92]:
# Inference only: disable autograd so no graph is kept for the forward pass,
# and move the encodings to the device the model lives on (the tokenizer
# output stays on the CPU, which would fail against a CUDA model).
with torch.no_grad():
    outputs = model(**{key: val.to(device) for key, val in inputs.items()})

In [93]:
# One row of three class logits per dev (sentence, claim) pair. .cpu() makes
# the NumPy conversion work when the logits live on a GPU; it is a no-op for
# CPU tensors.
dev_classification_logits = outputs.logits.detach().cpu().numpy()
dev_classification_logits

array([[-0.80661833,  0.2533857 ,  0.37073523],
       [-0.8048866 ,  0.2983208 ,  0.39062566],
       [-0.8130149 ,  0.281491  ,  0.38211703],
       [-0.8102648 ,  0.28125244,  0.3876003 ],
       [-0.60681576,  0.16499577,  0.2691391 ]], dtype=float32)

In [95]:
# Highest-scoring class id for each row, i.e. one prediction per
# (sentence, claim) pair, not one per claim
# (0 = NOT ENOUGH INFO, 1 = REFUTES, 2 = SUPPORTS).
predictions = np.argmax(dev_classification_logits, axis=1)
predictions

array([2, 2, 2, 2, 2])

# compare the dev results

In [96]:
# Reference ("actual") dev records, one JSON object per line.
# Opened as UTF-8 explicitly instead of relying on the platform default
# encoding; blank lines are skipped and the unused enumerate() counter is gone.
with open('actual_output/shared_task_dev.jsonl', encoding='utf-8') as output:
    actual_output = [json.loads(line) for line in output if line.strip()]

In [97]:
# BERT pipeline predictions for the dev set, one JSON object per line.
# Opened as UTF-8 explicitly instead of relying on the platform default
# encoding; blank lines are skipped and the unused enumerate() counter is gone.
with open('output_claim-verification/bert/claims.predicted.dev.jsonl', encoding='utf-8') as output:
    dev_bert = [json.loads(line) for line in output if line.strip()]

In [98]:
# ALBERT pipeline predictions for the dev set, one JSON object per line.
# Opened as UTF-8 explicitly instead of relying on the platform default
# encoding; blank lines are skipped and the unused enumerate() counter is gone.
with open('output_claim-verification/albert/claims.predicted.dev.jsonl', encoding='utf-8') as output:
    dev_albert = [json.loads(line) for line in output if line.strip()]

In [100]:
# Results table: one row per model, one column per metric returned by fever_score.
tab = PrettyTable()
tab.field_names = ["Model Name", "FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall", "Evidence F1"]

In [101]:
# Score the BERT predictions against the reference records; the five returned
# values are reported below as FEVER score, label accuracy, evidence
# precision, evidence recall and evidence F1.
score,acc,precision,recall,f1 = fever_score(dev_bert, actual_output)

In [102]:
# Add the BERT row, rounding every metric to 4 decimal places.
bert_metrics = (score, acc, precision, recall, f1)
tab.add_row(('BERT', *(round(metric, 4) for metric in bert_metrics)))

In [103]:
# Score the ALBERT predictions against the same reference records; this
# overwrites the metric variables that held the BERT results.
score,acc,precision,recall,f1 = fever_score(dev_albert, actual_output)

In [104]:
# Add the ALBERT row, rounding every metric to 4 decimal places.
albert_metrics = (score, acc, precision, recall, f1)
tab.add_row(('ALBERT', *(round(metric, 4) for metric in albert_metrics)))

In [105]:
# Render the comparison table as plain text.
print(tab)

+------------+-------------+----------------+--------------------+-----------------+-------------+
| Model Name | FEVER Score | Label Accuracy | Evidence Precision | Evidence Recall | Evidence F1 |
+------------+-------------+----------------+--------------------+-----------------+-------------+
|    BERT    |    0.6898   |     0.7407     |       0.8931       |      0.7045     |    0.7877   |
|   ALBERT   |    0.6054   |     0.6564     |       0.859        |      0.6072     |    0.7115   |
+------------+-------------+----------------+--------------------+-----------------+-------------+
