In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### About
- Train a BERT model on the FEVER dataset to beat the paper's RTE accuracy of 88% (DA) and 73.81% (MLP).

#### References
- Hugging Face text classification [tutorial](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)


## Load Data

In [2]:
import pandas as pd
import json
import csv
import os

In [3]:
# Root of the dated data snapshot; every dataset directory below hangs off it.
root = '../data_2023_06_02'

# One input directory per dataset, all under the snapshot's preprocessed area.
climate_in, pubhealth_in, fever_in = (
    os.path.join(root, relative_dir)
    for relative_dir in (
        'preprocessed/CLIMATE-FEVER',
        'preprocessed/PUBHEALTH',
        'preprocessed/FEVER',
    )
)

In [4]:
def read_json(fp):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed as one JSON document.

    Args:
        fp: Path of the UTF-8 encoded file to read.

    Returns:
        list: The parsed objects, in file order (empty for an empty file).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(fp, "r", encoding="utf-8") as f:
        # Iterate the handle directly so the raw lines are never held in
        # memory as a second full-size list.
        for line in f:
            line = line.strip()
            # A blank line (for example an extra newline at the end of the
            # file) is not a record; json.loads("") would raise on it.
            if not line:
                continue
            data.append(json.loads(line))
    return data

#### Load Fever Data

In [5]:
# Read the three preprocessed FEVER splits in test / dev / train order.
fever_test_ds, fever_dev_ds, fever_train_ds = (
    read_json(os.path.join(fever_in, file_name))
    for file_name in (
        'test_preprocessed.ns.rand.jsonl',
        'dev_preprocessed.ns.rand.jsonl',
        'train_preprocessed.ns.rand.jsonl',
    )
)

# One record count per line, in the same order as the splits were read.
print(len(fever_test_ds), len(fever_dev_ds), len(fever_train_ds), sep="\n")

9999
9999
145449


#### Load PubHealth Data

In [19]:
# Read the PUBHEALTH splits in train / dev / test order.
pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds = (
    read_json(os.path.join(pubhealth_in, file_name))
    for file_name in ('train.jsonl', 'dev.jsonl', 'test.jsonl')
)

# One record count per line, in the same order as the splits were read.
print(len(pubhealth_train_ds), len(pubhealth_dev_ds), len(pubhealth_test_ds), sep="\n")

9806
1217
1235


#### Load Climate Fever

In [7]:
# CLIMATE-FEVER is read from a single file; no train/dev/test files are loaded
# for it in this notebook.
climate_ds = read_json(os.path.join(climate_in, 'climate-fever.jsonl'))

In [8]:
# Sanity check: number of CLIMATE-FEVER records that were read.
print(len(climate_ds))

1381


#### Process data

In [9]:
def process_fever(sample):
    """Reduce a FEVER record to its claim, label and evidence fields.

    The strings in ``sample['evidence_text']`` are joined with single spaces
    to form the ``evidence`` value; ``claim`` and ``label`` are copied as-is.
    Any other key of ``sample`` is dropped.
    """
    return {
        'claim': sample['claim'],
        'label': sample['label'],
        'evidence': " ".join(sample['evidence_text']),
    }

# Rebind every FEVER split to its processed form.
# NOTE: this cell is not re-runnable on its own output -- processed records no
# longer carry 'evidence_text', so a second run raises KeyError. Reload the
# splits before running it again.
fever_test_ds, fever_dev_ds, fever_train_ds = (
    [process_fever(sample) for sample in split]
    for split in (fever_test_ds, fever_dev_ds, fever_train_ds)
)

In [20]:
def filter_pubhealth(sample):
    """Return True when the sample's label is 'true', 'false' or 'unproven'."""
    kept_labels = ('true', 'false', 'unproven')
    return sample['label'] in kept_labels

# Keep only the samples accepted by filter_pubhealth, split by split.
pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds = (
    [sample for sample in split if filter_pubhealth(sample)]
    for split in (pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds)
)

# Remaining record counts, one per line (train, dev, test).
print(len(pubhealth_train_ds), len(pubhealth_dev_ds), len(pubhealth_test_ds), sep="\n")

def process_pubhealth(sample):
    """Convert a PUBHEALTH record to the claim / evidence / label layout.

    The strings in ``sample['top_k']`` are joined with single spaces to form
    the evidence. The label is renamed: 'true' becomes "SUPPORTS", 'false'
    becomes "REFUTES", and every other value becomes "NOT ENOUGH INFO".
    """
    claim = sample['claim']
    evidence = " ".join(sample['top_k'])

    raw_label = sample["label"]
    if raw_label == 'true':
        verdict = "SUPPORTS"
    else:
        verdict = "REFUTES" if raw_label == 'false' else "NOT ENOUGH INFO"

    return {'claim': claim, 'evidence': evidence, 'label': verdict}

# Rebind every PUBHEALTH split to its processed form.
# NOTE: not re-runnable on its own output -- processed records no longer carry
# 'top_k', so a second run raises KeyError.
pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds = (
    [process_pubhealth(sample) for sample in split]
    for split in (pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds)
)

8370
1050
1032


In [100]:
def process_climate(sample):
    """Convert a CLIMATE-FEVER record to the claim / label / evidence layout.

    ``sample['claim_label']`` is copied to ``label``, except that
    "NOT_ENOUGH_INFO" is rewritten to "NOT ENOUGH INFO". The ``evidence``
    string is the space-joined ``'evidence'`` field of every entry in
    ``sample['evidences']``.
    """
    claim = sample['claim']

    raw_label = sample['claim_label']
    label = "NOT ENOUGH INFO" if raw_label == "NOT_ENOUGH_INFO" else raw_label

    evidence = " ".join(entry['evidence'] for entry in sample['evidences'])

    return {'claim': claim, 'label': label, 'evidence': evidence}

# Rebind climate_ds to its processed form.
# NOTE: not re-runnable on its own output -- processed records no longer carry
# 'claim_label', so a second run raises KeyError.
climate_ds = [process_climate(sample) for sample in climate_ds]

#### Create dataset object

In [79]:
from datasets import Dataset, DatasetDict, ClassLabel, Value, Features

In [80]:
# Column schema shared by every split. The position of a name in the
# ClassLabel list is its integer class id: SUPPORTS=0, REFUTES=1,
# NOT ENOUGH INFO=2.
features = Features(
    {
        "claim": Value("string"),
        "evidence": Value("string"),
        "label": ClassLabel(
            names=["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"],
            num_classes=3,
        ),
    }
)

# Empty container; the splits are attached in a later cell.
ds = DatasetDict()

In [21]:
# Attach the processed FEVER splits (train, validation, test) using the shared
# schema, which encodes the string labels as class ids.
ds['train'], ds['validation'], ds['test'] = (
    Dataset.from_list(rows, features=features)
    for rows in (fever_train_ds, fever_dev_ds, fever_test_ds)
)

## Setup Hugging Face

In [14]:
# Checkpoint name used for both the tokenizer and the model in the cells below.
model_name = "bert-base-uncased"  # https://huggingface.co/bert-base-uncased
# Larger alternative (disabled):
# model_name = "bert-large-uncased"  # https://huggingface.co/bert-large-uncased

#### Tokenize data

In [15]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint named by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
def preprocess_function(samples):
    """Tokenize claim/evidence pairs with the module-level ``tokenizer``.

    ``samples['claim']`` is passed as the first sequence and
    ``samples['evidence']`` as the second. With ``truncation='only_second'``
    only the evidence side is shortened when a pair is too long, and
    ``padding=True`` pads every row to the longest row of the call.
    """
    claims = samples['claim']
    evidences = samples['evidence']
    return tokenizer(
        claims,
        evidences,
        truncation='only_second',
        padding=True,
    )

In [17]:
# Tokenize every split. batched=True hands preprocess_function a batch of rows
# per call, so its padding=True pads within each of those batches.
encoded_ds = ds.map(preprocess_function, batched=True)

                                                                     

#### Setup model

In [18]:
from transformers import AutoModelForSequenceClassification

In [19]:
# Three output classes, matching the three names of the ClassLabel feature
# (SUPPORTS, REFUTES, NOT ENOUGH INFO).
num_labels = 3 
# The warning printed below says that some weights of the classification model
# were not initialized from the checkpoint; they are learned in fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# Show the module tree of the loaded model.
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

#### Setup trainer

In [21]:
import numpy as np
import evaluate 
from transformers import TrainingArguments, Trainer

In [22]:
# Accuracy metric object from the `evaluate` library. compute_metrics reads
# this module-level name, so this line must run before any evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the arg-max over the last axis of the logits; the result is
    whatever ``metric.compute`` returns for those predictions and labels.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted_ids)

In [23]:
batch_size = 16  # per device; the library default is 8
num_epochs = 5

args = TrainingArguments(
    output_dir="../models/FEVER_BERT_V1",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint on the same schedule (once per epoch) so that
    # every evaluation has a matching checkpoint to restore from.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # metric_for_best_model is left unset (an "accuracy" setting was tried and
    # disabled), so the library default decides which checkpoint is "best".
    load_best_model_at_end=True,
)

In [24]:
# Fine-tune on the FEVER train split and score the validation split at each
# evaluation; the tokenizer is handed over so it is saved with the checkpoints.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
# Run fine-tuning. The log below reports 11365 optimization steps over
# 5 epochs, i.e. 2273 steps (one saved checkpoint) per epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, evidence. If claim, evidence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 145449
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 11365
  Number of trainable parameters = 109484547
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.141,0.142584,0.950695
2,0.0925,0.142349,0.957596
3,0.0606,0.158913,0.958696
4,0.0407,0.202998,0.959096
5,0.0232,0.213202,0.960596


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, evidence. If claim, evidence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9999
  Batch size = 64
Saving model checkpoint to ../models/FEVER_BERT_V1/checkpoint-2273
Configuration saved in ../models/FEVER_BERT_V1/checkpoint-2273/config.json
Model weights saved in ../models/FEVER_BERT_V1/checkpoint-2273/pytorch_model.bin
tokenizer config file saved in ../models/FEVER_BERT_V1/checkpoint-2273/tokenizer_config.json
Special tokens file saved in ../models/FEVER_BERT_V1/checkpoint-2273/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, evidence. If claim, evidence are not expected by `BertForSequenceClassification.forward

TrainOutput(global_step=11365, training_loss=0.07854350635882679, metrics={'train_runtime': 5030.4865, 'train_samples_per_second': 144.568, 'train_steps_per_second': 2.259, 'total_flos': 1.9134791747559936e+17, 'train_loss': 0.07854350635882679, 'epoch': 5.0})

In [27]:
# Score the FEVER validation split with the model the trainer holds after
# training. The eval_loss printed below matches the epoch-2 row of the
# training log, consistent with load_best_model_at_end=True having restored
# that checkpoint rather than the last one.
trainer.evaluate(eval_dataset = encoded_ds["validation"])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, evidence. If claim, evidence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9999
  Batch size = 64


{'eval_loss': 0.14234882593154907,
 'eval_accuracy': 0.9575957595759576,
 'eval_runtime': 26.858,
 'eval_samples_per_second': 372.291,
 'eval_steps_per_second': 5.846,
 'epoch': 5.0}

In [28]:
# Score the held-out FEVER test split with the same model.
trainer.evaluate(eval_dataset = encoded_ds["test"])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, evidence. If claim, evidence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9999
  Batch size = 64


{'eval_loss': 0.18636560440063477,
 'eval_accuracy': 0.9455945594559456,
 'eval_runtime': 26.9831,
 'eval_samples_per_second': 370.566,
 'eval_steps_per_second': 5.818,
 'epoch': 5.0}

## Inference

#### Prepare dataset

In [101]:
# Path of the checkpoint to use for inference. Step 4546 is 2 x 2273, i.e. the
# end of epoch 2, which has the lowest validation loss in the training log.
# NOTE(review): hard-coded step number -- update it if training is re-run with
# a different dataset size, batch size or device count.
best_model_checkpoint = "../models/FEVER_BERT_V1/checkpoint-4546"

In [102]:
from transformers import AutoTokenizer

# Reload the tokenizer files that were saved alongside the checkpoint; this
# rebinds the module-level `tokenizer` used by preprocess_function.
tokenizer = AutoTokenizer.from_pretrained(best_model_checkpoint)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [103]:
def preprocess_function(samples):
    """Tokenize claim/evidence pairs with the module-level ``tokenizer``.

    The claim is the first sequence and the evidence the second; only the
    evidence may be truncated (``truncation='only_second'``) and rows are
    padded to the longest row of the call (``padding=True``).
    """
    tokenizer_options = {'padding': True, 'truncation': 'only_second'}
    return tokenizer(samples['claim'], samples['evidence'], **tokenizer_options)

In [104]:
# Column schema for the evaluation data; the order of the ClassLabel names
# fixes the integer class ids (SUPPORTS=0, REFUTES=1, NOT ENOUGH INFO=2).
features = Features(
    {
        "claim": Value("string"),
        "evidence": Value("string"),
        "label": ClassLabel(
            names=["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"],
            num_classes=3,
        ),
    }
)

# Start from an empty container: this rebinds `ds`, so the FEVER splits built
# earlier are no longer reachable through it.
ds = DatasetDict()

# pubhealth (disabled -- enable these two lines instead of the climate line
# below to evaluate on PUBHEALTH)
# ds['validation'] = Dataset.from_list(pubhealth_dev_ds, features=features)
# ds['test'] = Dataset.from_list(pubhealth_test_ds, features=features)

# climate
ds['test'] = Dataset.from_list(climate_ds, features=features)

# Tokenize with the tokenizer reloaded from the checkpoint.
encoded_ds = ds.map(preprocess_function, batched=True)

                                                                 

In [27]:
from transformers import AutoModelForSequenceClassification

In [105]:
# Reload the fine-tuned classifier from the chosen checkpoint. The config dump
# below shows generic label names (LABEL_0..LABEL_2); the id-to-name mapping
# lives only in the ClassLabel feature of the dataset.
num_labels = 3 
model = AutoModelForSequenceClassification.from_pretrained(best_model_checkpoint, num_labels=num_labels)

loading configuration file ../models/FEVER_BERT_V1/checkpoint-4546/config.json
Model config BertConfig {
  "_name_or_path": "../models/FEVER_BERT_V1/checkpoint-4546",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30

In [107]:
import numpy as np
from transformers import TrainingArguments, Trainer
import evaluate 

# Same values as in the training section. No training is launched from this
# cell's Trainer (it has no train_dataset), so num_epochs only fills the
# TrainingArguments field.
batch_size = 16 #defaults to 8
num_epochs = 5

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    The accuracy is computed directly with NumPy, so this function does not
    depend on a module-level ``metric`` object having been created by an
    earlier cell of the notebook.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the arg-max over the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of rows
        whose predicted class equals the label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Keep the "accuracy" key so the Trainer output still reports
    # `eval_accuracy`.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# The Trainer is used here only as an evaluation harness, but it is configured
# with the same arguments as the training run.
args = TrainingArguments(
    output_dir="../models/FEVER_BERT_V1",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # metric_for_best_model stays unset, as in the training section.
    load_best_model_at_end=True,
)

# No train_dataset is passed: this trainer only scores the "test" split.
trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=encoded_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [108]:
# Score the FEVER-trained model on the out-of-domain split built above (the
# log below reports 1381 examples, the CLIMATE-FEVER record count). The
# reported accuracy (~0.42) is far below the FEVER test accuracy (~0.95).
trainer.evaluate(encoded_ds['test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1381
  Batch size = 64
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.541452169418335,
 'eval_accuracy': 0.4207096307023896,
 'eval_runtime': 3.731,
 'eval_samples_per_second': 370.144,
 'eval_steps_per_second': 5.897}