In [1]:
import os

from main import FeverLoader, PubhealthLoader, ClimateFeverLoader

## Load Data

In [2]:
# Root of the dated data snapshot; every dataset below is read from beneath it.
root = '../data_2023_06_02'

# Build the three input directories in one pass. Each relative sub-path is
# joined onto `root` exactly as written.
climate_in, pubhealth_in, fever_in = (
    os.path.join(root, rel_path)
    for rel_path in (
        'preprocessed/CLIMATE-FEVER',
        'preprocessed/PUBHEALTH',
        'preprocessed/FEVER',
    )
)

In [3]:
# Load the preprocessed FEVER splits; the loader's result is unpacked as
# (train, dev, test).
(fever_train_ds,
 fever_dev_ds,
 fever_test_ds) = FeverLoader.load(fever_in)

# Report split sizes one per line, in test, dev, train order.
print(len(fever_test_ds), len(fever_dev_ds), len(fever_train_ds), sep='\n')

9999
9999
145449


In [4]:
# Load the preprocessed PUBHEALTH splits; the loader's result is unpacked as
# (train, dev, test).
(pubhealth_train_ds,
 pubhealth_dev_ds,
 pubhealth_test_ds) = PubhealthLoader.load(pubhealth_in)

# Report split sizes one per line, in train, dev, test order.
print(len(pubhealth_train_ds), len(pubhealth_dev_ds), len(pubhealth_test_ds), sep='\n')

8370
1050
1032


In [5]:
# CLIMATE-FEVER is loaded as a single dataset (no train/dev/test unpacking here);
# further down it is only used for evaluation.
climate_ds = ClimateFeverLoader.load(climate_in)
# Number of CLIMATE-FEVER examples.
print(len(climate_ds))

1381


## Training

In [11]:
model_name = "bert-base-uncased" #https://huggingface.co/bert-base-uncased
# Larger alternative: uncomment to fine-tune BERT-large instead.
# model_name = "bert-large-uncased" #https://huggingface.co/bert-large-uncased

#### Set up training data
- Combine the FEVER and PUBHEALTH training splits into one training set (the two dev splits are likewise combined for validation).

In [6]:
from datasets import Dataset, DatasetDict, ClassLabel, Value, Features

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# The three verdict classes; a label's integer id is its position in this list.
label_names = ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]

# Shared schema for every split: two free-text columns plus the class label.
features = Features({
    "claim": Value("string"),
    "evidence": Value("string"),
    "label": ClassLabel(num_classes=len(label_names), names=label_names),
})

# Empty container; the individual splits are attached in a later cell.
ds = DatasetDict()

In [23]:
# Records for each split, keyed by the name the split gets inside `ds`.
# Training and validation pool FEVER with PUBHEALTH; the test sets stay separate
# so each corpus can be scored on its own.
split_records = {
    'train': fever_train_ds + pubhealth_train_ds,
    'validation': fever_dev_ds + pubhealth_dev_ds,
    'fever_test': fever_test_ds,
    'pubhealth_test': pubhealth_test_ds,
    'climate': climate_ds,
}

# Convert every record list to a Dataset using the shared schema.
for split_name, records in split_records.items():
    ds[split_name] = Dataset.from_list(records, features=features)

#### Tokenize data

In [12]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(samples):
    """Tokenize a batch of (claim, evidence) pairs as sentence pairs.

    `samples` is indexed by column name: 'claim' is passed as the first
    sequence and 'evidence' as the second. With truncation='only_second',
    only the evidence is shortened when a pair is too long; the claim is
    kept intact.
    """
    claims = samples['claim']
    evidences = samples['evidence']
    # NOTE(review): padding=True pads to the longest pair in each call, and the
    # padded ids are what get stored in the mapped dataset. Confirm this is
    # intended before changing it; it affects the shape of what is returned.
    encoded = tokenizer(claims, evidences,
                        truncation='only_second',
                        padding=True)
    return encoded

In [24]:
# Tokenize every split in `ds`. With batched=True, preprocess_function receives
# a batch of rows per call (lists of claims / evidence) rather than one row.
encoded_ds = ds.map(preprocess_function, batched=True)

                                                                     

#### Set up model

In [14]:
from transformers import AutoModelForSequenceClassification

In [15]:
# Derive the label set from the dataset schema instead of repeating the
# literal 3, so the classifier head cannot drift out of sync with `features`.
label_feature = features["label"]
num_labels = label_feature.num_classes

# Record the human-readable label names in the model config. They are saved
# with every checkpoint, so predictions made from a reloaded checkpoint can be
# mapped back to SUPPORTS / REFUTES / NOT ENOUGH INFO instead of LABEL_0..2.
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained encoder plus a freshly initialised classification head with
# `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Set up trainer

In [16]:
import numpy as np
import evaluate 
from transformers import TrainingArguments, Trainer

In [17]:
# Accuracy scorer, loaded once and reused by the callback below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn one evaluation pass into the metric dict reported by the Trainer.

    `eval_pred` unpacks into (logits, labels). The predicted class for each
    example is the index of its largest logit along the last axis, and the
    predictions are scored against the labels with the accuracy metric.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_labels, references=gold_labels)

In [18]:
batch_size = 16  # per device; the library default is 8
num_epochs = 5

# Evaluate and checkpoint once per epoch. The two strategies are kept equal
# because load_best_model_at_end needs a checkpoint at each evaluation point.
# metric_for_best_model is left unset, so "best" means lowest validation loss,
# not highest accuracy.
args = TrainingArguments(
    output_dir="../models/FEVER_BERT_FEVER_PUB",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [19]:
# Wire the model, arguments, data and metric callback together. Training uses
# the combined train split; per-epoch evaluation uses the combined validation
# split. The tokenizer is passed in so it is saved with each checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

#### Train model

In [20]:
# Fine-tune on encoded_ds["train"], evaluating on encoded_ds["validation"] and
# saving a checkpoint once per epoch, as configured in TrainingArguments.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 153819
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 12020
  Number of trainable parameters = 109484547
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1626,0.168637,0.93737
2,0.1134,0.206743,0.938275
3,0.0766,0.205594,0.943705
4,0.0468,0.229026,0.942438
5,0.027,0.290554,0.944339


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11049
  Batch size = 64
Saving model checkpoint to ../models/FEVER_BERT_FEVER_PUB/checkpoint-2404
Configuration saved in ../models/FEVER_BERT_FEVER_PUB/checkpoint-2404/config.json
Model weights saved in ../models/FEVER_BERT_FEVER_PUB/checkpoint-2404/pytorch_model.bin
tokenizer config file saved in ../models/FEVER_BERT_FEVER_PUB/checkpoint-2404/tokenizer_config.json
Special tokens file saved in ../models/FEVER_BERT_FEVER_PUB/checkpoint-2404/special_tokens_map.json
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_

TrainOutput(global_step=12020, training_loss=0.09159879929908302, metrics={'train_runtime': 5397.5173, 'train_samples_per_second': 142.491, 'train_steps_per_second': 2.227, 'total_flos': 2.0235921400751616e+17, 'train_loss': 0.09159879929908302, 'epoch': 5.0})

In [25]:
# Score the model currently held by `trainer` on the FEVER test split. The
# returned dict (loss, accuracy, runtime stats) is displayed as the cell output.
trainer.evaluate(encoded_ds['fever_test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9999
  Batch size = 64


{'eval_loss': 0.174705371260643,
 'eval_accuracy': 0.9381938193819382,
 'eval_runtime': 27.784,
 'eval_samples_per_second': 359.883,
 'eval_steps_per_second': 5.651,
 'epoch': 5.0}

In [26]:
# Score the model currently held by `trainer` on the PUBHEALTH test split. The
# returned dict (loss, accuracy, runtime stats) is displayed as the cell output.
trainer.evaluate(encoded_ds['pubhealth_test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1032
  Batch size = 64


{'eval_loss': 0.4828401803970337,
 'eval_accuracy': 0.7965116279069767,
 'eval_runtime': 2.8988,
 'eval_samples_per_second': 356.013,
 'eval_steps_per_second': 5.865,
 'epoch': 5.0}

In [27]:
# Score the model on CLIMATE-FEVER. This dataset is not part of the train or
# validation splits (those are FEVER + PUBHEALTH only), so this is an
# out-of-domain check.
trainer.evaluate(encoded_ds['climate'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1381
  Batch size = 64


{'eval_loss': 2.2618136405944824,
 'eval_accuracy': 0.43084721216509775,
 'eval_runtime': 3.983,
 'eval_samples_per_second': 346.724,
 'eval_steps_per_second': 5.523,
 'epoch': 5.0}

## Inference

In [None]:
# Load best model.
# checkpoint-2404 is the checkpoint written after the first evaluation pass in
# the training log; that epoch has the lowest validation loss in the results table.
# NOTE(review): hard-coded path - confirm it still matches the run being analysed.
best_model_checkpoint = "../models/FEVER_BERT_FEVER_PUB/checkpoint-2404"