In [1]:
import os

# from main import FeverLoader, PubhealthLoader, ClimateFeverLoader
from main import load_datasets

  from .autonotebook import tqdm as notebook_tqdm


## Setting
- 1: train on FEVER
- 2: train on PUBHEALTH
- 3: train on CLIMATE-FEVER

In [2]:
# Experiment key selecting the training corpus:
#   "1" = FEVER, "2" = PUBHEALTH, "3" = CLIMATE-FEVER.
experiment = "3"

# Output directory for the fine-tuned checkpoints of each experiment key.
model_dirs = {
    key: f"../models/BERT_{corpus}"
    for key, corpus in (("1", "FEVER"), ("2", "PUBHEALTH"), ("3", "CLIMATE"))
}

## Load Data
- ds1: train on FEVER
- ds2: train on PUBHEALTH
- ds3: train on CLIMATE-FEVER

In [3]:
# Root of the dataset snapshot; every corpus lives under <root>/preprocessed/<NAME>.
root = '../data_2023_06_02'

fever_dir, pubhealth_dir, climate_dir = (
    os.path.join(root, f'preprocessed/{corpus}')
    for corpus in ('FEVER', 'PUBHEALTH', 'CLIMATE-FEVER')
)

In [4]:
# Load the three per-experiment datasets (ds1 = FEVER, ds2 = PUBHEALTH,
# ds3 = CLIMATE-FEVER) plus ds_test from the preprocessed directories.
# NOTE(review): ds_test is not referenced by the later cells shown in this
# notebook — confirm whether it is still needed.
ds1, ds2, ds3, ds_test = load_datasets(fever_dir, pubhealth_dir, climate_dir)

## Training

In [5]:
# Hugging Face Hub id of the pretrained checkpoint to fine-tune. The same id is
# used for both the tokenizer and the model, so switch them together by
# un-commenting one of the alternatives below.
model_name = "bert-base-uncased" # https://huggingface.co/bert-base-uncased
# model_name = "bert-large-uncased" # https://huggingface.co/bert-large-uncased
# model_name = "allenai/scibert_scivocab_uncased" # https://huggingface.co/allenai/scibert_scivocab_uncased

#### Tokenize data

In [6]:
from transformers import AutoTokenizer

# Tokenizer matching the pretrained checkpoint selected by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(samples):
    """Tokenize claim/evidence pairs as two-segment inputs.

    Args:
        samples: Mapping with 'claim' and 'evidence' entries; these are
            passed to the tokenizer as the first and second segment.

    Returns:
        The tokenizer output for the given pair(s).

    Only the evidence (second segment) is truncated to fit `max_length`.
    No padding is applied here: padding inside a batched `map` pads every
    sample to the longest sequence of the whole map batch, which wastes
    compute. Padding is left to the batch collator instead (the Trainer
    pads each batch dynamically when it is given the tokenizer).
    """
    return tokenizer(
        samples['claim'],
        samples['evidence'],
        truncation='only_second',
        max_length=512,
    )

In [7]:
# Look up the training DatasetDict that belongs to the active experiment key.
datasets_by_experiment = {"1": ds1, "2": ds2, "3": ds3}
if experiment not in datasets_by_experiment:
    raise ValueError("Unknown Experiment")
ds = datasets_by_experiment[experiment]

In [8]:
# Tokenize the selected dataset; batched=True hands preprocess_function a
# batch of samples per call instead of one sample at a time.
encoded_ds = ds.map(preprocess_function, batched=True)

                                                                 

In [9]:
# Sanity check: list the splits, their columns and row counts after tokenization.
print(encoded_ds)

DatasetDict({
    train: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 981
    })
    validation: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
    fever_test: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9999
    })
    pubhealth_test: Dataset({
        features: ['claim', 'evidence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1032
    })
    climate_test: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})


#### Setup model

In [10]:
from transformers import AutoModelForSequenceClassification

In [11]:
# Number of output classes for the classification head.
# NOTE(review): the meaning of the three label ids is defined by the
# preprocessed data, not in this notebook — confirm the mapping there.
num_labels = 3
# Load the pretrained encoder and attach a sequence-classification head with
# `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# Dump the module tree for inspection.
print(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

#### Setup trainer

In [12]:
import numpy as np
import evaluate 
from transformers import TrainingArguments, Trainer

In [13]:
# Accuracy scorer from the `evaluate` library, shared by all evaluation runs.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score model outputs with the accuracy metric.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each
            sample is the argmax over the last axis of the logits.

    Returns:
        The result of `metric.compute` for the predicted classes against
        the reference labels.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

In [14]:
# Per-device batch size and number of passes over the training split.
batch_size = 8
num_epochs = 10

# Index directly instead of using .get(): an unknown experiment key must fail
# here rather than silently passing None on as the output directory.
model_dir = model_dirs[experiment]
print(model_dir)

../models/BERT_CLIMATE


In [15]:
# Evaluate, log and checkpoint once per epoch so that the best checkpoint can
# be restored when training finishes.
args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # metric_for_best_model is deliberately left unset, so the library default
    # decides which checkpoint counts as "best".
    load_best_model_at_end=True,
)

In [16]:
# Wire model, hyperparameters, data splits and the accuracy callback together.
# The tokenizer is handed over as well so it is stored alongside each
# checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

#### Train model

In [17]:
import torch

In [18]:
# Ask PyTorch to release cached, unused CUDA memory before training starts.
# NOTE(review): presumably here for re-runs within the same kernel — confirm.
torch.cuda.empty_cache()

In [19]:
# Run fine-tuning; evaluation, logging and checkpointing follow the schedule
# configured in the TrainingArguments.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 981
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 310
  Number of trainable parameters = 109484547
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0178,0.991931,0.54
2,0.8414,0.977253,0.565
3,0.5355,1.274049,0.55
4,0.3175,1.332595,0.52
5,0.1693,1.581463,0.525
6,0.0636,1.933583,0.56
7,0.0329,2.109499,0.525
8,0.0144,2.298011,0.52
9,0.0037,2.293919,0.55
10,0.0029,2.356198,0.53


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to ../models/BERT_CLIMATE/checkpoint-31
Configuration saved in ../models/BERT_CLIMATE/checkpoint-31/config.json
Model weights saved in ../models/BERT_CLIMATE/checkpoint-31/pytorch_model.bin
tokenizer config file saved in ../models/BERT_CLIMATE/checkpoint-31/tokenizer_config.json
Special tokens file saved in ../models/BERT_CLIMATE/checkpoint-31/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safe

TrainOutput(global_step=310, training_loss=0.2998966959695662, metrics={'train_runtime': 109.0549, 'train_samples_per_second': 89.955, 'train_steps_per_second': 2.843, 'total_flos': 2581142627911680.0, 'train_loss': 0.2998966959695662, 'epoch': 10.0})

In [20]:
# Score the trained model on the FEVER test split.
trainer.evaluate(encoded_ds['fever_test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9999
  Batch size = 32


{'eval_loss': 1.009195327758789,
 'eval_accuracy': 0.6060606060606061,
 'eval_runtime': 33.6789,
 'eval_samples_per_second': 296.892,
 'eval_steps_per_second': 9.294,
 'epoch': 10.0}

In [21]:
# Score the trained model on the PUBHEALTH test split.
trainer.evaluate(encoded_ds['pubhealth_test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1032
  Batch size = 32


{'eval_loss': 1.2854571342468262,
 'eval_accuracy': 0.3953488372093023,
 'eval_runtime': 3.5537,
 'eval_samples_per_second': 290.404,
 'eval_steps_per_second': 9.286,
 'epoch': 10.0}

In [22]:
# Score the trained model on the CLIMATE-FEVER test split.
trainer.evaluate(encoded_ds['climate_test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32


{'eval_loss': 0.9528858661651611,
 'eval_accuracy': 0.575,
 'eval_runtime': 0.6846,
 'eval_samples_per_second': 292.16,
 'eval_steps_per_second': 10.226,
 'epoch': 10.0}