In [46]:
import os

from main import FeverLoader, PubhealthLoader, ClimateFeverLoader

## Setting
- 1: train on FEVER
- 2: train on pubhealth
- 3: train on climate

In [47]:
experiment = "3"
model_dirs = {
    "1": "../models/RoBERTa_FEVER",
    "2": "../models/RoBERTa_PUBHEALTH",
    "3": "../models/RoBERTa_CLIMATE"
}

## Load Data

In [48]:
root = '../data_2023_06_02'

climate_in = os.path.join(root, 'preprocessed/CLIMATE-FEVER')
pubhealth_in = os.path.join(root, 'preprocessed/PUBHEALTH')
fever_in = os.path.join(root, 'preprocessed/FEVER')

In [52]:
fever_train_ds, fever_dev_ds, fever_test_ds = FeverLoader.load(fever_in)
print(len(fever_train_ds))
print(len(fever_dev_ds))
print(len(fever_test_ds))

145449
9999
9999


In [53]:
pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds = PubhealthLoader.load(pubhealth_in)
print(len(pubhealth_train_ds))
print(len(pubhealth_dev_ds))
print(len(pubhealth_test_ds))

8370
1050
1032


In [51]:
climate_ds = ClimateFeverLoader.load(climate_in)
print(len(climate_ds))

1381


## Training

In [7]:
# model_name = "bert-base-uncased" #https://huggingface.co/bert-base-uncased
# model_name = "bert-large-uncased" #https://huggingface.co/bert-large-uncased
# model_name = "allenai/scibert_scivocab_uncased" #https://huggingface.co/allenai/scibert_scivocab_uncased
model_name = "roberta-base" #https://huggingface.co/roberta-base

#### Set up training data
- ds1: train on FEVER
- ds2: train on pubhealth
- ds3: train on climate

In [8]:
from datasets import Dataset, DatasetDict, ClassLabel, Value, Features

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
features = Features({
    "claim": Value("string"), 
    "evidence": Value("string"),
    "label": ClassLabel(num_classes=3, names=["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"])
})

In [10]:
# train on fever 
ds1 = DatasetDict()
ds1['train'] = Dataset.from_list(fever_train_ds, features=features)
ds1['validation'] = Dataset.from_list(fever_dev_ds, features=features)
ds1['fever_test'] = Dataset.from_list(fever_test_ds, features=features)
ds1['pubhealth_test'] = Dataset.from_list(pubhealth_test_ds, features=features)
ds1['climate_test']  = Dataset.from_list(climate_ds, features=features)

In [11]:
# train on pubhealth
ds2 = DatasetDict()
ds2['train'] = Dataset.from_list(pubhealth_train_ds, features=features)
ds2['validation'] = Dataset.from_list(pubhealth_dev_ds, features=features)
ds2['fever_test'] = Dataset.from_list(fever_test_ds, features=features)
ds2['pubhealth_test'] = Dataset.from_list(pubhealth_test_ds, features=features)
ds2['climate_test']  = Dataset.from_list(climate_ds, features=features)

In [12]:
# train on climate
ds3 = DatasetDict()

#split climate data
from sklearn.model_selection import train_test_split

climate_train_ds, climate_dev_ds = train_test_split(climate_ds, random_state=392)
print(len(climate_train_ds))
print(len(climate_dev_ds))


ds3['train'] = Dataset.from_list(climate_train_ds, features=features)
ds3['validation'] = Dataset.from_list(climate_dev_ds, features=features)
ds3['fever_test'] = Dataset.from_list(fever_test_ds, features=features)
ds3['pubhealth_test'] = Dataset.from_list(pubhealth_test_ds, features=features)

1035
346


#### Tokenize data

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess_function(samples):
    return tokenizer(samples['claim'], samples['evidence'], 
                     padding=True,
                     truncation='only_second', 
                     max_length=512)

In [14]:
if experiment == "1":
    ds = ds1
elif experiment == "2":
    ds = ds2
elif experiment == "3":
    ds = ds3
else:
    raise ValueError("Unknown Experiment")

In [15]:
encoded_ds = ds.map(preprocess_function, batched=True)

                                                                 

In [16]:
print(encoded_ds)

DatasetDict({
    train: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'attention_mask'],
        num_rows: 1035
    })
    validation: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'attention_mask'],
        num_rows: 346
    })
    fever_test: Dataset({
        features: ['claim', 'label', 'evidence', 'input_ids', 'attention_mask'],
        num_rows: 9999
    })
    pubhealth_test: Dataset({
        features: ['claim', 'evidence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1032
    })
})


#### Setup model

In [17]:
from transformers import AutoModelForSequenceClassification

In [36]:
num_labels = 3 
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print(model)

loading configuration file config.json from cache at /users/k21193529/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

#### Setup trainer

In [19]:
import numpy as np
import evaluate 
from transformers import TrainingArguments, Trainer

In [37]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references = labels)

In [38]:
batch_size = 8 #defaults to 8
num_epochs = 5

model_dir = model_dirs.get(experiment)
print(model_dir)

../models/RoBERTa_CLIMATE


In [39]:
args = TrainingArguments(
    model_dir,
    evaluation_strategy = "epoch",
    # evaluation_strategy = "steps",
    save_strategy = "epoch",
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    load_best_model_at_end = True,
    logging_strategy = "epoch",
    # metric_for_best_model = "accuracy"
)   

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [40]:
trainer = Trainer(
    model, 
    args,
    train_dataset = encoded_ds["train"],
    eval_dataset = encoded_ds["validation"],
    tokenizer = tokenizer, 
    compute_metrics = compute_metrics
)

#### Train model

In [24]:
import torch

In [41]:
torch.cuda.empty_cache()

In [42]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1035
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 165
  Number of trainable parameters = 124647939


Epoch,Training Loss,Validation Loss,Accuracy
1,1.035,1.057463,0.488439
2,0.999,1.022895,0.508671
3,0.9133,0.938152,0.517341
4,0.7485,1.025777,0.560694
5,0.609,1.083432,0.543353


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 346
  Batch size = 32
Saving model checkpoint to ../models/RoBERTa_CLIMATE/checkpoint-33
Configuration saved in ../models/RoBERTa_CLIMATE/checkpoint-33/config.json
Model weights saved in ../models/RoBERTa_CLIMATE/checkpoint-33/pytorch_model.bin
tokenizer config file saved in ../models/RoBERTa_CLIMATE/checkpoint-33/tokenizer_config.json
Special tokens file saved in ../models/RoBERTa_CLIMATE/checkpoint-33/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `RobertaForSequenceClassificat

TrainOutput(global_step=165, training_loss=0.860944042783795, metrics={'train_runtime': 52.967, 'train_samples_per_second': 97.702, 'train_steps_per_second': 3.115, 'total_flos': 1361611936742400.0, 'train_loss': 0.860944042783795, 'epoch': 5.0})

In [43]:
trainer.evaluate(encoded_ds['fever_test'])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9999
  Batch size = 32


{'eval_loss': 1.2547426223754883,
 'eval_accuracy': 0.34293429342934295,
 'eval_runtime': 32.213,
 'eval_samples_per_second': 310.402,
 'eval_steps_per_second': 9.717,
 'epoch': 5.0}

In [44]:
trainer.evaluate(encoded_ds['pubhealth_test'])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1032
  Batch size = 32


{'eval_loss': 1.3859279155731201,
 'eval_accuracy': 0.14534883720930233,
 'eval_runtime': 3.3218,
 'eval_samples_per_second': 310.676,
 'eval_steps_per_second': 9.934,
 'epoch': 5.0}

In [45]:
# trainer.evaluate(encoded_ds['climate_test'])
trainer.evaluate(encoded_ds['validation'])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: evidence, claim. If evidence, claim are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 346
  Batch size = 32


{'eval_loss': 0.9381524920463562,
 'eval_accuracy': 0.5173410404624278,
 'eval_runtime': 1.1324,
 'eval_samples_per_second': 305.543,
 'eval_steps_per_second': 9.714,
 'epoch': 5.0}

## Inference

In [None]:
# load best model
best_model_checkpoint = "../models/RoBERTa_FEVER/checkpoint-2273"
best_model_checkpoint = "../models/RoBERTa_PUBHEALTH/checkpoint-262"
best_model_checkpoint = "../models/RoBERTa_CLIMATE/checkpoint-99"

In [None]:
num_labels = 3 
model = AutoModelForSequenceClassification.from_pretrained(best_model_checkpoint, num_labels=num_labels)

In [None]:
import numpy as np
from transformers import TrainingArguments, Trainer
import evaluate 

batch_size = 16 #defaults to 8
num_epochs = 5

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references = labels)

args = TrainingArguments(
    "../models/SciBERT_FEVER",
    evaluation_strategy = "epoch",
    # evaluation_strategy = "steps",
    save_strategy = "epoch",
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    load_best_model_at_end = True,
    # metric_for_best_model = "accuracy"
)   

trainer = Trainer(
    model, 
    args,
    # train_dataset = encoded_ds["train"],
    eval_dataset = encoded_ds["validation"],
    tokenizer = tokenizer, 
    compute_metrics = compute_metrics
)

In [None]:
trainer.evaluate(encoded_ds['fever_test'])

In [None]:
trainer.evaluate(encoded_ds['pubhealth_test'])

In [None]:
trainer.evaluate(encoded_ds['climate'])