# Fine-tuning a BERT Classifier using the Transformers Trainer API | Bachelor's Project
This file covers how we fine-tuned NB-BERT-large for the task of discriminating between real and synthetic news.

October-November 2022. Mina Almasi & Anton Drasbæk

## SETUP

In [None]:
!pip install -q transformers pandas datasets sklearn evaluate transformers_interpret wandb

In [None]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory in use).
!nvidia-smi

Fri Nov 18 11:21:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    50W / 400W |  28668MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import wandb
# Authenticate with Weights & Biases; the training arguments further down set report_to = "wandb".
wandb.login()

True

## Process data

Import data: 

In [None]:
# Mount Google Drive (only works when run from Google Colab).
# The CSV files read in the next cell are located under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

MessageError: ignored

In [None]:
import pandas as pd

# Single place to change where the CSV files live (Google Drive mount when run in Colab).
DATA_DIR = "/content/drive/MyDrive/000 bachelor-project/data"
# The notebook only uses the raw text and its label.
RELEVANT_COLUMNS = ["text", "label"]

# import data
data = pd.read_csv(f"{DATA_DIR}/labelled_data_for_classifier.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_data_classifier.csv")

# select relevant columns for BOTH frames so they share the same schema;
# .copy() yields independent frames, so assigning to their columns later
# does not trigger pandas' SettingWithCopyWarning
data = data[RELEVANT_COLUMNS].copy()
test_data = test_data[RELEVANT_COLUMNS].copy()

In [None]:
# make sure both frames are in the right format: integer labels and string texts.
# DataFrame.astype with a column -> dtype mapping returns a NEW frame, so no column is
# assigned into a frame that was produced by column selection (which can trigger
# SettingWithCopyWarning), and re-running this cell gives the same result.
# NOTE(review): astype(str) turns missing texts into the literal string "nan" (same as
# before this change) - confirm the CSVs contain no missing texts.
COLUMN_DTYPES = {"label": int, "text": str}

data = data.astype(COLUMN_DTYPES)
test_data = test_data.astype(COLUMN_DTYPES)

In [None]:
#split up data into train and eval
from sklearn.model_selection import train_test_split

# Fixed seed so the train/eval split (and every result computed from it) is reproducible.
SPLIT_SEED = 42

# The default split proportion is kept (no test_size given, as before).
# Stratifying on the label gives both splits the same class balance.
train_data, eval_data = train_test_split(
    data,
    random_state=SPLIT_SEED,
    stratify=data["label"],
)

Convert the data to the Hugging Face `DatasetDict` format used for training:

In [None]:
import datasets
import pandas as pd 
from datasets import Dataset

def _to_hf_dataset(frame):
    """Convert a pandas DataFrame into a Hugging Face Dataset without keeping the pandas index."""
    return Dataset.from_pandas(frame, preserve_index=False)


# one Hugging Face Dataset per split
train_dataset = _to_hf_dataset(train_data)
eval_dataset = _to_hf_dataset(eval_data)
test_dataset = _to_hf_dataset(test_data)

In [None]:
# Gather the three splits in one DatasetDict, keyed by the split names used later on
splits = {
    "train": train_dataset,
    "eval": eval_dataset,
    "test": test_dataset,
}
dataset = datasets.DatasetDict(splits)
dataset

## Tokenization, Choosing model, Evaluation Function

In [None]:
import torch 
from transformers import AdamW, AutoTokenizer

In [None]:
#define model type
model_name = "NbAiLab/nb-bert-large"

# Longest input the model accepts: max_position_embeddings in the model config is 512.
MAX_LENGTH = 512

#import tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

#tokenize function from huggingface
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating every text to at most MAX_LENGTH tokens.

    The tokenizer of this checkpoint has no predefined maximum length, so
    ``truncation=True`` on its own falls back to *no* truncation (the tokenizer
    prints a warning saying so). ``max_length`` is therefore passed explicitly so
    that long texts cannot exceed the model's position embeddings.

    Args:
        examples: a batch as passed by ``Dataset.map(..., batched=True)``; only
            its "text" field is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

#tokenization with the datasets map function to apply the tokenize function to the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True) #batched = True to process multiple elements at once

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--NbAiLab--nb-bert-large/snapshots/27e8180855f0de03688958c88a2e5702bfbf0bfd/config.json
Model config BertConfig {
  "_name_or_path": "NbAiLab/nb-bert-large",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--NbAiLab--nb-bert-large/snapshots/27e8180855f0de03688958c88a2e5702bfbf0bfd/vocab.txt
loading file tokenizer.json from cache at None
l

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Inspect the tokenized splits: column names and number of rows per split.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 913
    })
    eval: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 305
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 96
    })
})

In [None]:
# Define the data collator: it assembles the batches of examples during training
# and pads the examples in each batch using the tokenizer.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Evaluation function (for trainer)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(pred):
    """Compute accuracy and class-weighted precision / recall / F1 for the Trainer.

    Args:
        pred: evaluation output exposing ``label_ids`` (the gold labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis is
            used as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)

    # the fourth element returned (support) is not reported
    weighted_scores = precision_recall_fscore_support(gold, predicted, average='weighted')
    precision, recall, f1 = weighted_scores[:3]

    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [None]:
from transformers import EarlyStoppingCallback 
# Stop training early when the validation metric has not improved for 3 evaluations in a row.
# Evaluation runs once per epoch (see the training arguments), so this amounts to 3 epochs.
early_stop = EarlyStoppingCallback(early_stopping_patience = 3)

## Training

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint named by model_name for sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # two classes: real vs. synthetic news

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--NbAiLab--nb-bert-large/snapshots/27e8180855f0de03688958c88a2e5702bfbf0bfd/config.json
Model config BertConfig {
  "_name_or_path": "NbAiLab/nb-bert-large",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--NbAiLab--nb-bert-large/snapshots/27e8180855f0de03688958c88a2e5702bfbf0bfd/pytorch_model.bin
Some weights of the mode

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; required because the training arguments set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# per-device batch size, shared by training and evaluation
batch_size = 24

# training configuration, grouped by concern
training_args = TrainingArguments(
    # output location and Hub upload
    output_dir="./finetuned-nb-bert-large-8",
    push_to_hub=True,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, log and checkpoint once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # model selection: reload the checkpoint with the best eval accuracy at the end
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # experiment tracking
    report_to="wandb",
)

# close any W&B run that is still open
# NOTE(review): presumably so that the Trainer starts a fresh run - confirm
wandb.finish()

PyTorch: setting up devices


In [None]:
# define trainer: combines the model, the training arguments, the tokenized train/eval
# splits, the padding collator, the metric function and the early-stopping callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [early_stop]
)

# run the fine-tuning
trainer.train()

# close the W&B run now that training is done
wandb.finish()

/content/./finetuned-nb-bert-large-8 is already a clone of https://huggingface.co/MinaAlmasi/finetuned-nb-bert-large-8. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 913
  Num Epochs = 15
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 585
  Number of trainable parameters = 355089410
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.696,0.492585,0.82623,0.821127,0.867197,0.82623
2,0.4195,0.18038,0.957377,0.957374,0.957557,0.957377
3,0.1458,0.281003,0.92459,0.92414,0.934445,0.92459
4,0.0424,0.589323,0.885246,0.883844,0.904115,0.885246
5,0.0246,1.477563,0.747541,0.730101,0.83206,0.747541


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 305
  Batch size = 24
Saving model checkpoint to ./finetuned-nb-bert-large-8/checkpoint-39
Configuration saved in ./finetuned-nb-bert-large-8/checkpoint-39/config.json
Model weights saved in ./finetuned-nb-bert-large-8/checkpoint-39/pytorch_model.bin
tokenizer config file saved in ./finetuned-nb-bert-large-8/checkpoint-39/tokenizer_config.json
Special tokens file saved in ./finetuned-nb-bert-large-8/checkpoint-39/special_tokens_map.json
tokenizer config file saved in ./finetuned-nb-bert-large-8/tokenizer_config.json
Special tokens file saved in ./finetuned-nb-bert-large-8/special_tokens_map.json
Several commits (2) will be pushed upstream.
The following columns in the evaluation set do

VBox(children=(Label(value='0.001 MB of 0.022 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.033528…

0,1
eval/accuracy,▄█▇▆▁
eval/f1,▄█▇▆▁
eval/loss,▃▁▂▃█
eval/precision,▃█▇▅▁
eval/recall,▄█▇▆▁
eval/runtime,█▁▂▇▃
eval/samples_per_second,▁█▇▂▆
eval/steps_per_second,▁█▇▂▆
train/epoch,▁▁▃▃▅▅▆▆███
train/global_step,▁▁▃▃▅▅▆▆███

0,1
eval/accuracy,0.74754
eval/f1,0.7301
eval/loss,1.47756
eval/precision,0.83206
eval/recall,0.74754
eval/runtime,3.1107
eval/samples_per_second,98.049
eval/steps_per_second,4.179
train/epoch,5.0
train/global_step,195.0


In [None]:
#trainer.evaluate()

In [None]:
# Draft a model card from the information available to the trainer.
trainer.create_model_card()

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7475409836065574}, {'name': 'F1', 'type': 'f1', 'value': 0.7301007787267076}, {'name': 'Precision', 'type': 'precision', 'value': 0.8320598717034926}, {'name': 'Recall', 'type': 'recall', 'value': 0.7475409836065574}]}


## Predict on Test Data

In [None]:
predictions = trainer.predict(tokenized_dataset["test"])

# metrics on the test split (third field of the prediction output)
print(predictions.metrics)
# predicted class per test example: arg-max over the class scores
print(predictions.predictions.argmax(axis=-1))

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 96
  Batch size = 24


{'test_loss': 0.25739938020706177, 'test_accuracy': 0.9270833333333334, 'test_f1': 0.9268849961919269, 'test_precision': 0.93176831943835, 'test_recall': 0.9270833333333334, 'test_runtime': 1.0255, 'test_samples_per_second': 93.612, 'test_steps_per_second': 3.901}
[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0
 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0]


## Save & Push Model To Hub


In [None]:
# Save the model to a local directory. With push_to_hub=True in the training arguments,
# this call also pushes the model to the Hub (see the upload log below).
trainer.save_model("./finetuned_nb_bert_18nov")

Saving model checkpoint to ./finetuned_nb_bert_18nov
Configuration saved in ./finetuned_nb_bert_18nov/config.json
Model weights saved in ./finetuned_nb_bert_18nov/pytorch_model.bin
tokenizer config file saved in ./finetuned_nb_bert_18nov/tokenizer_config.json
Special tokens file saved in ./finetuned_nb_bert_18nov/special_tokens_map.json
Saving model checkpoint to ./finetuned-nb-bert-large-8
Configuration saved in ./finetuned-nb-bert-large-8/config.json
Model weights saved in ./finetuned-nb-bert-large-8/pytorch_model.bin
tokenizer config file saved in ./finetuned-nb-bert-large-8/tokenizer_config.json
Special tokens file saved in ./finetuned-nb-bert-large-8/special_tokens_map.json
Several commits (3) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.33k/1.32G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/MinaAlmasi/finetuned-nb-bert-large-8
   fff1015..6e3d1c8  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/MinaAlmasi/finetuned-nb-bert-large-8
   fff1015..6e3d1c8  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7475409836065574}, {'name': 'F1', 'type': 'f1', 'value': 0.7301007787267076}, {'name': 'Precision', 'type': 'precision', 'value': 0.8320598717034926}, {'name': 'Recall', 'type': 'recall', 'value': 0.7475409836065574}]}
