# BERT / Transformer-Based Model Fine-Tuning: Banking77 Dataset

## Packages

In [1]:
# Load packages
from dotenv import load_dotenv

load_dotenv()
# Dataset
from datasets import load_dataset
# For EDA
from collections import Counter
import matplotlib.pyplot as plt
# For Model training
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
# For Evaluation
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


## Dataset 

In [2]:
# Experiment configuration: Hub id of the training dataset and the checkpoints to compare.
dataset_id = "PolyAI/banking77"
# Checkpoint ids, used for tokenization and for modeling.
models = [
    "bert-base-uncased",
    "bert-base-cased",
    "bert-large-uncased",
    "bert-large-cased",
]

# Load the dataset used for training and evaluation.
dataset = load_dataset(dataset_id)

# Sanity check of the splits (recorded run: train 10003 rows | test 3080 rows).
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})


In [3]:
# Integer class id -> label name, taken from the train split's label feature.
# (The variable's spelling is kept unchanged so existing references keep working.)
lable_mapping_dict = dict(enumerate(dataset['train'].features['label'].names))

## Data Processing

### Tokenization

In [4]:
# Tokenize function: converts a batch of examples into input tokens.
def tokenize_function(data, model_id, max_length=64):
    """Tokenize a batch of texts into fixed-length model inputs.

    Args:
        data: Batch mapping whose "text" entry holds the strings to encode.
        model_id: Checkpoint id passed to ``AutoTokenizer.from_pretrained``.
        max_length: Length every sequence is padded to and truncated at.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    return tokenizer(
        data["text"],
        add_special_tokens=True,  # default = True; adds [CLS] and [SEP]
        padding="max_length",  # pad shorter sequences up to max_length
        max_length=max_length,  # target length for padding and truncation
        # Truncation must be requested explicitly: with padding alone, texts longer
        # than max_length keep their full length and produce ragged batches.
        truncation=True,
    )

### Metrics

In [5]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for an evaluation result.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Only F1 is reported, so precision/recall/support are discarded.
    # zero_division=0 scores undefined per-class metrics (e.g. a class that is never
    # predicted) as 0 without emitting a warning on every evaluation.
    _, _, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1,
    }

## Training

In [9]:
# Test-set metrics collected per fine-tuned checkpoint, keyed by model id.
results = dict()

In [10]:
model_id = models[0]

# Tokenize: convert every split of the dataset into input tokens
tokenized_datasets = dataset.map(lambda x: tokenize_function(x,model_id), batched=True)

# Train-Valid-Test split: hold out 10% of the original train split for validation
train_test_split = tokenized_datasets["train"].shuffle(seed=86).train_test_split(test_size=0.1, seed=86)
train_dataset, valid_dataset = train_test_split["train"], train_test_split["test"]
# Test dataset remains the same
test_dataset = tokenized_datasets["test"].shuffle(seed=86)

# Load Model
# The number of classes is read from the dataset's label feature instead of being
# hard-coded, and the id <-> name mapping is stored in the model config.
label_names = dataset["train"].features["label"].names
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

# Training hyper-parameters
training_args = TrainingArguments(
    output_dir="result_bert/{}".format(model_id),
    logging_dir='./logs/{}'.format(model_id),  # one log directory per model so runs do not mix
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="epoch",               # save a checkpoint once per epoch

    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,

    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=200,
    save_total_limit=2,                  # keep at most 2 checkpoints
    load_best_model_at_end=True,         # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",    # "best" is judged by validation accuracy
    label_smoothing_factor=0.1
)

print('------------------------------------{}'.format(model_id))
# Count total and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

# Set Trainer
trainer1 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset ,
    compute_metrics=compute_metrics
)

trainer1.train()
# Score the trained model on the untouched test split and record it under this model id
results[model_id] = trainer1.evaluate(test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------------------------------------bert-base-uncased
Total parameters: 109541453
Trainable parameters: 109541453


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.5661,1.798806,0.753247,0.72541
2,1.215,1.073677,0.904096,0.903754
3,0.9336,1.009164,0.922078,0.921675
4,0.8532,0.986852,0.927073,0.926787
5,0.8092,0.979236,0.931069,0.931577


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
model_id = models[1]

# Tokenize: convert every split of the dataset into input tokens
tokenized_datasets = dataset.map(lambda x: tokenize_function(x,model_id), batched=True)

# Train-Valid-Test split: hold out 10% of the original train split for validation
train_test_split = tokenized_datasets["train"].shuffle(seed=86).train_test_split(test_size=0.1, seed=86)
train_dataset, valid_dataset = train_test_split["train"], train_test_split["test"]
# Test dataset remains the same
test_dataset = tokenized_datasets["test"].shuffle(seed=86)

# Load Model
# The number of classes is read from the dataset's label feature instead of being
# hard-coded, and the id <-> name mapping is stored in the model config.
label_names = dataset["train"].features["label"].names
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

# Training hyper-parameters
training_args = TrainingArguments(
    output_dir="result_bert/{}".format(model_id),
    logging_dir='./logs/{}'.format(model_id),  # one log directory per model so runs do not mix
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="epoch",               # save a checkpoint once per epoch

    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,

    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=200,
    save_total_limit=2,                  # keep at most 2 checkpoints
    load_best_model_at_end=True,         # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",    # "best" is judged by validation accuracy
    label_smoothing_factor=0.1
)

print('------------------------------------{}'.format(model_id))
# Count total and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

# Set Trainer
trainer2 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset ,
    compute_metrics=compute_metrics
)

trainer2.train()
# Score the trained model on the untouched test split and record it under this model id
results[model_id] = trainer2.evaluate(test_dataset)

Map: 100%|██████████| 10003/10003 [00:04<00:00, 2431.36 examples/s]
Map: 100%|██████████| 3080/3080 [00:01<00:00, 2270.32 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------------------------------------bert-base-cased
Total parameters: 108369485
Trainable parameters: 108369485


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.1855,1.576709,0.776224,0.758308
2,1.1633,1.070966,0.899101,0.900079
3,0.9246,1.011657,0.926074,0.925368
4,0.8393,0.972451,0.942058,0.941487
5,0.7983,0.971478,0.937063,0.936831


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
results

{'bert-base-uncased': {'eval_loss': 0.9899234771728516,
  'eval_accuracy': 0.9334415584415584,
  'eval_f1': 0.9332756310648983,
  'eval_runtime': 22.5818,
  'eval_samples_per_second': 136.393,
  'eval_steps_per_second': 4.295,
  'epoch': 5.0},
 'bert-base-cased': {'eval_loss': 0.9882162809371948,
  'eval_accuracy': 0.9321428571428572,
  'eval_f1': 0.9320377781510449,
  'eval_runtime': 22.6068,
  'eval_samples_per_second': 136.242,
  'eval_steps_per_second': 4.291,
  'epoch': 5.0}}