In [1]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizer
from datasets import load_dataset
from huggingface_hub import notebook_login
from sklearn.metrics import accuracy_score, f1_score
from utils import tokenize
import torch

In [2]:
num_labels = 77
model_ckpt = "distilbert-base-uncased" # model from huggingface
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = (AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.we

In [3]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1}

In [4]:
banking = load_dataset("banking77")
tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)
encoded_banking = banking.map(
    tokenize, batched=True, batch_size=None, fn_kwargs={"tokenizer": tokenizer}
)

Found cached dataset banking77 (/home/fabio/.cache/huggingface/datasets/banking77/default/1.1.0/9898c11f6afa9521953d2ef205667b527bad14ef9cab445d470f16240c8c8ec4)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/fabio/.cache/huggingface/datasets/banking77/default/1.1.0/9898c11f6afa9521953d2ef205667b527bad14ef9cab445d470f16240c8c8ec4/cache-0eab759da27d28f6.arrow
Loading cached processed dataset at /home/fabio/.cache/huggingface/datasets/banking77/default/1.1.0/9898c11f6afa9521953d2ef205667b527bad14ef9cab445d470f16240c8c8ec4/cache-ce9b66165e5e501d.arrow


In [5]:
train = encoded_banking['train'].train_test_split(test_size=0.2, shuffle=True)
encoded_banking['train'] = train['train']
encoded_banking['validation'] = train['test']
del train;

In [6]:
encoded_banking

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8002
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3080
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2001
    })
})

In [7]:
batch_size = 64
logging_steps = len(encoded_banking['train']) // batch_size
model_name = f'{model_ckpt}-finetuned-banking'
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level='error')

In [8]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=encoded_banking['train'],
                  eval_dataset=encoded_banking['validation'],
                  tokenizer=tokenizer)
trainer.train()

/home/fabio/code/llm_banking/distilbert-base-uncased-finetuned-banking is already a clone of https://huggingface.co/kaladin11/distilbert-base-uncased-finetuned-banking. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,4.0082,3.414004,0.485257,0.438369
2,2.9993,2.430354,0.661669,0.622562
3,2.172,1.718919,0.757121,0.736442
4,1.5765,1.249432,0.812594,0.801441
5,1.1802,0.960559,0.857071,0.848879
6,0.9069,0.781548,0.882059,0.879114
7,0.7459,0.67485,0.886557,0.88439
8,0.6213,0.605177,0.902049,0.900367
9,0.5635,0.568564,0.902549,0.900837
10,0.5226,0.557399,0.904048,0.902482


Several commits (4) will be pushed upstream.


TrainOutput(global_step=1260, training_loss=1.5216158473302448, metrics={'train_runtime': 154.3483, 'train_samples_per_second': 518.438, 'train_steps_per_second': 8.163, 'total_flos': 2031627853938480.0, 'train_loss': 1.5216158473302448, 'epoch': 10.0})