<a href="https://colab.research.google.com/github/diegomrodrigues/llm/blob/main/BERT_Fine_Tunning_for_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install transformers datasets huggingface_hub tensorboardX accelerate --upgrade



In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import notebook_login
from huggingface_hub import HFSummaryWriter

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers.integrations import TensorBoardCallback
import time
import os


In [5]:
def load_and_preprocess_data(tokenizer):
    # Load the IMDB dataset
    imdb = load_dataset("imdb")

    # Preprocess the dataset
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

    # Encode the dataset
    imdb_encoded = imdb.map(preprocess_function, batched=True, remove_columns=["text"])

    return imdb_encoded

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

def train_model(imdb_encoded, tokenizer, hf_writer):
    # Create a data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=5,
        evaluation_strategy='steps',
        eval_steps=5,
        save_strategy='steps',
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        report_to=["tensorboard"]
    )

    # Load the BERT model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=imdb_encoded['train'],
        eval_dataset=imdb_encoded['test'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[TensorBoardCallback(hf_writer)]
    )

    # Train the model
    trainer.train()

    # Save the trained model
    trainer.save_model("./trained_model")

def main():
    # Logger to HF
    hf_writer = HFSummaryWriter(repo_id="test_hf_logger")

    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Load and preprocess the data
    imdb_encoded = load_and_preprocess_data(tokenizer)

    # Train the model
    train_model(imdb_encoded, tokenizer, hf_writer)

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
5,0.679,0.686447,0.55664,0.53456,0.562577,0.5092
10,0.7056,0.685498,0.56244,0.570767,0.560108,0.58184
15,0.6944,0.68383,0.57288,0.60127,0.563796,0.64408
20,0.6911,0.681003,0.5872,0.621645,0.573768,0.67824
25,0.6839,0.677685,0.6044,0.645011,0.584961,0.7188
30,0.6758,0.672793,0.63212,0.649304,0.620328,0.68112
35,0.6627,0.667165,0.65888,0.657124,0.660524,0.65376
40,0.68,0.65951,0.68084,0.65113,0.717964,0.59568
45,0.6649,0.650503,0.69972,0.675317,0.735053,0.62456
50,0.6789,0.640762,0.72228,0.720006,0.725949,0.71416


KeyboardInterrupt: 