<a href="https://colab.research.google.com/github/diegomrodrigues/llm/blob/main/BERT_Fine_Tunning_for_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install transformers datasets huggingface_hub tensorboardX accelerate --upgrade



In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import notebook_login
from huggingface_hub import HFSummaryWriter

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers.integrations import TensorBoardCallback
import time
import os


In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch

def load_and_preprocess_data(tokenizer):
    # Load the IMDB dataset
    imdb = load_dataset("imdb")

    # Preprocess the dataset
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

    # Encode the dataset
    imdb_encoded = imdb.map(preprocess_function, batched=True, remove_columns=["text"])

    return imdb_encoded

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

def train_model(imdb_encoded, tokenizer, hf_writer):
    # Create a data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=32,  # Increase batch size for better performance
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=5,
        evaluation_strategy='steps',
        eval_steps=5,
        save_strategy='steps',
        save_steps=5,
        save_total_limit=5, # Only keep the 5 most recent checkpoints
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        report_to=["tensorboard"],
        fp16=True,  # Enable mixed precision training
        gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch size
        gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    )

    # Load the BERT model
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=imdb_encoded['train'],
        eval_dataset=imdb_encoded['test'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[TensorBoardCallback(hf_writer)]
    )

    #trainer.push_to_hub(commit_message="Upload BERT model")

    # Train the model
    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):  # Enable Flash Attention for faster training
        trainer.train(resume_from_checkpoint='/content/results/checkpoint-35')

    # Save the trained model
    trainer.save_model("./trained_model")

def main():
    # Logger to HF
    hf_writer = HFSummaryWriter(repo_id="BERT-IMDB-Sentiment-Finetuned")

    # Load the BERT tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Load and preprocess the data
    imdb_encoded = load_and_preprocess_data(tokenizer)

    # Train the model
    train_model(imdb_encoded, tokenizer, hf_writer)

if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]