# Finetuning a DistilBERT Classifier in Lightning

![](figures/finetuning-ii.png)

In [23]:
# Print the installed versions of the packages listed after -p.
# NOTE(review): torchmetrics is imported later in this notebook but is not
# part of this version list.
%load_ext watermark
%watermark -p torch,transformers,datasets,lightning

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
torch       : 2.1.2+cu121
transformers: 4.37.2
datasets    : 2.17.0
lightning   : 2.1.4



# 1 Loading the dataset into DataFrames

In [24]:
# pip install datasets

import os.path as op

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import torch

from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [25]:
# Data-preparation calls, left commented out.
# NOTE(review): presumably these helpers download the raw data and write the
# train.csv / val.csv / test.csv files read in the next cell -- confirm
# against local_dataset_utilities before relying on this.
#download_dataset()

#df = load_dataset_into_to_dataframe()
#partition_dataset(df)

In [26]:
# Read the train, validation and test CSV files into pandas DataFrames,
# in that order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [27]:
# Map each split name to the CSV file that holds it, then load all three
# splits with a single load_dataset call.
data_files = {
    "train": "train.csv",
    "validation": "val.csv",
    "test": "test.csv",
}
imdb_dataset = load_dataset("csv", data_files=data_files)

print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


**Tokenize the dataset**

In [28]:
from transformers import AutoTokenizer

# Load the tokenizer for the "distilbert-base-uncased" checkpoint and print
# its maximum input length and vocabulary size.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [29]:
def tokenize_text(batch):
    """Tokenize the "text" entry of a batch with the notebook-level tokenizer.

    Truncation and padding are both switched on in the tokenizer call.

    Args:
        batch: Mapping with a "text" entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for those strings.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [30]:
# Apply tokenize_text to imdb_dataset in batched mode.
# NOTE(review): batch_size=None presumably hands each split to tokenize_text
# as one batch, so padding=True would pad to the longest sequence in that
# split -- confirm against the datasets documentation.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

In [31]:
# Drop the untokenized dataset; the rest of the notebook uses imdb_tokenized.
# NOTE(review): once this has run, the tokenization cell above cannot be
# re-run without first reloading imdb_dataset.
del imdb_dataset

In [32]:
# Switch the dataset to the "torch" format, restricted to the three columns
# that the training, validation and test steps read from each batch.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [33]:
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably done to avoid tokenizer fork warnings once the
# multi-worker DataLoaders below start -- confirm.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [34]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Map-style dataset exposing a single partition of a dataset mapping.

    Note: this definition shadows the ``IMDBDataset`` name imported from
    ``local_dataset_utilities`` earlier in the notebook.

    Args:
        dataset_dict: Mapping from partition name to partition object.
        partition_key: Name of the partition to expose (default "train").
    """

    def __init__(self, dataset_dict, partition_key="train"):
        selected = dataset_dict[partition_key]
        self.partition = selected

    def __len__(self):
        # The partition reports its own size through ``num_rows``.
        return self.partition.num_rows

    def __getitem__(self, index):
        # Indexing is delegated unchanged to the underlying partition.
        record = self.partition[index]
        return record

In [35]:
class DataModule(L.LightningDataModule):
    """LightningDataModule serving the train/validation/test splits of
    the notebook-level ``imdb_tokenized`` dataset.

    Args:
        batch_size: Batch size used by all three DataLoaders.
        num_workers: Worker processes per DataLoader (default 4, the value
            that was previously hard-coded in every loader).
    """

    def __init__(self, batch_size, num_workers=4):
        super().__init__()
        # Stored as plain attributes and read each time a loader is built,
        # so a value changed after construction is picked up by the next
        # loader that gets created.
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage: str):
        """Wrap each split of ``imdb_tokenized`` in an ``IMDBDataset``."""
        self.train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
        self.val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
        self.test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with this module's batch size and worker count."""
        return DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Return the training DataLoader (shuffled)."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation DataLoader (not shuffled)."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the test DataLoader (not shuffled)."""
        return self._make_loader(self.test_dataset, shuffle=False)

# 4 Initializing DistilBERT

In [36]:
from transformers import AutoModelForSequenceClassification

# Load the "distilbert-base-uncased" checkpoint as a sequence-classification
# model with two labels. The message printed below lists the weights that
# are newly initialized rather than loaded from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 5 Finetuning

**Wrap in LightningModule for Training**

In [37]:
import lightning as L
import torch
import torchmetrics


class LightningModel(L.LightningModule):
    """LightningModule wrapper used to finetune a sequence classifier.

    The wrapped model is called with ``input_ids``, ``attention_mask`` and
    ``labels``; its output is indexed with ``"loss"`` and ``"logits"``.

    Args:
        model: The model to finetune.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # Separate metric objects so validation and test accuracy are
        # accumulated independently of each other.
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output unchanged.

        ``labels`` is optional so the module can also be called without
        targets (e.g. for prediction); the step methods always pass labels.
        """
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _forward_batch(self, batch):
        """Call the module on the three fields every step reads from a batch."""
        return self(batch["input_ids"], attention_mask=batch["attention_mask"],
                    labels=batch["label"])

    @staticmethod
    def _predict_labels(outputs):
        """Return the arg-max over dimension 1 of the output logits."""
        return torch.argmax(outputs["logits"], 1)

    def training_step(self, batch, batch_idx):
        """Log the training loss and return it."""
        outputs = self._forward_batch(batch)
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        """Log validation loss and update/log validation accuracy."""
        outputs = self._forward_batch(batch)
        self.log("val_loss", outputs["loss"], prog_bar=True)

        self.val_acc(self._predict_labels(outputs), batch["label"])
        # The metric object itself is logged, under the key "val_acc".
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update and log test accuracy under the key "accuracy"."""
        outputs = self._forward_batch(batch)

        self.test_acc(self._predict_labels(outputs), batch["label"])
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    

# Wrap the DistilBERT classifier loaded earlier; learning_rate is left at its
# default of 5e-5.
lightning_model = LightningModel(model)

In [38]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


# Checkpointing: monitor "val_acc" in "max" mode and keep the top 1 model.
checkpoint_callback = ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)
callbacks = [checkpoint_callback]

# CSV logger configured with save_dir "logs/" and run name "my-model".
logger = CSVLogger(name="my-model", save_dir="logs/")

In [41]:
# Single-GPU trainer for three epochs using the callbacks and logger defined
# above.
trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu",
    devices=1,
    # "bf16-mixed" is the spelling Lightning asks for; the plain "bf16" alias
    # is only kept for historical reasons and triggers a warning.
    precision="bf16-mixed",
    logger=logger,
    log_every_n_steps=10,
)

/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/fabric/connector.py:558: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead!
Using bfloat16 Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [42]:
from lightning.pytorch.tuner import Tuner

# Batch-size finder bound to the trainer configured above.
tuner = Tuner(trainer)

# NOTE(review): the 9999 passed here does not appear to be the size that is
# actually tried -- the recorded log below shows the finder probing
# 2, 1, 2, 4, ... instead. Treat it as a placeholder; confirm against the
# Lightning Tuner documentation.
dm = DataModule(batch_size=9999)

# The value displayed by this cell (64 in the recorded run) is the batch
# size the finder settled on.
tuner.scale_batch_size(lightning_model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Batch size 2 failed, trying batch size 1
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 1 succeeded, trying batch size 2
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 2 succeeded, trying batch size 4
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 4 succeeded, trying batch size 8
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 8 succeeded, trying batch size 16
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 16 succeeded, trying batch size 32
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 32 succeeded, trying batch size 64
`Trainer.fit` stopped: `max_steps=3` reached.
Batch size 64 succeeded, trying batch size 128
Batch size 128 failed, trying batch size 64
Finished batch size finder, will continue with full run using batch size 64
Restoring states from the checkpoint path at /teamspace/studios/this_studio/.scale_batch_size_68c67fa3-fc87-4121-9971-585b4b46d9e4.ckpt
Restored all stat

64

In [43]:
# Recreate the data module with the batch size reported by the batch-size
# finder in the previous cell (64 in the recorded run).
dm = DataModule(batch_size=64)

In [44]:
import time

# Time the full finetuning run. perf_counter() is a monotonic clock, so the
# measured duration is not affected by system clock adjustments.
start = time.perf_counter()

trainer.fit(model=lightning_model,
            datamodule=dm)

end = time.perf_counter()
print(f"Time elapsed: {(end-start)/60:.2f} min")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name     | Type                                | Params
-----------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.0 M
1 | val_acc  | MulticlassAccuracy                  | 0     
2 | test_acc | MulticlassAccuracy                  | 0     
-----------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Time elapsed: 14.65 min


In [45]:
# Evaluate on the test dataloader. The log below shows weights being restored
# from a saved checkpoint before testing.
# NOTE(review): ckpt_path="best" presumably selects the checkpoint kept by the
# ModelCheckpoint callback (highest "val_acc") -- confirm.
trainer.test(lightning_model, datamodule=dm, ckpt_path="best")

Restoring states from the checkpoint path at logs/my-model/version_6/checkpoints/epoch=2-step=1641.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from the checkpoint at logs/my-model/version_6/checkpoints/epoch=2-step=1641.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.9261999726295471}]