In [None]:
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from pathlib import Path
import time

import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [None]:
class IMDBDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256, use_attention_mask=False):
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)
        self.pad_token_id = pad_token_id
        self.use_attention_mask = use_attention_mask

        # Pre-tokenize texts and create attention masks if required
        self.encoded_texts = [
            tokenizer.encode(text, truncation=True, max_length=self.max_length)
            for text in self.data["text"]
        ]
        self.encoded_texts = [
            et + [pad_token_id] * (self.max_length - len(et))
            for et in self.encoded_texts
        ]

        if self.use_attention_mask:
            self.attention_masks = [
                self._create_attention_mask(et)
                for et in self.encoded_texts
            ]
        else:
            self.attention_masks = None

    def _create_attention_mask(self, encoded_text):
        # fgs sole purpose to id text from padding
        return [1 if token_id != self.pad_token_id else 0 for token_id in encoded_text]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["label"]

        if self.use_attention_mask: attention_mask = self.attention_masks[index]
        else: attention_mask = torch.ones(self.max_length, dtype=torch.long)

        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
            )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self, tokenizer):
        max_length = 0
        for text in self.data["text"]:
            encoded_length = len(tokenizer.encode(text))
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length


fgs: The fact that the IMDBDataset class returns a tuple of (encoded_text, attention_mask, label) is not a strict, universal requirement imposed by Hugging Face models, but it is a very common and highly recommended practice when working with padded sequences and models that utilize attention masks, including many models from the Hugging Face library.

Here's why it's a good practice:

1. Hugging Face Model Input: Many Hugging Face models, especially those designed for sequence tasks like classification, accept input_ids (the encoded text) and attention_mask as separate inputs. Providing the attention mask allows the model to correctly handle the padded tokens and focus its attention only on the real content of the sequence.
2. Handling Padding: When you have sequences of varying lengths and pad them to a fixed maximum length, you need a way to tell the model which parts are real data and which are just padding. The attention mask serves this purpose. If you didn't provide an attention mask, the model might incorrectly process the padding tokens as if they were meaningful input, leading to degraded performance.
3. PyTorch DataLoader Compatibility: PyTorch DataLoader is designed to work with datasets that return tensors. By returning a tuple of tensors, your custom dataset seamlessly integrates with the DataLoader, which will then batch these tensors together for efficient processing.
4. Clarity and Organization: Returning the input, attention mask, and label as separate elements in a tuple makes the data structure clear and easy to work with in your training loop. You can easily unpack the tuple to get the individual components needed for your mode

In [None]:
! [ ! -f download_prepare_dataset.py ] && wget https://raw.githubusercontent.com/giordafrancis/llms_from_scratch/refs/heads/main/ch06/03_bonus_imdb-classification/download_prepare_dataset.py


In [None]:
!python download_prepare_dataset.py

In [None]:
mv *.csv sample_data/

In [None]:
ls

fgs: just testing the data loaders, as i'm interested to understand how the data looks like

In [None]:
###############################
# Instantiate dataloaders
###############################

base_path = Path("sample_data/")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
use_attention_mask = True
# use_attention_mask = False
train_dataset = IMDBDataset(
    base_path / "train.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=use_attention_mask
)
val_dataset = IMDBDataset(
    base_path / "validation.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=use_attention_mask
)
test_dataset = IMDBDataset(
    base_path / "test.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=use_attention_mask
)

num_workers = 0
batch_size = 8

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)


In [None]:
#X and y

# Set options to display more content in pandas columns
pd.set_option('display.max_colwidth', None)
train_dataset.data.sample(10)

In [None]:
for b in train_loader: break
# fgs: N X (token_ids, padd attention, target)
b

In [None]:
def calc_loss_batch(input_batch, attention_mask_batch, target_batch, model, device):
    attention_mask_batch = attention_mask_batch.to(device)
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    # logits = model(input_batch)[:, -1, :]  # Logits of last output token
    logits = model(input_batch, attention_mask=attention_mask_batch).logits # fgs padding att mask used, so need to pass this to model
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

In [None]:
# Same as in chapter 5
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if num_batches is None: num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, attention_mask_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, attention_mask_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

@torch.no_grad()  # Disable gradient tracking for efficiency
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, attention_mask_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            attention_mask_batch = attention_mask_batch.to(device)
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            # logits = model(input_batch)[:, -1, :]  # Logits of last output token
            logits = model(input_batch, attention_mask=attention_mask_batch).logits
            predicted_labels = torch.argmax(logits, dim=1)
            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter, max_steps=None):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, attention_mask_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, attention_mask_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            examples_seen += input_batch.shape[0]  # New: track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

            if max_steps is not None and global_step > max_steps:
                break

        # New: Calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

        if max_steps is not None and global_step > max_steps: break

    return train_losses, val_losses, train_accs, val_accs, examples_seen



## Define training params

In [24]:
model = "distilbert"
trainable_layers = "all"
use_attention_mask = True
num_epochs = 1
learning_rate = 5e-6

# data loaders params
num_workers = 0
batch_size = 8

In [18]:
model = "modernbert-base"
trainable_layers = "last_block"
use_attention_mask = True
num_epochs = 3
learning_rate = 5e-6

# data loaders params
num_workers = 0
batch_size = 8

## Model selection

In [25]:

model

'distilbert'

In [26]:
torch.manual_seed(123)
if model == "distilbert":
  model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=2)
  model.out_head = torch.nn.Linear(in_features=768, out_features=2)

  for param in model.parameters(): param.requires_grad = False
  if trainable_layers == "last_layer":
    for param in model.out_head.parameters(): param.requires_grad = True
  elif trainable_layers == "last_block":
      for param in model.pre_classifier.parameters(): param.requires_grad = True
      for param in model.distilbert.transformer.layer[-1].parameters(): param.requires_grad = True
  elif trainable_layers == "all":
      for param in model.parameters(): param.requires_grad = True
  else:
      raise ValueError("Invalid --trainable_layers argument.")
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
elif model == "bert":
  model = AutoModelForSequenceClassification.from_pretrained(
           "bert-base-uncased", num_labels=2)
  model.classifier = torch.nn.Linear(in_features=768, out_features=2)
  for param in model.parameters(): param.requires_grad = False
  if trainable_layers == "last_layer":
    for param in model.classifier.parameters(): param.requires_grad = True
  elif trainable_layers == "last_block":
      for param in model.classifier.parameters(): param.requires_grad = True
      for param in model.bert.pooler.dense.parameters(): param.requires_grad = True
      for param in model.bert.encoder.layer[-1].parameters(): param.requires_grad = True
  elif trainable_layers == "all":
    for param in model.parameters(): param.requires_grad = True
  else:
      raise ValueError("Invalid --trainable_layers argument.")
  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
elif model == "roberta":
  model = AutoModelForSequenceClassification.from_pretrained(
            "FacebookAI/roberta-large", num_labels=2)
  model.classifier.out_proj = torch.nn.Linear(in_features=1024, out_features=2)
  for param in model.parameters():
      param.requires_grad = False
  if trainable_layers == "last_layer":
      for param in model.classifier.parameters():
          param.requires_grad = True
  elif trainable_layers == "last_block":
      for param in model.classifier.parameters():
          param.requires_grad = True
      for param in model.roberta.encoder.layer[-1].parameters():
          param.requires_grad = True
  elif trainable_layers == "all":
      for param in model.parameters():
          param.requires_grad = True
  else:
      raise ValueError("Invalid --trainable_layers argument.")
  tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

elif model in ("modernbert-base", "modernbert-large"):
  if model == "modernbert-base":
        model = AutoModelForSequenceClassification.from_pretrained(
            "answerdotai/ModernBERT-base", num_labels=2
        )
        model.classifier = torch.nn.Linear(in_features=768, out_features=2)
  else:
        model = AutoModelForSequenceClassification.from_pretrained(
            "answerdotai/ModernBERT-large", num_labels=2
        )
        model.classifier = torch.nn.Linear(in_features=1024, out_features=2)
  for param in model.parameters():
      param.requires_grad = False
  if trainable_layers == "last_layer":
      for param in model.classifier.parameters():
          param.requires_grad = True
  elif trainable_layers == "last_block":
      for param in model.classifier.parameters():
          param.requires_grad = True
      for param in model.model.layers[-1].parameters():
          param.requires_grad = True
      for param in model.head.parameters():
          param.requires_grad = True
      # for param in model.classifier.parameters():
      #     param.requires_grad = True
  elif trainable_layers == "all":
      for param in model.parameters():
          param.requires_grad = True
  else:
      raise ValueError("Invalid --trainable_layers argument.")

  tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

elif model == "deberta-v3-base":
  model = AutoModelForSequenceClassification.from_pretrained(
        "microsoft/deberta-v3-base", num_labels=2)
  model.classifier = torch.nn.Linear(in_features=768, out_features=2)
  for param in model.parameters():
      param.requires_grad = False
  if trainable_layers == "last_layer":
      for param in model.classifier.parameters():
          param.requires_grad = True
  elif trainable_layers == "last_block":
    for param in model.classifier.parameters():
        param.requires_grad = True
    for param in model.pooler.parameters():
        param.requires_grad = True
    for param in model.deberta.encoder.layer[-1].parameters():
        param.requires_grad = True
  elif trainable_layers == "all":
    for param in model.parameters():
      param.requires_grad = True
  else:
      raise ValueError("Invalid --trainable_layers argument.")

  tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

else:
  raise ValueError("Selected --model {model} not supported.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Dataset and Data Loading

In [27]:
base_path = Path("sample_data/")

train_dataset = IMDBDataset(
    base_path / "train.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=use_attention_mask
)
val_dataset = IMDBDataset(
    base_path / "validation.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=use_attention_mask
)
test_dataset = IMDBDataset(
    base_path / "test.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=use_attention_mask
)


train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)


## Train

In [28]:
###############################
# Train model
###############################

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr= learning_rate,
                              weight_decay=0.1)

train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=20,
    max_steps=None
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")


Ep 1 (Step 000000): Train loss 0.694, Val loss 0.693
Ep 1 (Step 000050): Train loss 0.677, Val loss 0.677
Ep 1 (Step 000100): Train loss 0.626, Val loss 0.607
Ep 1 (Step 000150): Train loss 0.409, Val loss 0.416
Ep 1 (Step 000200): Train loss 0.390, Val loss 0.310
Ep 1 (Step 000250): Train loss 0.420, Val loss 0.390
Ep 1 (Step 000300): Train loss 0.252, Val loss 0.272
Ep 1 (Step 000350): Train loss 0.319, Val loss 0.272
Ep 1 (Step 000400): Train loss 0.321, Val loss 0.259
Ep 1 (Step 000450): Train loss 0.230, Val loss 0.250
Ep 1 (Step 000500): Train loss 0.272, Val loss 0.280
Ep 1 (Step 000550): Train loss 0.291, Val loss 0.260
Ep 1 (Step 000600): Train loss 0.208, Val loss 0.228
Ep 1 (Step 000650): Train loss 0.257, Val loss 0.238
Ep 1 (Step 000700): Train loss 0.339, Val loss 0.237
Ep 1 (Step 000750): Train loss 0.280, Val loss 0.220
Ep 1 (Step 000800): Train loss 0.257, Val loss 0.217
Ep 1 (Step 000850): Train loss 0.250, Val loss 0.218
Ep 1 (Step 000900): Train loss 0.193, Val loss

## Model eval

In [22]:
###############################
# Evaluate model
###############################

print("\nEvaluating on the full datasets ...\n")

train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")


Evaluating on the full datasets ...

Training accuracy: 86.66%
Validation accuracy: 86.80%
Test accuracy: 86.10%


## Results


fgs: DistellBert seems a really good starting point model
trainable_layers = "last_block"
use_attention_mask = True
num_epochs = 1
learning_rate = 5e-6

**DistillBert: Evaluating on the full datasets ...**

- Training accuracy: 88.56%
- Validation accuracy: 88.88%
- Test accuracy: 88.84%


**modernbert-base: Evaluating on the full datasets ...**
epoch == 1

- Training accuracy: 82.98%
- Validation accuracy: 83.26%
- Test accuracy: 82.36%

epoch == 3

Training accuracy: 86.66%
Validation accuracy: 86.80%
Test accuracy: 86.10%