----
# BERT fine-tuning from scratch: Coding the Modifed LoRA and evaluating the model on Sentiment Analysis

In this notebook, we will finetune a BERT model to perform sentiment analysis.

----

### Implementing our modified version of LoRA

This is **done** in the file `models.py`.

### Loading the dataset

In [1]:
import os
from datasets import load_dataset

import pandas as pd
import torch

from processing.dataset_utils import download_dataset, load_dataset_into_to_dataframe, partition_dataset, IMDBDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [5]:
files = ("test.csv", "train.csv", "val.csv")
download = True

for f in files:
    if not os.path.exists(os.path.join("data/sentiment", f)):
        download = False

if download is False:
    download_dataset()
    df = load_dataset_into_to_dataframe()
    partition_dataset(df)

In [6]:
df_train = pd.read_csv(os.path.join("data/sentiment", "train.csv"))
df_val = pd.read_csv(os.path.join("data/sentiment", "val.csv"))
df_test = pd.read_csv(os.path.join("data/sentiment", "test.csv"))

In [7]:
df_test.head(5)

Unnamed: 0,index,text,label
0,0,i really did not watch this show as often when...,1
1,0,"OK, I got the DVD set last week and I am final...",1
2,0,"This is a bizarre oddity, directed by the guy ...",1
3,0,i liked this movie a lot.I rented this expecti...,1
4,0,I never saw Doctor Who before (at least not in...,1


### 2- Tokenization

In [8]:
imdb_dataset = load_dataset(
    "csv",
    data_files={
        "train": os.path.join("data/sentiment", "train.csv"),
        "validation": os.path.join("data/sentiment", "val.csv"),
        "test": os.path.join("data/sentiment", "test.csv"),
    },
)

print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [10]:
def tokenize_text(batch, truncation = True):
    return tokenizer(batch["text"], truncation=truncation, padding=True)

In [11]:
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

Map: 100%|██████████| 35000/35000 [00:08<00:00, 3947.60 examples/s]
Map: 100%|██████████| 5000/5000 [00:01<00:00, 4747.43 examples/s]
Map: 100%|██████████| 10000/10000 [00:02<00:00, 3674.41 examples/s]


In [12]:
imdb_tokenized

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [13]:
import numpy as np
X = np.array(imdb_tokenized['train']['input_ids'])
print(X.shape)

(35000, 512)


In [14]:
del imdb_dataset

In [15]:
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [16]:
imdb_tokenized

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

### 3- Set up DataLoaders

In [17]:
from torch.utils.data import DataLoader

train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=1
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    num_workers=1
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    num_workers=1
)

### 4- Initializing BERT

In [5]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [7]:
from model import *
# LoRA parameters
rank = 8
alpha = 0.5
alpha_r = rank

# Replacing Linear with LoRA
replace_lora_roberta(model, rank, alpha, alpha_r,device, train_alpha= True)

In [8]:
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): LoRALinear(
                (linear): Linear(in_features=768, out_features=768, bias=True)
              )
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): LoRALinear(
                (linear): Linear(in_features=768, out_features=768, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (outpu

In [9]:
# Check if LoRA was introduced correctly: Linear layers frozen and A, B trainable
for name, param in model.named_parameters():
    print(f"{name}: {param.requires_grad}")

roberta.embeddings.word_embeddings.weight: False
roberta.embeddings.position_embeddings.weight: False
roberta.embeddings.token_type_embeddings.weight: False
roberta.embeddings.LayerNorm.weight: False
roberta.embeddings.LayerNorm.bias: False
roberta.encoder.layer.0.attention.self.query.lora_A: True
roberta.encoder.layer.0.attention.self.query.lora_B: True
roberta.encoder.layer.0.attention.self.query.alpha: True
roberta.encoder.layer.0.attention.self.query.linear.weight: False
roberta.encoder.layer.0.attention.self.query.linear.bias: False
roberta.encoder.layer.0.attention.self.key.weight: False
roberta.encoder.layer.0.attention.self.key.bias: False
roberta.encoder.layer.0.attention.self.value.lora_A: True
roberta.encoder.layer.0.attention.self.value.lora_B: True
roberta.encoder.layer.0.attention.self.value.alpha: True
roberta.encoder.layer.0.attention.self.value.linear.weight: False
roberta.encoder.layer.0.attention.self.value.linear.bias: False
roberta.encoder.layer.0.attention.output.

In [11]:
from model import *
lora_params, alpha_params = optimize_lora(model)

In [17]:
#print(model.classifier.alpha.detach().cpu().numpy())
model.roberta.encoder.layer[0].attention.self.query.alpha.item()

0.5

In [29]:
total_params = sum(p.numel() for p in model.parameters())
lora_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"The total number of parameters of the BERT model is : {total_params}")
print(f"The number of trainable parameters after applying LoRA : {lora_params}")

The total number of parameters of the BERT model is : 67637010
The number of trainable parameters after applying LoRA : 682000


### 5- Finetuning

In [30]:
loader = {'train': train_loader, 'val': val_loader, 'test': test_loader}

In [31]:
@torch.no_grad
def evaluate_model(model):
    out = {}
    model.eval()

    for split in ['val', 'test']:
        data_loader = loader[split]
        acc = 0
        loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Perform a forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Calculate the loss
            loss += outputs.loss.item()

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            acc += (predictions == labels).sum().item()
        out[split + '_acc'] = acc / len(data_loader)
        out[split + '_loss'] = loss / len(data_loader)
    return out


In [None]:
from tqdm.auto import tqdm
from torch.optim import AdamW

def train(model, params):
    # Set up the optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'])

    # Move the model to the specified device (GPU or CPU)
    model.to(device)
    n = len(train_loader)

    for epoch in range(params['epochs']):
        # Training over this epoch
        model.train()
        train_loss = 0
        train_acc = 0

        # Progress bar for the training phase
        train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{params['epochs']} [Training]")

        for i, batch in enumerate(train_progress_bar):

            if i % params['inter_eval'] == 0:
                # Evaluate the model
                evals = evaluate_model(model)
                print(f'Epoch {epoch} / {params["epochs"]}, Step: {i} / {n} :')
                print(f'Val Accuracy = {evals["val_acc"]}, Val Loss = {evals["val_loss"]},  Test Accuracy = {evals["test_acc"]}, Test Loss = {evals["test_loss"]}')
                model.train()

            # Move batch data to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Zero out the gradients from the previous iteration
            model.zero_grad()

            # Perform a forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Calculate the loss
            loss = outputs.loss
            train_loss += loss.item() / n

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update the model's weights
            optimizer.step()

            # Update the progress bar
            train_progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

            # Update Train accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            train_acc += (predictions == labels).sum().item() / n
        print(f'Finished Epoch {epoch} / {params["epochs"]}: Train Loss = {train_loss}, Train Accuracy = {train_acc}')
        print("-" * 50)


In [None]:
# Parameters
# Do not launch this if you don't have GPU
params = {'learning_rate': 1e-3,
          'epochs': 100,
          'inter_eval': 50}
train(model, params)

## Trying to quantify the correlation between the tasks

In [6]:
# Getting the embeddings from the BERT model
def get_embedding_bert(sentence):
    def mean_pooling(model_output, attention_mask):
        # The model_output is a tuple. The first element contains the token embeddings.
        token_embeddings = model_output[0] # or model_output.last_hidden_state
        
        # Expand the attention mask to match the size of the token embeddings.
        # This is needed to correctly mask the padding tokens.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        
        # Sum the embeddings, but only for the actual tokens (not padding).
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        
        # Sum the attention mask to get the number of actual tokens.
        # We clamp the sum to a minimum of 1e-9 to avoid division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        
        # Divide the sum of embeddings by the number of tokens to get the mean.
        return sum_embeddings / sum_mask
    
    # 1. Tokenize the input sentence.
    #    - `return_tensors='pt'`: Return PyTorch tensors.
    encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    # 2. Feed the tokenized input to the model.
    #    - `torch.no_grad()` is used to disable gradient calculations, which saves memory
    #      and speeds up computation, as we are not training the model.
    with torch.no_grad():
        model_output = model(**encoded_input) # model_output.last_hidden_state.shape = (1, sentence_men, embed_dim)
        
    # 3. Perform mean pooling on the token embeddings.
    #    This will generate a single vector representation for the sentence.
    sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    # The result is a tensor of shape [1, embedding_dim]. We squeeze it to get a 1D tensor.
    return sentence_embedding.squeeze()

In [37]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

topic_1 = 'Sentiment analysis'
topic_2 = 'Safety alignment'

# Tokenizing the sentences
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Get the embeddings
mu_1 = get_embedding_bert(topic_1).detach().numpy()
mu_2 = get_embedding_bert(topic_2).detach().numpy()

# Making the vectors of norm 1
mu_1 = mu_1 / np.linalg.norm(mu_1)
mu_2 = mu_2 / np.linalg.norm(mu_2)


In [38]:
# Getting the alignement of topic_2 with respect to topic_1
beta = np.sum(mu_1 * mu_2)
print(beta)

0.8394884


**Conclusion:** This approach is not performing super well with the distilbert model, as it gives a relatively high alignment score even for non-correlated tasks. Example, try Sentiment analysis with Image classification.

We are trying now another BERT model that is specifically trained to produce *semantically meaningful embeddings*.

In [54]:
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

topic_1 = 'Cars classification'
topic_2 = 'Animals classification'

mu_1 = sentence_model.encode(topic_1)
mu_2 = sentence_model.encode(topic_2)

# Normalizing the vectors
mu_1 = mu_1 / np.linalg.norm(mu_1)
mu_2 = mu_2 / np.linalg.norm(mu_2)

# Getting the alignement of topic_2 with respect to topic_1
beta = np.sum(mu_1 * mu_2)
print(beta)

0.5297174


**Remark:** This model seems to be okey. There is a logic in its results as it preserves the *ordering* of the alignment scores $\beta$: lower scores to less correlated tasks.