<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/fusion_technique_to_combine_the_outputs_of_different_hidden_layers_from_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses a layer-fusion technique to combine the outputs of the different hidden layers of a pre-trained BERT model for a classification task. The example implements a simple attention mechanism that learns one weight per layer and fuses the layer outputs with those weights, potentially **improving the model's ability to integrate information across layers**.

Prerequisites:
- A pre-trained BERT model (loaded with Hugging Face's Transformers library)
- PyTorch for model customization and training

In [None]:
!pip install torch transformers




In [None]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer

class BertWithLayerFusion(nn.Module):
    """BERT classifier that fuses all encoder layers with learned attention.

    Every encoder layer is summarised by the mean of its token vectors, a
    small MLP scores each summary, and a softmax over the layer axis turns
    the scores into per-example layer weights. The [CLS] vectors of all
    layers are then averaged with those weights and passed to a linear
    classifier.

    Args:
        pretrained_model_name: Name or path handed to
            ``BertModel.from_pretrained``.
        num_labels: Number of output classes.
    """

    def __init__(self, pretrained_model_name, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name, output_hidden_states=True)
        self.num_labels = num_labels

        hidden_size = self.bert.config.hidden_size

        # Attention mechanism for layer fusion. It is applied to a
        # (batch, num_layers, hidden) tensor, so Softmax(dim=1) normalises
        # the scores across layers.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.Tanh(),
            nn.Linear(128, 1),
            nn.Softmax(dim=1),
        )

        # Classifier on top of the fused [CLS] representation
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits of shape (batch_size, num_labels).

        Args:
            input_ids: Token ids of shape (batch_size, seq_length).
            attention_mask: Optional (batch_size, seq_length) mask with 1 for
                real tokens and 0 for padding. When given, padding positions
                are excluded from the per-layer summaries.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Tuple of hidden states: the embedding output followed by one entry
        # per encoder layer.
        hidden_states = outputs.hidden_states

        # (batch_size, num_layers, seq_length, hidden_size); the embedding
        # output at index 0 is excluded.
        stacked = torch.stack(hidden_states[1:], dim=1)

        # Summarise each layer by averaging over the sequence dimension.
        if attention_mask is None:
            layer_summary = stacked.mean(dim=2)
        else:
            # Masked mean so that padding tokens do not dilute the summary.
            mask = attention_mask[:, None, :, None].to(stacked.dtype)
            token_count = mask.sum(dim=2).clamp(min=1.0)  # avoid 0-division
            layer_summary = (stacked * mask).sum(dim=2) / token_count

        # (batch_size, num_layers, 1), sums to 1 over the layer axis
        weights = self.attention(layer_summary)

        # Only the [CLS] position feeds the classifier, so fuse just that
        # token: (batch, layers, 1) * (batch, layers, hidden) broadcasts
        # correctly, whereas multiplying the 3-D weights with the full 4-D
        # stack misaligns the batch and layer axes.
        cls_per_layer = stacked[:, :, 0, :]
        fused_cls = torch.sum(weights * cls_per_layer, dim=1)

        logits = self.classifier(fused_cls)

        return logits

# Example usage: load the tokenizer and build the fusion classifier from the
# same pre-trained checkpoint.
model_name = 'bert-base-uncased'
num_labels = 2  # For binary classification
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertWithLayerFusion(
    pretrained_model_name=model_name,
    num_labels=num_labels,
)


In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import torch
from torch import nn

# AdamW is taken from PyTorch; the copy that used to live in `transformers`
# is deprecated and is not importable from newer releases.
from torch.optim import AdamW


# Load the IMDb dataset.
# The train split is ordered by label, so slicing `train[:10%]` directly would
# yield reviews of a single class. Shuffle first, then keep 10% of the rows
# for a quick demonstration.
dataset = load_dataset("imdb", split="train").shuffle(seed=42)
dataset = dataset.select(range(len(dataset) // 10))

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def tokenize_function(examples):
    """Tokenize a batch of reviews, truncating to BERT's 512-token limit.

    Padding is deliberately left to the data collator so that every batch is
    only padded to the length of its own longest example.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)  # Adjust max_length as needed


# Drop the raw "text" column after tokenization: the collator tries to turn
# every remaining column into a tensor, and a string column makes it fail with
# "ValueError: Unable to create tensor ...".
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Use a data collator to dynamically pad batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
# Prepare DataLoader (reshuffled every epoch)
train_dataloader = DataLoader(tokenized_datasets, batch_size=8, shuffle=True, collate_fn=data_collator)



Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

The message indicates that the BertForSequenceClassification model has some weights (specifically in the classifier layer) that were not pre-trained and have been initialized randomly. This is expected when adapting a pre-trained model for a new task (like sequence classification) that it wasn't originally trained for. To resolve this and effectively use the model for predictions or inference on your specific task, you should train (fine-tune) the model on a relevant dataset. This process involves running a training loop where the model learns from your task-specific data, adjusting those newly initialized weights to be effective for your classification task.

To train a model on a downstream task, you typically follow these steps: First, choose a specific task and corresponding dataset, such as sentiment analysis with the IMDb dataset. Next, preprocess the dataset for the model, including tokenization and formatting input data. Then, define a loss function and optimizer, and fine-tune the model by training it on the dataset, adjusting weights based on the loss. Finally, validate the model on a separate test set to evaluate its performance. This process adapts the pre-trained model to your specific task, enhancing its predictions and inference capabilities.

In [None]:
from transformers import get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

# Fine-tunes `model` on `train_dataloader`, both of which must already exist
# in the notebook session. The loop reads `outputs.loss`, so `model` has to
# compute its own loss from the `labels` entry of each batch (as the
# Hugging Face sequence-classification models do).
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate to zero over the whole run, no warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        running_loss += loss.item()

    # Report progress once per epoch; max(..., 1) guards an empty dataloader.
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}")

print("Training completed.")


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

# Tiny in-memory DataLoader for demonstration. It uses the `tokenizer` and
# `model` already defined in the notebook session and yields dict batches with
# `input_ids`, `attention_mask` and `labels`.
demo_texts = [
    "A wonderful, heartfelt film with great performances.",
    "Dull, predictable and far too long.",
    "One of the best movies I have seen this year.",
    "I want those two hours of my life back.",
]
demo_labels = [1, 0, 1, 0]
encoded = tokenizer(demo_texts, padding=True, truncation=True, return_tensors="pt")
demo_examples = [
    {"input_ids": ids, "attention_mask": mask, "labels": torch.tensor(label)}
    for ids, mask, label in zip(encoded["input_ids"], encoded["attention_mask"], demo_labels)
]
# A list of dicts is a valid map-style dataset; the default collate function
# stacks each field into a batch tensor.
train_dataloader = DataLoader(demo_examples, batch_size=2, shuffle=True)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()  # built once instead of on every step

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
model.train()
for batch in train_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()
    # Pass the mask so padding tokens are not attended to.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Works both for modules that return raw logits and for models that
    # return an output object exposing a `.logits` attribute.
    logits = getattr(outputs, "logits", outputs)

    loss = loss_fn(logits, labels)
    loss.backward()
    optimizer.step()

print("Training completed")




TypeError: object of type 'ellipsis' has no len()