<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/Sentiment_Classification_Using_Adapters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using adapters for a sentiment classification task. Instead of fine-tuning all of BERT's parameters, I insert small adapter modules into the model, which reduces training time and memory requirements.

Add one adapter module per BERT transformer layer, applied to that layer's output (i.e. after its attention and feed-forward sublayers).

Keep the original model weights frozen.

Fine-tune only the adapter layers for the new task.

In [None]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, BertConfig

# Define the Adapter Module
class Adapter(nn.Module):
    """Bottleneck adapter: project down, apply ReLU, project back up, add the input.

    Args:
        input_dim: Size of the last dimension of the incoming tensor; the
            output has the same size.
        adapter_dim: Width of the bottleneck between the two linear layers.
    """

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # The two linear layers are created in this order so that seeded
        # initialisation produces the same weights as before.
        self.down_projection = nn.Linear(input_dim, adapter_dim)
        self.non_linearity = nn.ReLU()
        self.up_projection = nn.Linear(adapter_dim, input_dim)

    def forward(self, x):
        """Return the bottleneck transformation of ``x`` added back onto ``x``."""
        bottleneck = self.non_linearity(self.down_projection(x))
        # Skip connection: the adapter only learns a correction to its input.
        return self.up_projection(bottleneck) + x


# Define the Adapter-Enhanced BERT Model
class BertWithAdapters(nn.Module):
    """Frozen BERT encoder with one trainable adapter per transformer layer.

    Each adapter is applied to the output of its transformer layer *inside*
    the encoder, so the adapted hidden state is what the next layer consumes.
    Running the adapters on the collected hidden states after the encoder has
    finished would leave every adapter except the last one without any
    influence on the logits (and without gradients), which is why forward
    hooks are used here.

    Only the adapters and the classification head are trainable; all
    pretrained BERT weights have ``requires_grad`` set to ``False``.
    Gradients still flow *through* the frozen layers to reach the adapters of
    earlier layers.

    Args:
        model_name: Name or path passed to ``BertModel.from_pretrained``.
        adapter_dim: Bottleneck width of every adapter.
        num_labels: Number of output classes of the classification head.
    """

    def __init__(self, model_name="bert-base-uncased", adapter_dim=64, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

        # Freeze BERT weights
        for param in self.bert.parameters():
            param.requires_grad = False

        hidden_size = self.bert.config.hidden_size

        # Add adapters to each transformer layer
        self.adapters = nn.ModuleList(
            [Adapter(hidden_size, adapter_dim) for _ in range(self.bert.config.num_hidden_layers)]
        )
        for layer, adapter in zip(self.bert.encoder.layer, self.adapters):
            # Zero the up-projection so every adapter starts as the identity
            # (output == input thanks to the residual connection). The model
            # then starts out exactly as pretrained BERT instead of having
            # random perturbations compounded across all layers.
            nn.init.zeros_(adapter.up_projection.weight)
            nn.init.zeros_(adapter.up_projection.bias)
            layer.register_forward_hook(self._make_adapter_hook(adapter))

        # Classification head
        self.classifier = nn.Linear(hidden_size, num_labels)

    @staticmethod
    def _make_adapter_hook(adapter):
        """Build a forward hook that routes a layer's hidden state through ``adapter``."""

        def hook(_module, _inputs, output):
            # A transformer layer may return either the hidden state alone or
            # a tuple whose first element is the hidden state; any extra
            # elements (e.g. attention weights) are passed through untouched.
            if isinstance(output, tuple):
                return (adapter(output[0]),) + output[1:]
            return adapter(output)

        return hook

    def forward(self, input_ids, attention_mask):
        """Return classification logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Token ids, forwarded to BERT unchanged.
            attention_mask: Attention mask, forwarded to BERT unchanged.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # The hooks have already applied every adapter, so the encoder's last
        # hidden state is the fully adapted representation.
        cls_state = outputs.last_hidden_state[:, 0, :]  # CLS token output
        return self.classifier(cls_state)


# Load Data and Train the Model
# transformers' own AdamW was deprecated in favour of the PyTorch optimizer and
# is no longer shipped by recent transformers releases, so the name is imported
# from torch.optim instead.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01, whereas the
# transformers implementation defaulted to 0.0.
from torch.optim import AdamW
from datasets import load_dataset

# Load the tokenizer and dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("imdb")

def preprocess_function(examples, max_length=128):
    """Tokenize a batch of examples to a fixed sequence length.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Padding to the longest sequence of the current batch (``padding=True``)
    would let different ``map`` batches end up with different lengths, and
    the default DataLoader collation cannot stack rows of unequal length.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the raw texts.
        max_length: Number of tokens every encoded sequence is cut or padded to.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize every split in batches, then expose only the columns the loops read
# as torch tensors.
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Create DataLoaders: only the training split is shuffled; the test split is
# read in its stored order.
train_loader, test_loader = (
    torch.utils.data.DataLoader(encoded_dataset[split], batch_size=16, shuffle=(split == "train"))
    for split in ("train", "test")
)

# Initialize the model and move it to the GPU when one is available
model = BertWithAdapters()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
# Only parameters that still require gradients are handed to the optimizer;
# frozen weights would never be updated and need not be tracked.
trainable_params = [param for param in model.parameters() if param.requires_grad]
# weight_decay is pinned explicitly because AdamW implementations disagree on
# the default (transformers: 0.0, torch.optim: 0.01).
optimizer = AdamW(trainable_params, lr=1e-4, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()

# Training Loop: three passes over the training data, printing the mean
# per-batch loss after each pass.
for epoch in range(3):
    model.train()  # training-mode behaviour for the modules inside the model
    total_loss = 0
    for batch in train_loader:
        # Move the three tensors this step needs onto the model's device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
        )

        optimizer.zero_grad()  # drop gradients left over from the previous step
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# Evaluation: fraction of test examples whose highest-scoring class matches the label.
model.eval()
correct = 0
total = 0
with torch.no_grad():  # scoring only, so no gradient bookkeeping is needed
    for batch in test_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
        )

        # Index of the largest logit per row is the predicted class.
        predicted = model(input_ids, attention_mask).argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Accuracy: {correct/total:.2f}")


Key Features of This Implementation
Adapters:
Each transformer layer has its own adapter to introduce trainable parameters while keeping the core model frozen.
They consist of a down-projection, a non-linearity, and an up-projection with residual connections.

Freezing the Pre-Trained Model:
Only adapters and the classification head are trained, making the approach efficient.

Modularity:
The adapter modules can be reused or extended for different transformer architectures or tasks.

Flexibility:
You can tune only specific layers, use different adapter dimensions, or extend adapters for tasks beyond classification.