# Effect of extra self-attention, feed-forward, and embedding layers on fine-tuned BERT (SST-2, SQuAD, CoNLL-2003)

In [12]:
!pip install transformers datasets torch matplotlib evaluate



In [13]:
from transformers import BertTokenizer
from datasets import load_dataset
import evaluate

# Datasets, one per downstream task. SST-2 is requested as the "sst2"
# configuration of "glue"; SQuAD and CoNLL-2003 are requested by name.
dataset_sst2 = load_dataset(path="glue", name="sst2")
dataset_squad = load_dataset(path="squad")
dataset_conll = load_dataset(path="conll2003")

# Accuracy scorer, kept in the module-level name `metric`
metric = evaluate.load(path="accuracy")

# Tokenizer for the "bert-base-uncased" checkpoint, shared by all tasks
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")




In [14]:
# SST-2: single sentences encoded to a fixed length
def tokenize_sst2(examples):
    """Encode a batch of SST-2 examples with the shared tokenizer.

    Reads ``examples["sentence"]`` and returns the tokenizer output, padded
    and truncated to 128 tokens.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["sentence"], **encoding_options)

# Apply the encoder batch-wise, then pick out the two splits used later
tokenized_sst2 = dataset_sst2.map(tokenize_sst2, batched=True)
train_sst2, eval_sst2 = (tokenized_sst2[split] for split in ("train", "validation"))


In [15]:
# Tokenization for SQuAD
def tokenize_squad(examples):
    """Encode a batch of (question, context) pairs as fixed-length inputs.

    Reads ``examples["question"]`` and ``examples["context"]`` and returns the
    tokenizer output padded to 384 tokens.

    NOTE(review): only encoder inputs are produced here. No answer-span
    labels are derived, so the mapped dataset has no targets for a
    question-answering head -- confirm how labels are meant to be supplied.
    """
    return tokenizer(
        examples["question"],
        examples["context"],
        # Only the context (second sequence) may be shortened, so an
        # over-long pair never loses part of the question.
        truncation="only_second",
        padding="max_length",
        max_length=384,
    )

tokenized_squad = dataset_squad.map(tokenize_squad, batched=True)
train_squad = tokenized_squad["train"]
eval_squad = tokenized_squad["validation"]


In [16]:
# Tokenization for CoNLL-2003
def tokenize_conll(examples):
    """Encode a batch of pre-split CoNLL-2003 sentences as fixed-length inputs.

    Reads ``examples["tokens"]`` (lists of words) and returns the tokenizer
    output, padded and truncated to 128 tokens.

    NOTE(review): the per-word tags of the dataset are not aligned to the
    word pieces produced here and no label column is created -- confirm how
    training targets are meant to be supplied for this task.
    """
    return tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        # Fixed-length padding, like the other two tasks: examples of
        # different lengths cannot be stacked into a batch without a
        # padding collator.
        padding="max_length",
        max_length=128,
    )

tokenized_conll = dataset_conll.map(tokenize_conll, batched=True)
train_conll = tokenized_conll["train"]
eval_conll = tokenized_conll["validation"]


In [17]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertForQuestionAnswering, Trainer, TrainingArguments

# Define the modified model class to allow adding different types of layers
class ModifiedBertModel(nn.Module):
    """Pre-trained BERT classifier with optional extra layers before the head.

    The encoder (``original_model.bert``) and the classification head
    (``original_model.classifier``) of the wrapped model are reused. Between
    them the encoder output can pass through three optional stacks, applied
    in this order: extra embeddings, extra self-attention layers, extra
    linear layers. The representation at sequence position 0 then goes
    through dropout and the classifier.

    Args:
        original_model: model exposing ``.bert`` and ``.classifier``. If it
            has a ``.config``, ``hidden_size``, ``num_attention_heads`` and
            ``vocab_size`` are read from it; otherwise 768, 12 and 30522 are
            used.
        additional_attn_layers: number of ``nn.TransformerEncoderLayer``
            modules to add.
        additional_ff_layers: number of ``nn.Linear`` layers to add.
        additional_embed_layers: number of ``nn.Embedding`` tables to add.

    Raises:
        AttributeError: if ``original_model`` has no ``classifier`` attribute.
    """

    def __init__(self, original_model, additional_attn_layers=0, additional_ff_layers=0, additional_embed_layers=0):
        super().__init__()
        if not hasattr(original_model, "classifier"):
            raise AttributeError(
                f"{type(original_model).__name__} has no 'classifier' head; "
                "ModifiedBertModel only supports models that expose one"
            )

        self.bert = original_model.bert  # Use the pre-trained BERT model
        self.dropout = nn.Dropout(0.1)
        self.classifier = original_model.classifier

        # Take the dimensions from the wrapped model when it carries a
        # config; fall back to the bert-base-uncased values otherwise.
        config = getattr(original_model, "config", None)
        hidden_size = getattr(config, "hidden_size", 768)
        num_heads = getattr(config, "num_attention_heads", 12)
        vocab_size = getattr(config, "vocab_size", 30522)

        # Add additional self-attention layers.
        # batch_first=True because the encoder output is indexed as
        # [batch, position, ...] in forward() (see the `[:, 0]` pooling);
        # the PyTorch default layout puts the sequence dimension first.
        self.extra_attn_layers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads, batch_first=True)
                for _ in range(additional_attn_layers)
            ]
        )

        # Add additional feed-forward layers.
        # NOTE(review): these are plain linear layers with no activation in
        # between, so a stack of them is still a single linear map --
        # confirm whether a non-linearity was intended.
        self.extra_ff_layers = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(additional_ff_layers)]
        )

        # Add additional embedding layers
        self.extra_embed_layers = nn.ModuleList(
            [nn.Embedding(vocab_size, hidden_size) for _ in range(additional_embed_layers)]
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Run the encoder, the extra layers and the classification head.

        Args:
            input_ids: token ids, passed to the encoder and to the extra
                embedding tables.
            attention_mask: optional mask passed to the encoder; positions
                equal to 0 are also ignored as keys by the extra attention
                layers.
            token_type_ids: optional segment ids passed to the encoder.
            labels: optional class indices; when given, a cross-entropy loss
                is computed.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]

        # Extra embeddings are added to the encoder output rather than
        # replacing it, so the pre-trained representation is kept.
        for layer in self.extra_embed_layers:
            sequence_output = sequence_output + layer(input_ids)

        # Pass through additional self-attention layers, ignoring padding.
        # src_key_padding_mask is True where a key must be ignored.
        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = attention_mask == 0
        for layer in self.extra_attn_layers:
            sequence_output = layer(sequence_output, src_key_padding_mask=key_padding_mask)

        # Pass through additional feed-forward layers
        for layer in self.extra_ff_layers:
            sequence_output = layer(sequence_output)

        pooled_output = sequence_output[:, 0]  # Taking the [CLS] token's representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else logits


In [18]:
from transformers import Trainer, TrainingArguments
import os

# Function to train the model and use checkpointing
def train_with_checkpointing(task_name, model, train_dataset, eval_dataset, output_dir, num_epochs=2, additional_layers=0):
    """Train ``model`` with the Hugging Face ``Trainer`` and return its accuracy.

    If ``output_dir`` already contains a checkpoint folder from an earlier
    run, training resumes from the most recent one; otherwise it starts from
    scratch.

    Args:
        task_name: used to name the logging directory (``./logs_<task_name>``).
        model: model handed to the ``Trainer``.
        train_dataset: dataset used for training.
        eval_dataset: dataset used for periodic and final evaluation.
        output_dir: directory where checkpoints are written and looked up.
        num_epochs: number of training epochs.
        additional_layers: not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        The ``eval_accuracy`` entry of the final evaluation result.
    """
    # Local import so the block does not depend on the notebook's import cell.
    from transformers.trainer_utils import get_last_checkpoint

    # Resume only from an actual checkpoint folder inside output_dir. The
    # directory itself is not a checkpoint, and it may exist without
    # containing one (for example after an interrupted first run).
    last_checkpoint = None
    if os.path.isdir(output_dir):
        last_checkpoint = get_last_checkpoint(output_dir)
    if last_checkpoint is not None:
        print(f"Resuming from checkpoint: {last_checkpoint}")

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",  # Make eval strategy match save strategy
        save_strategy="steps",  # Set save strategy to steps
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        save_steps=500,  # Save the model every 500 steps
        save_total_limit=3,  # Keep only the last 3 checkpoints
        logging_dir=f"./logs_{task_name}",
        logging_steps=100,
        load_best_model_at_end=True,  # Load the best model after training
    )

    def compute_accuracy(eval_prediction):
        """Score arg-max class predictions with the module-level ``metric``."""
        predicted_classes = eval_prediction.predictions.argmax(-1)
        return metric.compute(predictions=predicted_classes, references=eval_prediction.label_ids)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_accuracy,
    )

    # The checkpoint to resume from is given here, to train() itself.
    trainer.train(resume_from_checkpoint=last_checkpoint)
    eval_result = trainer.evaluate()
    return eval_result['eval_accuracy']


In [19]:
def train_sst2_with_layers(num_layers, layer_type="attention"):
    """Fine-tune BERT with extra layers on SST-2 and return the accuracy.

    Args:
        num_layers: how many extra layers to add.
        layer_type: which kind of extra layer to add -- ``"attention"``
            (default), ``"ff"`` or ``"embedding"``.

    Returns:
        The value returned by ``train_with_checkpointing``.

    Raises:
        ValueError: if ``layer_type`` is not one of the supported kinds.
    """
    layer_kwargs = {
        "attention": "additional_attn_layers",
        "ff": "additional_ff_layers",
        "embedding": "additional_embed_layers",
    }
    if layer_type not in layer_kwargs:
        raise ValueError(f"layer_type must be one of {sorted(layer_kwargs)}, got {layer_type!r}")

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    modified_model = ModifiedBertModel(model, **{layer_kwargs[layer_type]: num_layers})

    # Attention runs keep the original directory name. Other kinds get their
    # own directory so they never resume from an attention checkpoint.
    if layer_type == "attention":
        output_dir = f"./sst2_checkpoints_{num_layers}_layers"
    else:
        output_dir = f"./sst2_{layer_type}_checkpoints_{num_layers}_layers"
    return train_with_checkpointing("sst2", modified_model, train_sst2, eval_sst2, output_dir)


In [20]:
def train_squad_with_layers(num_layers, layer_type="attention"):
    """Fine-tune BERT with extra layers on SQuAD and return the accuracy.

    NOTE(review): ``ModifiedBertModel`` reads ``original_model.classifier``.
    As far as I can tell ``BertForQuestionAnswering`` exposes its head under
    a different attribute, so building the wrapper is expected to fail with
    ``AttributeError`` -- confirm, and decide how QA should be supported.

    Args:
        num_layers: how many extra layers to add.
        layer_type: which kind of extra layer to add -- ``"attention"``
            (default), ``"ff"`` or ``"embedding"``.

    Returns:
        The value returned by ``train_with_checkpointing``.

    Raises:
        ValueError: if ``layer_type`` is not one of the supported kinds.
    """
    layer_kwargs = {
        "attention": "additional_attn_layers",
        "ff": "additional_ff_layers",
        "embedding": "additional_embed_layers",
    }
    if layer_type not in layer_kwargs:
        raise ValueError(f"layer_type must be one of {sorted(layer_kwargs)}, got {layer_type!r}")

    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
    modified_model = ModifiedBertModel(model, **{layer_kwargs[layer_type]: num_layers})

    # Attention runs keep the original directory name. Other kinds get their
    # own directory so they never resume from an attention checkpoint.
    if layer_type == "attention":
        output_dir = f"./squad_checkpoints_{num_layers}_layers"
    else:
        output_dir = f"./squad_{layer_type}_checkpoints_{num_layers}_layers"
    return train_with_checkpointing("squad", modified_model, train_squad, eval_squad, output_dir)


In [21]:
def train_conll_with_layers(num_layers, layer_type="attention"):
    """Fine-tune BERT with extra layers on CoNLL-2003 and return the accuracy.

    NOTE(review): the model built here is a sequence classifier with 9
    labels, and ``ModifiedBertModel`` classifies only position 0 of each
    sequence. That gives one prediction per sentence, not one per token --
    confirm that this is the intended setup for a tagging dataset.

    Args:
        num_layers: how many extra layers to add.
        layer_type: which kind of extra layer to add -- ``"attention"``
            (default), ``"ff"`` or ``"embedding"``.

    Returns:
        The value returned by ``train_with_checkpointing``.

    Raises:
        ValueError: if ``layer_type`` is not one of the supported kinds.
    """
    layer_kwargs = {
        "attention": "additional_attn_layers",
        "ff": "additional_ff_layers",
        "embedding": "additional_embed_layers",
    }
    if layer_type not in layer_kwargs:
        raise ValueError(f"layer_type must be one of {sorted(layer_kwargs)}, got {layer_type!r}")

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=9)
    modified_model = ModifiedBertModel(model, **{layer_kwargs[layer_type]: num_layers})

    # Attention runs keep the original directory name. Other kinds get their
    # own directory so they never resume from an attention checkpoint.
    if layer_type == "attention":
        output_dir = f"./conll_checkpoints_{num_layers}_layers"
    else:
        output_dir = f"./conll_{layer_type}_checkpoints_{num_layers}_layers"
    return train_with_checkpointing("conll", modified_model, train_conll, eval_conll, output_dir)


In [None]:
# Keyword argument of ModifiedBertModel that controls each kind of extra layer
LAYER_KWARGS = {
    'attention': 'additional_attn_layers',
    'ff': 'additional_ff_layers',
    'embedding': 'additional_embed_layers',
}

# For each task: a factory for a fresh pre-trained model, plus its train/eval splits
TASKS = {
    'sst2': (
        lambda: BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2),
        train_sst2,
        eval_sst2,
    ),
    'squad': (
        lambda: BertForQuestionAnswering.from_pretrained("bert-base-uncased"),
        train_squad,
        eval_squad,
    ),
    'conll': (
        lambda: BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=9),
        train_conll,
        eval_conll,
    ),
}

# Initialize dictionaries to store accuracies for each task and layer type
results = {task: {layer_type: [] for layer_type in LAYER_KWARGS} for task in TASKS}

# Iterate over number of layers (0 to 3 layers)
for i in range(4):
    for task, (build_model, train_split, eval_split) in TASKS.items():
        baseline_accuracy = None
        for layer_type, layer_kwarg in LAYER_KWARGS.items():
            # With zero extra layers every layer type is the same model, so
            # the baseline is trained once per task and reused.
            if i == 0 and baseline_accuracy is not None:
                results[task][layer_type].append(baseline_accuracy)
                continue

            # Build the variant that really adds `i` layers of this type.
            modified_model = ModifiedBertModel(build_model(), **{layer_kwarg: i})

            # One checkpoint directory per (task, layer type, depth), so a
            # run never resumes from a differently shaped model.
            if i == 0:
                output_dir = f"./{task}_checkpoints_baseline"
            else:
                output_dir = f"./{task}_checkpoints_{layer_type}_{i}_layers"

            accuracy = train_with_checkpointing(task, modified_model, train_split, eval_split, output_dir)
            if i == 0:
                baseline_accuracy = accuracy
            results[task][layer_type].append(accuracy)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


In [None]:
import matplotlib.pyplot as plt

x = range(4)  # Number of layers (0 to 3)

# One figure with a 3x3 grid: a row per task, a column per layer kind
plt.figure(figsize=(20, 15))

# (key in `results`, label used in the panel title), in panel order
panel_rows = [('sst2', 'SST-2'), ('squad', 'SQuAD'), ('conll', 'CoNLL-2003')]
panel_cols = [('attention', 'Self-Attention'), ('ff', 'Feed-Forward'), ('embedding', 'Embedding')]

panel_index = 0
for task_key, task_title in panel_rows:
    for layer_key, layer_title in panel_cols:
        panel_index += 1  # subplot positions are 1-based, filled row by row
        plt.subplot(3, 3, panel_index)
        plt.plot(x, results[task_key][layer_key], marker='o')
        plt.title(f'{task_title}: {layer_title} Layers')
        plt.xlabel('Number of Layers')
        plt.ylabel('Accuracy')

plt.tight_layout()
plt.show()
