In [1]:
!pip install transformers datasets
!pip install xformers
!pip install evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

Inference Profiling for Original Model
======================================

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import time
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer

class MRPCDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        """
        Custom Dataset to wrap the MRPC dataset for DataLoader.
        :param dataset: The MRPC dataset (from `load_dataset`)
        :param tokenizer: The tokenizer used for text preprocessing
        """
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentence1 = self.dataset[idx]['sentence1']
        sentence2 = self.dataset[idx]['sentence2']
        inputs = self.tokenizer(sentence1, sentence2, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        # Flatten the tensors to remove extra dimensions
        return {key: val.squeeze(0) for key, val in inputs.items()}


def batch_inference(dataloader, model, device):
    inference_times = []
    all_predictions = []

    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():
        for batch in dataloader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            torch.cuda.synchronize()
            start_time = time.time()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )

            # No Flash Attention applied, use original hidden states
            logits = model.classifier(outputs.hidden_states[-1].mean(dim=1).type(torch.float32))  # Use the mean of last hidden state
            probabilities = F.softmax(logits, dim=-1)

            torch.cuda.synchronize()
            end_time = time.time()
            batch_inference_time = end_time - start_time
            inference_times.append(batch_inference_time)

            all_predictions.extend(probabilities.cpu().numpy())

    performance_metrics = {
        'total_samples': len(dataloader.dataset),
        'batch_size': dataloader.batch_size,
        'device': str(device),
        'inference_times': inference_times,
        'avg_batch_latency_ms': np.mean(inference_times) * 1000,
        'std_batch_latency_ms': np.std(inference_times) * 1000,
        'avg_sample_latency_ms': (np.mean(inference_times) * 1000) / dataloader.batch_size
    }

    return all_predictions, performance_metrics



In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import time
from datasets import load_dataset
import numpy as np
import evaluate

# Load your model and tokenizer
model_name = "huawei-noah/TinyBERT_General_4L_312D"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to('cuda')

# Configuration
batch_sizes = [32, 64, 128, 512, 1024, 2048]
device = 'cuda'  # Change to 'cpu' if necessary

# Load and preprocess the MRPC dataset
task = "mrpc"
dataset = load_dataset("glue", task)


for batch_size in batch_sizes:
    print(f"\n--- Batch Size: {batch_size} ---")

    # Create DataLoader
    test_dataset = MRPCDataset(dataset['test'], tokenizer)
    dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Perform batch inference on test set
    all_predictions, performance_metrics = batch_inference(dataloader, model, device)

    # Print performance metrics
    print("\n--- Batch Inference Performance Metrics ---")
    for metric, value in performance_metrics.items():
        print(f"{metric}: {value}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Batch Size: 32 ---

--- Batch Inference Performance Metrics ---
total_samples: 1725
batch_size: 32
device: cuda
inference_times: [0.03616046905517578, 0.03551340103149414, 0.03551030158996582, 0.03546142578125, 0.031534671783447266, 0.029829025268554688, 0.029750585556030273, 0.02977156639099121, 0.029694795608520508, 0.0296630859375, 0.02968120574951172, 0.029727935791015625, 0.029737472534179688, 0.029682397842407227, 0.029760122299194336, 0.029741287231445312, 0.029847383499145508, 0.029850006103515625, 0.029752254486083984, 0.029748201370239258, 0.02969956398010254, 0.02971673011779785, 0.02974843978881836, 0.029688119888305664, 0.029742002487182617, 0.0297696590423584, 0.029810667037963867, 0.02974867820739746, 0.02971673011779785, 0.02971363067626953, 0.029715299606323242, 0.029752492904663086, 0.02972698211669922, 0.029729366302490234, 0.029745101928710938, 0.029776573181152344, 0.02969813346862793, 0.029720306396484375, 0.029687166213989258, 0.02976369857788086, 0.02977490

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.00 GiB. GPU 0 has a total capacity of 39.56 GiB of which 5.11 GiB is free. Process 11517 has 34.44 GiB memory in use. Of the allocated memory 24.11 GiB is allocated by PyTorch, and 9.83 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [9]:
torch.cuda.empty_cache()

In [15]:
import torch
import time
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import numpy as np
import evaluate
from torch.profiler import profile, record_function, ProfilerActivity

# Define MRPCDataset class and batch_inference function as before

def batch_inference_pytorch(dataloader, model, device):
    inference_times = []
    all_predictions = []

    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():
        # Start profiling
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler("./profiler_logs"),
                     record_shapes=True, with_stack=True) as prof:
            with record_function("model_inference"):
                for batch in dataloader:
                    # Move batch to device
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)

                    torch.cuda.synchronize()
                    start_time = time.time()

                    # Inference without Flash Attention
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask
                    )

                    logits = outputs.logits
                    probabilities = torch.nn.functional.softmax(logits, dim=-1)

                    torch.cuda.synchronize()
                    end_time = time.time()
                    batch_inference_time = end_time - start_time
                    inference_times.append(batch_inference_time)

                    all_predictions.extend(probabilities.cpu().numpy())

    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

    performance_metrics = {
        'total_samples': len(dataloader.dataset),
        'batch_size': dataloader.batch_size,
        'device': str(device),
        'inference_times': inference_times,
        'avg_batch_latency_ms': np.mean(inference_times) * 1000,
        'std_batch_latency_ms': np.std(inference_times) * 1000,
        'avg_sample_latency_ms': (np.mean(inference_times) * 1000) / dataloader.batch_size
    }

    return all_predictions, performance_metrics





print(f"\n--- Batch Size: 128 ---")

# Create DataLoader
test_dataset = MRPCDataset(dataset['test'], tokenizer)
dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Perform batch inference on test set
all_predictions, performance_metrics = batch_inference_pytorch(dataloader, model, device)

# Print performance metrics
print("\n--- Batch Inference Performance Metrics ---")



--- Batch Size: 128 ---
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        50.00%        1.670s        52.02%        1.737s     115.803ms       0.000us         0.00%       0.000us       0.000us            15  
                                  cudaDeviceSynchronize        43.44%        1.450s        43.44%        1.450s      50.017ms       0.000us         0.00%       0.000us       0.000us 

Original Model's Accuracy
=========================

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [49]:
def calculate_accuracy(dataloader, model, device):
    """
    Calculate accuracy of the model on a dataset provided by the DataLoader.
    :param dataloader: DataLoader providing batches of (inputs, labels)
    :param model: Pretrained model for evaluation
    :param device: Device ('cpu' or 'cuda')
    :return: Accuracy as a float
    """
    model.eval()  # Set model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            # Extract inputs and labels from the batch
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

            # Collect predictions and labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = np.mean(np.array(all_predictions) == np.array(all_labels))
    return accuracy


In [53]:
# Tokenize dataset
def preprocess_data(example):
    return tokenizer(
        example['sentence1'], example['sentence2'],
        padding='max_length', truncation=True, max_length=512
    )
tokenized_dataset = dataset['validation'].map(preprocess_data, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=128)

In [54]:
# Calculate accuracy
validation_accuracy = calculate_accuracy(dataloader, model, device)
print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")

Validation Accuracy: 63.73%
