In [8]:
import torch
from torch import nn
import torch.nn.utils.prune as prune
import pandas as pd

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from evaluate import evaluator
import evaluate

In [2]:
clf_metrics = evaluate.combine(["accuracy",
                                "f1",
                                "precision",
                                "recall"])

In [3]:
cuda_pipe = pipeline("text-classification",
                model="lvwerra/distilbert-imdb",
                device='cuda')
cpu_pipe = pipeline("text-classification",
                model="lvwerra/distilbert-imdb",
                device='cpu')

In [4]:
data = load_dataset("imdb",
                    split="test").shuffle().select(range(1000))

In [5]:
task_evaluator = evaluator("text-classification")

In [6]:
# NVIDIA GeForce RTX 3050 Ti Laptop GPU
cuda_results = task_evaluator.compute(model_or_pipeline=cuda_pipe,
                                 data=data,
                                 metric=clf_metrics,
                                 label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
print('CUDA RESULTS')
print(cuda_results)

CUDA RESULTS
{'accuracy': 0.926, 'f1': 0.9274509803921569, 'precision': 0.9347826086956522, 'recall': 0.9202334630350194, 'total_time_in_seconds': 11.668627500010189, 'samples_per_second': 85.6998820126126, 'latency_in_seconds': 0.011668627500010189}


In [8]:
# Intel(R) Core(TM) i7-12700H 2.70 GHz
cpu_results = task_evaluator.compute(model_or_pipeline=cpu_pipe,
                                 data=data,
                                 metric=clf_metrics,
                                 label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
print('CPU RESULTS')
print(cpu_results)

CPU RESULTS
{'accuracy': 0.931, 'f1': 0.9296636085626911, 'precision': 0.9382716049382716, 'recall': 0.9212121212121213, 'total_time_in_seconds': 135.31697280000026, 'samples_per_second': 7.390055950172705, 'latency_in_seconds': 0.13531697280000027}


The time performances can give useful indication on model speed for inference but should be taken with a grain of salt: they include all the processing that goes on in the pipeline. This may include tokenizing, post-processing, that may be different depending on the model. Furthermore, it depends a lot on the hardware you are running the evaluation on and you may be able to improve the performance by optimizing things like the batch size.

In [7]:
tokenizer = AutoTokenizer.from_pretrained("lvwerra/distilbert-imdb")
model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb").eval()

# Quantization

In [7]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 255.417MB


In [8]:
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

In [9]:
param_size = 0
for param in quantized_model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in quantized_model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('quantized_model size: {:.3f}MB'.format(size_all_mb))

quantized_model size: 91.000MB


In [10]:
pipe_q_cuda = pipeline("text-classification",
                  tokenizer=tokenizer,
                  model=quantized_model,
                  device='cpu')

In [11]:
next(quantized_model.parameters()).is_cuda # returns a boolean


False

In [53]:
# 3050
results = task_evaluator.compute(model_or_pipeline=pipe_q_cuda,
                                 data=data,
                                 metric=clf_metrics,
                                 label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)

In [None]:
print(results)

{'accuracy': 0.923, 'f1': 0.9258902791145331, 'precision': 0.8990654205607477, 'recall': 0.9543650793650794, 'total_time_in_seconds': 289.479546068, 'samples_per_second': 3.4544755012331536, 'latency_in_seconds': 0.28947954606799997}


In [None]:
all_res = {}
all_res['quant_gpu'] = {'accuracy': 0.923, 'f1': 0.9258902791145331, 'precision': 0.8990654205607477, 'recall': 0.9543650793650794, 'total_time_in_seconds': 289.479546068, 'samples_per_second': 3.4544755012331536, 'latency_in_seconds': 0.28947954606799997}

In [None]:
all_res['gpu'] = {'accuracy': 0.937, 'f1': 0.9379310344827586, 'precision': 0.9444444444444444, 'recall': 0.9315068493150684, 'total_time_in_seconds': 310.984303651, 'samples_per_second': 3.2155963766011904, 'latency_in_seconds': 0.310984303651}

In [None]:
all_res['cpu'] = {'accuracy': 0.929, 'f1': 0.9329556185080263, 'precision': 0.9481765834932822, 'recall': 0.9182156133828996, 'total_time_in_seconds': 526.6745037599999, 'samples_per_second': 1.8987059234135428, 'latency_in_seconds': 0.5266745037599999}

In [None]:

res_df = pd. DataFrame(data=all_res)
res_df

Unnamed: 0,quant_gpu,gpu,cpu
accuracy,0.923,0.937,0.929
f1,0.92589,0.937931,0.932956
precision,0.899065,0.944444,0.948177
recall,0.954365,0.931507,0.918216
total_time_in_seconds,289.479546,310.984304,526.674504
samples_per_second,3.454476,3.215596,1.898706
latency_in_seconds,0.28948,0.310984,0.526675


In [None]:
# CPU google colab
results = task_evaluator.compute(model_or_pipeline=pipe_q,
                                 data=data,
                                 metric=clf_metrics,
                                 label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)

In [None]:
all_res['quant_cpu'] = results
res_df = pd.DataFrame(data=all_res)
res_df

Unnamed: 0,quant_gpu,gpu,cpu,quant_cpu
accuracy,0.923,0.937,0.929,0.923
f1,0.92589,0.937931,0.932956,0.922613
precision,0.899065,0.944444,0.948177,0.907115
recall,0.954365,0.931507,0.918216,0.93865
total_time_in_seconds,289.479546,310.984304,526.674504,410.340658
samples_per_second,3.454476,3.215596,1.898706,2.437
latency_in_seconds,0.28948,0.310984,0.526675,0.410341


# Pruning

In [9]:
pruned_model = model

In [10]:
parameters_to_prune = tuple(((module, 'weight') for module in pruned_model.modules() if isinstance(module, nn.Linear)))

In [11]:
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,
)

In [12]:
global_n_zero_el = 0
global_n_el = 0
for module, _ in parameters_to_prune:
    module_name = module._get_name()
    
    print(
        f"Sparsity in {module._get_name()}: " + "{:.2f}%".format(
            100. * float(torch.sum(module.weight == 0))
            / float(module.weight.nelement())
        )
    )
    global_n_zero_el += float(torch.sum(module.weight == 0))
    global_n_el += float(module.weight.nelement())
    prune.remove(module, 'weight')

print(
    "Global sparsity: {:.2f}%".format(
        100. * global_n_zero_el
        / global_n_el
    )
)

Sparsity in Linear: 19.51%
Sparsity in Linear: 20.09%
Sparsity in Linear: 24.09%
Sparsity in Linear: 24.76%
Sparsity in Linear: 20.04%
Sparsity in Linear: 20.28%
Sparsity in Linear: 15.74%
Sparsity in Linear: 16.12%
Sparsity in Linear: 23.41%
Sparsity in Linear: 24.56%
Sparsity in Linear: 18.90%
Sparsity in Linear: 20.34%
Sparsity in Linear: 17.44%
Sparsity in Linear: 17.22%
Sparsity in Linear: 19.85%
Sparsity in Linear: 21.90%
Sparsity in Linear: 18.70%
Sparsity in Linear: 20.26%
Sparsity in Linear: 16.84%
Sparsity in Linear: 16.63%
Sparsity in Linear: 18.33%
Sparsity in Linear: 19.79%
Sparsity in Linear: 19.41%
Sparsity in Linear: 21.29%
Sparsity in Linear: 16.67%
Sparsity in Linear: 16.74%
Sparsity in Linear: 17.62%
Sparsity in Linear: 18.58%
Sparsity in Linear: 19.54%
Sparsity in Linear: 20.40%
Sparsity in Linear: 16.73%
Sparsity in Linear: 16.81%
Sparsity in Linear: 17.50%
Sparsity in Linear: 18.08%
Sparsity in Linear: 20.47%
Sparsity in Linear: 21.94%
Sparsity in Linear: 38.64%
S

In [13]:
pipe_pruned_cuda = pipeline("text-classification",
                  tokenizer=tokenizer,
                  model=pruned_model,
                  device='cuda')

In [14]:
results = task_evaluator.compute(model_or_pipeline=pipe_pruned_cuda,
                                 data=data,
                                 metric=clf_metrics,
                                 label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)

In [15]:
results

{'accuracy': 0.928,
 'f1': 0.9287128712871286,
 'precision': 0.9455645161290323,
 'recall': 0.9124513618677043,
 'total_time_in_seconds': 11.126438500010408,
 'samples_per_second': 89.87601917712163,
 'latency_in_seconds': 0.011126438500010409}

In [16]:
pipe_pruned_cpu = pipeline("text-classification",
                  tokenizer=tokenizer,
                  model=pruned_model,
                  device='cpu')
results = task_evaluator.compute(model_or_pipeline=pipe_pruned_cpu,
                                 data=data,
                                 metric=clf_metrics,
                                 label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)

In [17]:
results

{'accuracy': 0.928,
 'f1': 0.9287128712871286,
 'precision': 0.9455645161290323,
 'recall': 0.9124513618677043,
 'total_time_in_seconds': 109.76259260001825,
 'samples_per_second': 9.110571974589398,
 'latency_in_seconds': 0.10976259260001826}