In [None]:
import pathlib
from time import perf_counter

import numpy as np
import torch
from datasets import load_metric, load_dataset
from transformers import pipeline

In [None]:
# Hub checkpoint id handed to the text-classification pipeline below.
ckpt = 'transformersbook/bert-base-uncased-finetuned-clinic'
pipe = pipeline(task='text-classification', model=ckpt)

In [None]:
class PerformanceBenchmark:
    """Measure size, latency and accuracy of a text-classification pipeline.

    Args:
        pipeline: Callable invoked as ``pipeline(text)``. The first element of
            its result must be a mapping with a ``'label'`` key, and the object
            must expose ``.model.state_dict()`` for the size measurement.
        dataset: Iterable of examples, each providing ``'text'`` and
            ``'intent'`` entries.
        optim_type: Key under which ``run`` stores the collected metrics.
    """

    def __init__(self, pipeline, dataset, optim_type='BERT baseline'):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type
        # NOTE(review): `load_metric` is reported as deprecated in newer
        # `datasets` releases in favour of the `evaluate` package -- confirm
        # against the version pinned for this notebook.
        self.score = load_metric('accuracy')

    def compute_accuracy(self, mapper=None):
        """Score the pipeline's predictions against ``example['intent']``.

        Args:
            mapper: Optional callable applied to every predicted label before
                scoring (for instance to convert a label name into the class
                id stored in the dataset). When falsy, raw labels are scored.

        Returns:
            The dict produced by the accuracy metric's ``compute`` call; its
            ``'accuracy'`` entry is also printed.
        """
        preds, labels = [], []
        for example in self.dataset:
            pred = self.pipeline(example['text'])[0]['label']
            preds.append(mapper(pred) if mapper else pred)
            labels.append(example['intent'])
        results = self.score.compute(predictions=preds, references=labels)
        print(f'Accuracy on test set: {results["accuracy"]:.3f}')
        return results

    def compute_size(self):
        """Serialise the model's state dict to disk and report its size.

        The state dict is written to ``model.pt`` in the current working
        directory and removed again, even if saving or measuring fails.

        Returns:
            ``{'size_mb': <file size in MiB>}``.
        """
        state = self.pipeline.model.state_dict()
        fp = pathlib.Path('model.pt')
        try:
            torch.save(state, fp)
            mb = fp.stat().st_size / (1024**2)
        finally:
            # Never leave the temporary checkpoint behind, including on error.
            fp.unlink(missing_ok=True)
        print(f'Model size (MB): {mb:.2f}')
        return {'size_mb': mb}

    def time_pipeline(self, query='What is the pin number for my account?',
                      num_runs=100, num_warmup=10):
        """Time repeated pipeline calls on a single query.

        Args:
            query: Text passed to the pipeline on every call.
            num_runs: Number of timed calls; must be at least 1.
            num_warmup: Number of untimed calls issued first so that one-off
                start-up costs do not distort the statistics.

        Returns:
            ``{'avg_ms': mean latency, 'std_ms': standard deviation}``, both in
            milliseconds.

        Raises:
            ValueError: If ``num_runs`` is smaller than 1.
        """
        if num_runs < 1:
            raise ValueError('num_runs must be at least 1')
        for _ in range(num_warmup):
            _ = self.pipeline(query)
        latencies = []
        for _ in range(num_runs):
            start = perf_counter()
            _ = self.pipeline(query)
            latencies.append(perf_counter() - start)
        avg_ms = np.mean(latencies) * 1000
        std_ms = np.std(latencies) * 1000
        print(f'Average latency (ms): {avg_ms:.2f} +/- {std_ms:.2f}')
        return {'avg_ms': avg_ms, 'std_ms': std_ms}

    def run(self, query=None, mapper=None):
        """Run the size, latency and accuracy measurements in that order.

        Args:
            query: Text used for the latency measurement; ``None`` selects the
                default query of ``time_pipeline``.
            mapper: Forwarded to ``compute_accuracy``.

        Returns:
            ``{optim_type: {...}}`` where the inner dict merges the results of
            ``compute_size``, ``time_pipeline`` and ``compute_accuracy``.
        """
        # Start from a plain dict: wrapping the size result in braces would
        # build a set containing a dict, which is unhashable.
        collected = dict(self.compute_size())
        if query is None:
            collected.update(self.time_pipeline())
        else:
            collected.update(self.time_pipeline(query))
        collected.update(self.compute_accuracy(mapper))
        return {self.optim_type: collected}

In [None]:
# Fetch the 'plus' configuration of the clinic_oos dataset, then display one
# test example to show the record layout.
data = load_dataset(path='clinic_oos', name='plus')
data['test'][42]

In [None]:
intents = data['test'].features['intent']


def get_intents(x):
    """Mapper for the benchmark: convert a predicted label name to its class id.

    The benchmark applies the mapper to the pipeline's predicted label and
    compares the result with the integer ``intent`` stored in the dataset, so
    the conversion has to go from name to id (``str2int``), not the reverse.
    Assumes the pipeline emits the same label names as the dataset's
    ``intent`` feature -- verify against the checkpoint's label mapping.
    """
    return intents.str2int(x)

In [None]:
# Benchmark the baseline pipeline on the test split, converting predicted
# labels with the mapper before they are scored.
pb = PerformanceBenchmark(pipeline=pipe, dataset=data['test'])
perf = pb.run(query=None, mapper=get_intents)