## Benchmark

In [None]:
import os

import torch

from llm_benchmarks.generation import generate_samples
from llm_benchmarks.utils import log_metrics_to_csv

# Set WANDB_SILENT before any run starts (presumably to quiet Weights & Biases
# console output -- confirm against the wandb version in use).
os.environ.update(WANDB_SILENT="true")

# Settings shared by every model in the sweep. The "model_name" entry is
# written into this same dict at the start of each loop iteration below.
config = dict(
    quantization_bits="8bit",
    torch_dtype=torch.float16,
    temperature=0.1,
)

# Models to run, in the order they are benchmarked.
model_names = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
]

for model_name in model_names:
    # The config dict is reused (mutated in place), so after the loop it
    # still carries the last model's name.
    config.update(model_name=model_name)
    metrics = generate_samples(model_name, config)
    log_metrics_to_csv(model_name, config, metrics, "./results5")

## Plot

In [4]:
from llm_benchmarks.plotting import plot_model_inference_speed

# Arguments for the GPT-2 quantization comparison chart. Results are read from
# "./results5", the directory the benchmark loop logs its metrics to.
plot_options = {
    "model_name": "gpt2",
    # NOTE(review): presumably restricts plotted rows to this
    # tokens_per_second range -- confirm against plot_model_inference_speed.
    "filters": {"tokens_per_second": (20, 1000)},
    "grouping_columns": ["model_name", "quantization_bits"],
    "colors": {},
    "title": "GPT-2: Tokens per Second vs. Output Tokens",
    "save_path": "./gpt2_compare_quantization_inference.png",
    "results_dir": "./results5",
    "width": 800,
    "height": 400,
}

plot_model_inference_speed(**plot_options)