## Benchmark

In [None]:
import os

import torch

from llm_benchmarks.generation import generate_samples
from llm_benchmarks.utils import log_metrics_to_csv

# Set before any benchmark helper runs. WANDB_SILENT is the Weights & Biases
# switch for suppressing console output — presumably the helpers use wandb;
# confirm against llm_benchmarks.
os.environ["WANDB_SILENT"] = "true"

# One settings mapping shared by every run below; it is passed to both
# generate_samples and log_metrics_to_csv. How each key is interpreted is
# decided inside llm_benchmarks, not here.
config = dict(
    load_in_8bit=True,
    torch_dtype=torch.float16,
    temperature=0.1,
    min_length=50,
    max_length=700,
    try_different_lengths=True,
)

# Dolly v2 checkpoints to benchmark, in the order listed.
model_names = [
    "databricks/dolly-v2-3b",
    "databricks/dolly-v2-7b",
    "databricks/dolly-v2-12b",
]

for model_name in model_names:
    # The shared mapping is updated in place, so after the loop it still
    # carries the name of the last model that ran.
    config.update(model_name=model_name)
    # Third positional argument is presumably the number of samples per
    # model — TODO confirm against generate_samples.
    metrics = generate_samples(model_name, config, 20)
    log_metrics_to_csv(model_name, config, metrics, "results")

## Plot

In [3]:
from llm_benchmarks.plotting import plot_model_inference_speed

# Plot the collected Dolly results. All arguments are passed by keyword, so
# their order here is purely for readability.
plot_model_inference_speed(
    # Which results to read and how to select/group them.
    # NOTE(review): the benchmark cell passes "results" to log_metrics_to_csv,
    # while this reads from "./results3" — confirm this is the intended
    # directory.
    results_dir="./results3",
    model_name="dolly",
    filters={},
    grouping_columns=["model_name", "quantization_bits"],
    # Presentation and output file.
    colors={},
    title="Dolly-2: Tokens per Second vs. Output Tokens",
    save_path="./dolly2_compare_size_and_quant_inference.png",
)