## Benchmark

In [None]:
import os

import torch

from llm_benchmarks.generation import generate_samples
from llm_benchmarks.utils import log_metrics_to_csv

# Set WANDB_SILENT before any benchmarking code runs (Weights & Biases reads
# this variable to suppress its console output).
os.environ["WANDB_SILENT"] = "true"

# Generation settings shared by every model in the sweep. The "model_name"
# key is added per model inside the loop below.
config = dict(
    quantization_bits="8bit",
    torch_dtype=torch.float16,
    temperature=0.1,
    min_length=32,
    max_length=1024,
    try_different_lengths=True,
)

# Hugging Face-style model identifiers to benchmark, in run order.
model_names = ["tiiuae/falcon-7b", "tiiuae/falcon-40b"]

for model_name in model_names:
    # The shared config dict is updated in place, so after the loop it still
    # carries the last model's name.
    config.update(model_name=model_name)
    # NOTE(review): the positional 10 is presumably the number of samples to
    # generate per model - confirm against generate_samples' signature.
    metrics = generate_samples(model_name, config, 10)
    # Results are written under ./results3, the same directory the plotting
    # cell reads from.
    log_metrics_to_csv(model_name, config, metrics, "./results3")

## Plot

In [2]:
from llm_benchmarks.plotting import plot_model_inference_speed

# Keyword arguments for the inference-speed plot, collected in one place so
# the individual settings are easy to scan and tweak.
plot_kwargs = {
    "model_name": "falcon",
    # Restricts tokens_per_second to the (0, 100) range; presumably drops
    # outlier rows - confirm against the plotting helper.
    "filters": {"tokens_per_second": (0, 100)},
    "grouping_columns": ["model_name", "quantization_bits"],
    "colors": {},
    # NOTE(review): the title says "Falcon 7B" while the benchmark cell also
    # runs falcon-40b and the plot groups by model_name - confirm the title
    # matches the data actually present in ./results3.
    "title": "Falcon 7B: Tokens per Second vs. Output Tokens",
    "save_path": "./falcon_compare_quantization_inference.png",
    "results_dir": "./results3",
    "width": 800,
    "height": 400,
}

plot_model_inference_speed(**plot_kwargs)