## Benchmark

In [None]:
import os

import torch

from llm_benchmarks.generation import generate_samples
from llm_benchmarks.utils import log_metrics_to_csv

# Set WANDB_SILENT before any benchmark code runs.
# NOTE(review): presumably this suppresses Weights & Biases console output - confirm.
os.environ.update(WANDB_SILENT="true")

# Generation settings shared by every benchmarked model. The benchmark loop
# adds a "model_name" entry to this same dict on each iteration.
config = dict(
    quantization_bits="4bit",
    torch_dtype=torch.float16,
    temperature=0.1,
)

# Checkpoints to benchmark. Uncomment an entry to include that model in the run.
model_names = [
    "decapoda-research/llama-7b-hf",
    # "decapoda-research/llama-13b-hf",
    # "decapoda-research/llama-30b-hf",
    # "decapoda-research/llama-65b-hf",
]

# Benchmark each selected checkpoint and write its metrics under "results512".
for model_name in model_names:
    # The shared config dict is updated in place, so it always carries the
    # name of the model currently being benchmarked.
    config.update(model_name=model_name)
    metrics = generate_samples(
        model_name,
        config,
        custom_token_counts=[512],
        llama=True,
    )
    log_metrics_to_csv(model_name, config, metrics, "results512")

## Plot

In [None]:
from llm_benchmarks.plotting import plot_model_inference_speed

# Keyword arguments for the plotting helper, collected in one place so the
# chart settings can be read (and tweaked) at a glance.
plot_kwargs = {
    "model_name": "llama",
    # Empty filters/colors are passed through unchanged.
    # NOTE(review): presumably this means "no filtering" and "default colors" - confirm.
    "filters": {},
    "grouping_columns": ["model_name", "quantization_bits"],
    "colors": {},
    "title": "LLaMA: Tokens per Second vs. Output Tokens",
    # Same directory name the benchmark cell passes to log_metrics_to_csv.
    "results_dir": "./results512",
    "save_path": "./llama_compare_size_and_quant_inference.png",
    "width": 800,
    "height": 400,
}

plot_model_inference_speed(**plot_kwargs)