In [None]:
import gc
from time import time

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

models = ["chainyo/alpaca-lora-7b", "chavinlo/alpaca-13b"]

# The prompt is the same for every model, so it is defined once outside the loop.
text = "Question: Tell me a history of WW2 in 3 or 4 paragraphs.\nAnswer: "

# Benchmark each model in turn: load it in 8-bit, sample one completion for the
# prompt, then report generated-token count, peak GPU memory, generation time
# and throughput.
for model_name in models:
    # Reset the peak-memory counter so the figure reported below covers this
    # model only and not whatever a previous iteration allocated.
    torch.cuda.reset_peak_memory_stats()

    # Load Model
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()
    # NOTE(review): `generate` is looked up on the wrapper returned by
    # torch.compile; it may end up calling the original (uncompiled) forward.
    # Confirm that compilation actually affects the timings below.
    model = torch.compile(model)

    # Tokenize inputs
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    input_tokens = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
    # Remember the prompt length so prompt tokens can be excluded from the
    # generated-token count.
    prompt_length = input_tokens.shape[-1]

    # Generate
    # CUDA work is asynchronous: synchronize on both sides of the timed region
    # so the wall-clock interval covers exactly the generation work.
    torch.cuda.synchronize()
    time0 = time()
    with torch.no_grad():
        output = model.generate(
            input_tokens,
            do_sample=True,
            temperature=0.9,
            max_length=1024,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    torch.cuda.synchronize()
    time1 = time()
    generation_time = time1 - time0

    # Collect metrics
    # Peak allocation (in GiB) since the reset above; sampling the current
    # allocation here would miss memory that was only live during generation.
    gpu_mem_usage = torch.cuda.max_memory_allocated() / 1024**3
    # The returned sequence starts with the prompt; keep only the newly
    # generated tokens so the count and the throughput are not inflated.
    output_tokens = output[0, prompt_length:].cpu().tolist()

    # Clear up memory
    # Drop every reference to CUDA tensors, not just the model, before asking
    # the allocator to release its cached blocks.
    del model, output, input_tokens
    gc.collect()
    torch.cuda.empty_cache()

    print(f"===== Model: {model_name} =====")
    print(f"Output tokens: {len(output_tokens)}")
    print(f"GPU memory usage: {gpu_mem_usage:.2f} GB")
    # Report the generation interval only (previously this also included the
    # cleanup above and disagreed with the throughput line).
    print(f"Time: {generation_time:.2f} s")
    print(f"Tokens per second: {len(output_tokens) / generation_time:.2f}")