In [1]:
# !pip install SentencePiece

In [2]:
from time import time
import gc
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM


models = ["decapoda-research/llama-7b-hf", "decapoda-research/llama-13b-hf"]

for model_name in models:
    # Load Model
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()
    model = torch.compile(model)

    # Tokenize inputs
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    text = "Question: Tell me a history of WW2 in 3 or 4 paragraphs.\nAnswer: "
    input_tokens = tokenizer(text, return_tensors="pt").input_ids.to('cuda')

    # Generate
    time0 = time()
    with torch.no_grad():
        output = model.generate(
        input_tokens,
        do_sample=True,
        temperature=0.9,
        max_length=1024,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        )
    time1 = time()

    # Collect metrics
    gpu_mem_usage = torch.cuda.memory_allocated() / 1024**3
    output_tokens = output.cpu().numpy().tolist()[0]

    # Clear up memory
    del model
    gc.collect()
    torch.cuda.empty_cache()

    print(f"===== Model: {model_name} =====")
    print(f"Output tokens: {len(output_tokens)}")
    print(f"GPU memory usage: {gpu_mem_usage:.2f} GB")
    print(f"Time: {time() - time0:.2f} s")
    print(f"Tokens per second: {len(output_tokens) / (time1 - time0):.2f}") 


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/drose/miniconda3/envs/dolly2/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/drose/miniconda3/envs/dolly2/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/drose/miniconda3/envs/dolly2/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


===== Model: decapoda-research/llama-7b-hf =====
Output tokens: 1024
GPU memory usage: 6.69 GB
Time: 59.75 s
Tokens per second: 17.21


Loading checkpoint shards:   0%|          | 0/41 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


===== Model: decapoda-research/llama-13b-hf =====
Output tokens: 1024
GPU memory usage: 12.59 GB
Time: 72.93 s
Tokens per second: 14.11
