In [1]:
# !pip install "accelerate>=0.16.0,<1" "transformers[torch]>=4.28.1,<5" "torch>=1.13.1,<2"
# !pip install --upgrade accelerate bitsandbytes torch
# !conda install cudatoolkit -y

In [3]:
from time import time
import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# autoreload
%load_ext autoreload
%autoreload 2

# from transformers import pipeline

models = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]

for model_name in models:
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        load_in_8bit=True, 
        torch_dtype=torch.float16, 
        device_map="auto"
    )
    text = "Question: Tell me a history of WW2 in 3 or 4 paragraphs.\nAnswer: "
    input_tokens = tokenizer(text, return_tensors="pt").input_ids.to('cuda')
    time0 = time()
    output = model.generate(
      input_tokens,
      do_sample=True,
      temperature=0.9,
      max_length=1024,
      pad_token_id=tokenizer.eos_token_id,
      eos_token_id=tokenizer.eos_token_id,
    )
    time1 = time()

    gpu_mem_usage = torch.cuda.memory_allocated() / 1024**3

    del model
    gc.collect()
    torch.cuda.empty_cache()

    output_tokens = output.cpu().numpy().tolist()[0]
    print(f"===== Model: {model_name} =====")
    print(f"Output tokens: {len(output_tokens)}")
    print(f"GPU memory usage: {gpu_mem_usage:.2f} GB")
    print(f"Time: {time() - time0:.2f} s")
    print(f"Tokens per second: {len(output_tokens) / (time1 - time0):.2f}") 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
===== Model: databricks/dolly-v2-3b =====
Output tokens: 1024
GPU memory usage: 3.03 GB
Time: 48.13 s
Tokens per second: 21.35
===== Model: databricks/dolly-v2-7b =====
Output tokens: 1024
GPU memory usage: 6.92 GB
Time: 56.49 s
Tokens per second: 18.21
===== Model: databricks/dolly-v2-12b =====
Output tokens: 1024
GPU memory usage: 11.70 GB
Time: 61.59 s
Tokens per second: 16.71
