In [1]:
# !pip install ipywidgets
# !pip install xformers

In [2]:
from time import time
import gc
import torch
from tqdm import tqdm
from transformers import pipeline

models = ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]

for model in models:
    print(f"===== Model: {model} =====")
    pipe = pipeline('text-generation', model=model, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto")
    # pipe.enable_xformers_memory_efficient_attention()

    time0 = time()
    text = "Question: Tell me a history of WW2 in 3 or 4 paragraphs.\nAnswer: "
    res = pipe(
        text,
        num_beams=5, 
        min_length=512, 
        max_length=1024, 
        pad_token_id=50256,
        # truncation=True
    )
    time1 = time()
    output_tokens = pipe.tokenizer.encode(res[0]["generated_text"])
    gpu_mem_usage = torch.cuda.memory_allocated() / 1024**3

    del pipe
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Output tokens: {len(output_tokens)}")
    print(f"GPU memory usage: {gpu_mem_usage:.2f} GB")
    print(f"Time: {time() - time0:.2f} s")
    print(f"Tokens per second: {len(output_tokens) / (time1 - time0):.2f}")

===== Model: gpt2 =====
Output tokens: 1024
GPU memory usage: 0.26 GB
Time: 5.30 s
Tokens per second: 195.99
===== Model: gpt2-medium =====
Output tokens: 1024
GPU memory usage: 0.69 GB
Time: 9.60 s
Tokens per second: 107.89
===== Model: gpt2-large =====
Output tokens: 1024
GPU memory usage: 1.48 GB
Time: 15.01 s
Tokens per second: 69.23
===== Model: gpt2-xl =====
Output tokens: 1024
GPU memory usage: 3.02 GB
Time: 19.77 s
Tokens per second: 52.72
