In [None]:
import time
import torch
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Checkpoint under test. Other checkpoints that were tried (swap in to use):
#   '11mlabs/indri-0.1-124m-tts'
#   'Qwen/Qwen2.5-Coder-7B-Instruct'
model_id = 'openai-community/gpt2'
device = 'cuda:0'

# Load options: bfloat16 weights and the "sdpa" attention implementation.
load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
)
llm = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
llm = llm.to(device)

# Tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Wrap the model with torch.compile using its default settings.
# Options that were tried and are currently disabled:
#   mode="reduce-overhead", fullgraph=True
llm = torch.compile(llm)

In [None]:
# Candidate prompts for generation.
prompts = [
    'Hello, my name is indri',
    'The president of the United States is donald trump',
    'The capital of France is paris',
    'The future of AI is too much overhyped',
]
# Disabled alternatives:
# prompts = ['write a program to solve fibonacci series']
# prompts = [f'[text]{p}[convert][mimi][spkr_52]' for p in prompts]

In [None]:
# Tokenize only the first prompt into PyTorch tensors, then move the
# encoded inputs to the target device.
tokens = tokenizer(prompts[0], return_tensors='pt')
tokens = tokens.to(device)

In [None]:
# Warm-up: run generation a few times before the timed benchmark.
# (Presumably this lets torch.compile finish compiling so the cost does not
# land in the measurements below -- confirm.)
# Fix: the loop header was prefixed with stray text ("ipykernel_launcher"),
# which made this cell a syntax error.
with torch.no_grad():
    for _ in range(5):
        o = llm.generate(**tokens, max_new_tokens=256)
        print(o.shape)

In [None]:
# Per-run measurements collected over the benchmark:
#   inference_times -- seconds per generate() call
#   out_tokens      -- number of newly generated tokens per call
#   gen_speed       -- generated tokens per second per call
inference_times, out_tokens, gen_speed = [], [], []

with torch.no_grad():
    for _ in tqdm(range(20)):
        # Time the call with CUDA events recorded around generate().
        # (A wall-clock alternative would be time.time() before/after.)
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        out = llm.generate(**tokens, max_new_tokens=256)
        end_event.record()

        # Wait for outstanding GPU work before reading the elapsed time.
        torch.cuda.synchronize()

        # elapsed_time() is in milliseconds; convert to seconds.
        inference_time = start_event.elapsed_time(end_event) / 1000.0

        # Output length minus prompt length = tokens produced by this call.
        new_token_count = out.shape[-1] - tokens['input_ids'].shape[-1]

        out_tokens.append(new_token_count)
        inference_times.append(inference_time)
        gen_speed.append(new_token_count / inference_time)

In [None]:
# Averages over all runs, drawn below as dashed horizontal reference lines.
avg_out_tokens = np.mean(out_tokens)
avg_gen_speed = np.mean(gen_speed)

tokens_color, speed_color = 'tab:blue', 'tab:red'

fig, ax1 = plt.subplots(figsize=(10, 5))

# Left y-axis: number of generated tokens per run.
ax1.plot(out_tokens, color=tokens_color, marker='o', label='Tokens')
ax1.axhline(y=avg_out_tokens, color=tokens_color, linestyle='--', label='Avg Tokens')
ax1.set_xlabel('Run Number')
ax1.set_ylabel('Generated Tokens', color=tokens_color)
ax1.tick_params(axis='y', labelcolor=tokens_color)

# Right y-axis (shares the x-axis): generation speed per run.
ax2 = ax1.twinx()
ax2.plot(gen_speed, color=speed_color, marker='s', label='Speed')
ax2.axhline(y=avg_gen_speed, color=speed_color, linestyle='--', label='Avg Speed')
ax2.set_ylabel('Generation Speed (Tokens/Second)', color=speed_color)
ax2.tick_params(axis='y', labelcolor=speed_color)

plt.title('Generated Tokens and Generation Speed')
# One figure-level legend covering both axes, anchored to ax1's upper-right corner.
fig.legend(loc='upper right', bbox_to_anchor=(1, 1), bbox_transform=ax1.transAxes)
plt.tight_layout()
plt.show()

Profiling

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

In [None]:
%%time
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    # record_shapes=True,
    # profile_memory=True,
    with_stack=True
) as prof:
    with record_function("model_inference"):
        with torch.no_grad():
            outputs = llm.generate(**tokens, max_new_tokens=256)

In [None]:
# Write the collected profiler trace to a Chrome-trace JSON file.
# NOTE(review): the file name mentions "coder7b", while the active model_id
# in this notebook is 'openai-community/gpt2' -- confirm the name is intended.
trace_path = "trace_coder7b_compile.json"
prof.export_chrome_trace(trace_path)

In [None]:
# Print the top-10 profiler rows twice: first sorted by self CPU time,
# then by total CUDA time. Each table is preceded by a separator line.
separator = '------------------------------------------------------------'
averages = prof.key_averages()
for sort_key in ("self_cpu_time_total", "cuda_time_total"):
    print(separator)
    print(averages.table(sort_by=sort_key, row_limit=10))