In [1]:
import torch
import itertools
import sys
import time
from pathlib import Path
from typing import Optional, Tuple

import torch
import torch._dynamo.config
import torch._inductor.config

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
torch.profiler._utils._init_for_cuda_graphs()
prof = torch.profiler.profile()

In [5]:
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype="auto")
model.eval()
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

inputs = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)

t0 = time.perf_counter()
with prof:
  outputs = model.generate(**inputs, max_length=200)
  text = tokenizer.batch_decode(outputs)[0]
  print(text)
t = time.perf_counter() - t0
print(f"Time taken {t}")

config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)
   
print_prime(20)
```

Output:
```
[2, 3, 5, 7, 11, 13, 17, 19]
```

Exercise 5:
Write a Python function that takes a list of numbers and returns the sum of all even numbers in the list.

```python
def sum_even(numbers):
   """
   
Time taken 113.24970636796206


In [6]:
prof.export_chrome_trace(f"no_compile_prof.json")

In [9]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import GPT2LMHeadModel, \
                            pipeline, \
                            AutoTokenizer, \
                            AutoModel, \
                            LlamaForCausalLM


  warn(


In [10]:
# Returns the result of running `fn()` and the time it took for `fn()` to run,
# in seconds. We use CUDA events and synchronization for the most accurate
# measurements.
def timed(fn):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    result = fn()
    end.record()
    torch.cuda.synchronize()
    return result, start.elapsed_time(end) / 1000

gen_model = 'meta-llama/Llama-3.2-1B'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=gen_model)
tokenizer.pad_token = tokenizer.eos_token

# Generates random input and targets data for the model, where `b` is
# batch size.
def generate_data(b, max_length=16):
    # Generate b random strings (or you can provide your own text data)
    generator = pipeline('text-generation', model=gen_model, device='cuda')
    sentences_generated = [item['generated_text']
                    for item in generator("Generate something about", max_length=25, num_return_sequences=b)]

    # Tokenize the text
    encodings = tokenizer(sentences_generated, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

    # Move the tokenized input data to GPU
    input_ids = encodings['input_ids'].to(torch.int64).cuda()  # token ids
    attention_mask = encodings['attention_mask'].to(torch.int64).cuda()  # attention mask

    # No labels required in this case, but you could return labels if needed
    return input_ids, attention_mask

# Usage example
batch_size = 4
data = generate_data(batch_size)

N_ITERS = 10

def init_model():
    model_name = "distilgpt2"  # This model is about 300MB
    model = LlamaForCausalLM.from_pretrained(gen_model)
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    return model

# Usage
model = init_model()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [11]:
just_one_sample = generate_data(1)
just_one_output = model.generate(input_ids=just_one_sample[0], attention_mask=just_one_sample[1], max_new_tokens=256)
tokenizer.batch_decode(just_one_output)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Generate something about your life, and see what your friends and family have to say. Get started now.\nWhy would you want to do this?\nDo you have a life story you want to share?\nDo you want to share something with your friends and family?\nDo you want to tell the world about your life?\nWhat would you like to share?\nWhat would you like to share with your friends and family?\nWhat would you like to share with the world?\nWhat would you like to share with your friends and family?\nWhat would you like to share with the world?\nWhat would you like to share with your friends and family?\nWhat would you like to share with the world?\nWhat would you like to share with your friends and family?\nWhat would you like to share with the world?\nWhat would you like to share with your friends and family?\nWhat would you like to share with the world?\nWhat would you like to share with your friends and family?\nWhat would you like to share with the world?\nWhat would you like to 

In [12]:
# Generate a batch of 4 inputs
inp = generate_data(4)

# Run inference on the model
with torch.no_grad():
    output = model(input_ids=inp[0], attention_mask=inp[1])

# Print the shape of the output
print(f"Output shape: {output.logits.shape}")

# Optionally, you can decode the output to get the generated text
generated_sequences = torch.argmax(output.logits, dim=-1)
decoded_output = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)

print("\nGenerated sequences:")
for i, sequence in enumerate(decoded_output):
    print(f"Sequence {i + 1}: {sequence}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Output shape: torch.Size([4, 16, 128256])

Generated sequences:
Sequence 1: Question a new the world  days in a book.
 The the number of pages
Sequence 2: Question a new the world, you. It world is full of things that are
Sequence 3: Question a new the world of were in the office. It can want to use
Sequence 4: Question a new the person3D object
 rotating a object around a Z-axis


In [13]:
# Reset since we are using a different mode.
import torch._dynamo
torch._dynamo.reset()

# Compile the model
model_opt = torch.compile(model, mode="reduce-overhead")

# Generate a batch of 4 inputs
inp = generate_data(4)

# Run inference on the compiled model
with torch.no_grad():
    print("eager:", timed(lambda: model(input_ids=inp[0], attention_mask=inp[1]))[1])
    print("compile:", timed(lambda: model_opt(input_ids=inp[0], attention_mask=inp[1]))[1])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager: 0.023097984313964842




compile: 9.2183408203125


---------------------------------------------------------------------------------------------------------------------

### Now, let's run a few more iterations and see how the compile time is higher in the first one or two iterations and it comes down significiantly.

In [14]:
eager_times = []
batch_size = 16
for i in range(N_ITERS):
    inp = generate_data(batch_size)
    with torch.no_grad():
        _, eager_time = timed(lambda: model(input_ids=inp[0], attention_mask=inp[1]))
    eager_times.append(eager_time)
    print(f"eager eval time {i}: {eager_time}")

print("~" * 10)

compile_times = []
for i in range(N_ITERS):
    inp = generate_data(batch_size)
    with torch.no_grad():
        _, compile_time = timed(lambda: model_opt(input_ids=inp[0], attention_mask=inp[1]))
    compile_times.append(compile_time)
    print(f"compile eval time {i}: {compile_time}")
print("~" * 10)

import numpy as np
eager_med = np.median(eager_times)
compile_med = np.median(compile_times)
speedup = eager_med / compile_med
assert(speedup > 1)
print(f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x")
print("~" * 10)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 0: 0.05784713745117188


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 1: 0.054974433898925784


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 2: 0.054830974578857425


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 3: 0.05603891372680664


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 4: 0.05689836883544922


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 5: 0.055298046112060545


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 6: 0.05606326293945312


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 7: 0.05496435165405274


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 8: 0.05618441772460937


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


eager eval time 9: 0.05660406494140625
~~~~~~~~~~


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 0: 10.87389453125


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 1: 6.8763974609375


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 2: 0.05574220657348633


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 3: 0.05427142333984375


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 4: 0.054163360595703126


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 5: 0.054274337768554684


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 6: 0.056618305206298826


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 7: 0.055883071899414063


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 8: 0.05592272186279297


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


compile eval time 9: 0.055807361602783205
~~~~~~~~~~
(eval) eager median: 0.056051088333129884, compile median: 0.05584521675109863, speedup: 1.003686467597553x
~~~~~~~~~~


-----------------------------------------------------------------------------------------------------------------------

### Using 'max-autotune' compile mode

We now have the GUARDS also enabled

Note: Unfortunately, even my 4060Ti isn't good enough for AutoTune

In [17]:
torch._dynamo.reset()

# Compile the model
torch._logging.set_logs(graph=True, recompiles=True, guards=True)
model_opt = torch.compile(model, mode="max-autotune")

# Generate a batch of 4 inputs
inp = generate_data(4)

# Run inference on the compiled model
with torch.no_grad():
    print("eager:", timed(lambda: model(input_ids=inp[0], attention_mask=inp[1]))[1])
    print("compile:", timed(lambda: model_opt(input_ids=inp[0], attention_mask=inp[1]))[1])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
V1024 00:17:44.312000 125150921582400 torch/_dynamo/guards.py:2169] [0/0_1] [__guards] GUARDS:
V1024 00:17:44.313000 125150921582400 torch/_dynamo/guards.py:2148] [0/0_1] [__guards] 
V1024 00:17:44.313000 125150921582400 torch/_dynamo/guards.py:2148] [0/0_1] [__guards] TREE_GUARD_MANAGER:
V1024 00:17:44.313000 125150921582400 torch/_dynamo/guards.py:2148] [0/0_1] [__guards] +- RootGuardManager
V1024 00:17:44.313000 125150921582400 torch/_dynamo/guards.py:2148] [0/0_1] [__guards] | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None              

eager: 0.023377119064331056


V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2169] [2/0] [__guards] GUARDS:
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] 
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] TREE_GUARD_MANAGER:
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] +- RootGuardManager
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None                           # _dynamo/output_graph.py:460 in init_ambient_guards
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] | +- GLOBAL_STATE: ___check_global_state()
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
V1024 00:17:52.752000 125150921582400 torch/_dynamo/guards.py:2148] [2/0] [__guards] | | +- ID_MA

compile: 8.5381982421875


## What makes torch.compile() work and fast?

#### Graph Capture - TorchDynamo and FX Graphs

TorchDynamo is responsible for the JIT compilation of Python Code into FX Graphs (the optimization doesn't stop there though).
TorchDynamo extracts FX graphs by analyzing Python bytecode during runtime and detecting calls to PyTorch operations.

TorchInductor is another component of torch.compile that ingests the FX graphs and compiles them into optimized kernels. However, TorchDynamo allows for different backends to be used as TorchInductor primarily caters to CUDA GPU kernels. 

In order to inspect the FX graphs that TorchDynamo outputs, let us create a custom backend that outputs the FX graph and simply returns the graph’s unoptimized forward method.

In [16]:
# Let's evaluate the FX graph using a custom backend
from typing import List
def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
    print("custom backend called with FX graph:")
    gm.graph.print_tabular()
    return gm.forward

# Reset since we are using a different backend.
torch._dynamo.reset()

opt_model = torch.compile(init_model(), backend=custom_backend)
inp_data = generate_data(1)
opt_model(input_ids=inp_data[0], attention_mask=inp_data[1])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


custom backend called with FX graph:
opcode       name           target                  args                 kwargs
-----------  -------------  ----------------------  -------------------  --------
placeholder  l_input_ids_   L_input_ids_            ()                   {}
call_module  inputs_embeds  L__self___embed_tokens  (l_input_ids_,)      {}
output       output         output                  ((inputs_embeds,),)  {}
custom backend called with FX graph:
opcode         name                                                 target                                                     args                                                                                                                                                                                                                                                                                                                                                                                                                     

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.0544,  9.0268, 13.3232,  ..., -3.7595, -3.7596, -3.7596],
         [11.0843,  8.6790,  8.2186,  ..., -0.7705, -0.7701, -0.7702],
         [14.8126, 10.1446,  9.3261,  ...,  0.3494,  0.3493,  0.3494],
         ...,
         [14.8741, 10.8963,  8.3688,  ...,  1.5774,  1.5776,  1.5778],
         [ 9.7592,  9.4394,  5.6869,  ...,  1.3620,  1.3622,  1.3621],
         [ 7.6835,  9.0022,  5.7623,  ...,  0.6373,  0.6378,  0.6378]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 9.5610e-02,  1.7827e-01,  3.6755e-02,  ..., -1.7652e+00,
            1.7963e+00,  7.9892e-03],
          [-3.1883e-02,  1.8593e-01, -1.0885e+00,  ...,  1.0211e+00,
           -1.0414e+00, -1.9638e+00],
          [-5.4410e+00, -1.2253e+00, -2.4758e+00,  ...,  2.4060e+00,
           -2.4556e+00, -1.9858e+00],
          ...,
          [ 3.0366e+00, -3.1037e+00,  1.0182e+00,  ...,  1.7399e+00,
           -1.6562e+00, -1.9943e+00],


## The Expensive cost of Compilation

In [20]:
import argparse
from transformers import T5Tokenizer, T5Model
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import torch._inductor.config as config 

config.cpp.weight_prepack=True
config.freezing=True

def test_inference(mode, num_iter):
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5Model.from_pretrained("t5-small")

    input_ids = tokenizer(
        "Studies have been shown that owning a dog is good for you", return_tensors="pt"
    ).input_ids  # Batch size 1
    decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

    if (mode == 'compile'):
        model = torch.compile(model)

    with torch.no_grad():
        for _ in range(50):
            outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

    with profile(activities=[ProfilerActivity.CPU]) as prof:
        with record_function("model_inference"):
            for _ in range(num_iter):
                outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

    print(prof.key_averages().table(sort_by="self_cpu_time_total"))

def main(mode='eager', num_iter=50) -> None:
    test_inference(mode, num_iter)

main()

------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::mm        53.11%     469.455ms        53.19%     470.211ms      97.961us          4800  
         model_inference        20.03%     177.102ms       100.00%     884.002ms     884.002ms             1  
         aten::clamp_min         2.83%      25.031ms         2.83%      25.031ms      41.718us           600  
            aten::matmul         1.75%      15.436ms        60.83%     537.777ms      81.481us          6600  
              aten::view         1.60%      14.177ms         1.60%      14.177ms       1.447us          9800  
               aten::mul         1.41%      12.432ms         1.46%      12.911ms       3.637us          3550  
 

In [22]:
main('compile')

V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2169] [4/0] [__guards] GUARDS:
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] 
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] TREE_GUARD_MANAGER:
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] +- RootGuardManager
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None                           # _dynamo/output_graph.py:460 in init_ambient_guards
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] | +- GLOBAL_STATE: ___check_global_state()
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
V1024 00:57:11.806000 125150921582400 torch/_dynamo/guards.py:2148] [4/0] [__guards] | | +- ID_MA

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         _compile.<locals>.compile_inner (dynamo_timed)        30.90%        4.296s        96.45%       13.410s       13.410s             1  
         GraphLowering.compile_to_module (dynamo_timed)        15.29%        2.126s        30.20%        4.199s        4.199s             1  
                       Scheduler.codegen (dynamo_timed)         9.20%        1.279s         9.21%        1.281s        1.281s             1  
          create_aot_dispatcher_function (dynamo_timed)         9.10%        1.265s        63.90%        8.884s        8.884s             1  
      

## Let's profile this

In [23]:
import torch
from torchvision.models import resnet18

model = resnet18().cuda()
inputs = [torch.randn((5, 3, 224, 224), device='cuda') for _ in range(10)]

model_c = torch.compile(model)

def fwd_bwd(inp):
    out = model_c(inp)
    out.sum().backward()

def warmup_compile():
    def fn(x):
        return x.sin().relu()

    x = torch.rand((2, 2), device='cuda', requires_grad=True)
    fn_c = torch.compile(fn)
    out = fn_c(x)
    out.sum().backward()

with torch.profiler.profile() as prof:
    with torch.profiler.record_function("warmup compile"):
        warmup_compile()

    with torch.profiler.record_function("resnet18 compile"):
        fwd_bwd(inputs[0])

prof.export_chrome_trace("trace_compile.json")

V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2169] [5/0] [__guards] GUARDS:
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] 
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] TREE_GUARD_MANAGER:
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] +- RootGuardManager
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None                           # _dynamo/output_graph.py:460 in init_ambient_guards
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] | +- GLOBAL_STATE: ___check_global_state()
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] | +- GuardManager: source=L['x'], accessed_by=DictGetItemGuardAccessor(x)
V1024 02:34:32.005000 125150921582400 torch/_dynamo/guards.py:2148] [5/0] [__guards] | | +- TENSOR_MATC