# **Profiling**

In [None]:
import torch
# Report the PyTorch build, the CUDA toolkit it was built against, and the
# C++11 ABI flag (recorded run printed: 2.6.0+cu124, 12.4, False).
for build_detail in (
    torch.__version__,
    torch.version.cuda,
    torch._C._GLIBCXX_USE_CXX11_ABI,
):
    print(build_detail)
# flash_attn version check, left disabled:
#import flash_attn
#flash_attn.__version__


2.6.0+cu124
12.4
False


In [1]:
# Mount Google Drive into the Colab VM, then make the Drive root the working directory.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive


Mounted at /content/drive
/content/drive/MyDrive


In [None]:
# Work from the colab_notebooks folder on the mounted Drive.
%cd /content/drive/MyDrive/colab_notebooks

/content/drive/MyDrive/colab_notebooks


In [None]:
# this doesnt work for nsys but works for pytorch profiler
import torch
import torch.profiler
import time

# Create a large tensor on GPU
N, M = 8192, 8192  # rows, columns
x = torch.randn(N, M, device="cuda")

NUM_WARMUP_ITERS = 10      # unprofiled calls so CUDA init / kernel loading is not measured
NUM_PROFILED_ITERS = 1000  # profiled calls per function

# Function: row-wise sum (reduces over dim=1)
def coalesced():
    """Return the sum of every row of ``x`` (reduction over dim=1)."""
    return x.sum(dim=1)

# Function: column-wise sum (reduces over dim=0)
# NOTE(review): both functions read the same row-major buffer and only the
# reduction axis differs; whether this really produces non-coalesced loads
# depends on the kernel PyTorch picks - TODO confirm with a kernel-level profiler.
def non_coalesced():
    """Return the sum of every column of ``x`` (reduction over dim=0)."""
    return x.sum(dim=0)

# Warm up both paths before profiling so one-time start-up cost does not
# skew the averages reported below.
for _ in range(NUM_WARMUP_ITERS):
    coalesced()
    non_coalesced()
torch.cuda.synchronize()

# Profile both using torch.profiler
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    with_stack=False,
    with_flops=True,
    profile_memory=True,
) as prof:
    for _ in range(NUM_PROFILED_ITERS):
        coalesced()
        torch.cuda.synchronize()
    for _ in range(NUM_PROFILED_ITERS):
        non_coalesced()
        torch.cuda.synchronize()

# Print the most expensive events first, with a bounded number of rows
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))

In [None]:
# the nsys command options dont work with this version
import torch
from torch.cuda import nvtx

torch.manual_seed(0)
device = "cuda"

# Large matrix (row-major / C-contiguous)
N, M = 8192, 8192
x = torch.randn(N, M, device=device)

def contiguous_row_sum():
    """Row-sum the contiguous matrix ``x`` inside an NVTX range.

    Intended as the fast case. The device is synchronized before the range
    closes so the range spans the kernel's execution, not only its launch.
    """
    # The context manager pops the range even if the reduction raises.
    with nvtx.range("contiguous_row_sum"):
        y = x.sum(dim=1)
        torch.cuda.synchronize()
    return y

def noncontiguous_row_sum():
    """Row-sum the transposed view ``x.t()`` inside an NVTX range.

    Intended as the slow case: the reduction runs over a strided view.
    """
    with nvtx.range("noncontig_row_sum"):
        y = x.t().sum(dim=1)           # x.t() is (8192,8192) with stride (1,8192)
        torch.cuda.synchronize()
    return y

# Warm-up kernels & GPU
contiguous_row_sum(); noncontiguous_row_sum()

# Run a handful of iterations so Nsight Systems has data
for _ in range(10):
    contiguous_row_sum()
for _ in range(10):
    noncontiguous_row_sum()


In [None]:
#demo.py
import torch
from torch.cuda import nvtx

torch.manual_seed(0)
device = "cuda"

# Large matrix (row-major / C-contiguous)
N, M = 8192, 8192
x = torch.randn(N, M, device=device)

def contiguous_row_sum():
    """Row-sum the contiguous matrix ``x`` inside an NVTX range.

    Intended as the fast case. The device is synchronized before the range
    closes so the range spans the kernel's execution, not only its launch.
    """
    # The context manager pops the range even if the reduction raises.
    with nvtx.range("contiguous_row_sum"):
        y = x.sum(dim=1)
        torch.cuda.synchronize()
    return y

def noncontiguous_row_sum():
    """Row-sum the transposed view ``x.t()`` inside an NVTX range.

    Intended as the slow case: the reduction runs over a strided view.
    """
    with nvtx.range("noncontig_row_sum"):
        y = x.t().sum(dim=1)           # x.t() is (8192,8192) with stride (1,8192)
        torch.cuda.synchronize()
    return y

# Warm-up kernels & GPU
contiguous_row_sum(); noncontiguous_row_sum()

# Run a handful of iterations so Nsight Systems has data
for _ in range(10):
    contiguous_row_sum()
for _ in range(10):
    noncontiguous_row_sum()

In [None]:
#demo.py
import torch
from torch.cuda import nvtx

torch.manual_seed(0)
device = "cuda"

# Large matrix (row-major / C-contiguous)
N, M = 8192, 8192
x = torch.randn(N, M, device=device)

def contiguous_row_sum():
    """Row-sum the contiguous matrix ``x`` inside an NVTX range.

    Intended as the fast case. The device is synchronized before the range
    closes so the range spans the kernel's execution, not only its launch.
    """
    # The context manager pops the range even if the reduction raises.
    with nvtx.range("contiguous_row_sum"):
        y = x.sum(dim=1)
        torch.cuda.synchronize()
    return y

def noncontiguous_row_sum():
    """Row-sum the transposed view ``x.t()`` inside an NVTX range.

    Intended as the slow case: the reduction runs over a strided view.
    """
    with nvtx.range("noncontig_row_sum"):
        y = x.t().sum(dim=1)           # x.t() is (8192,8192) with stride (1,8192)
        torch.cuda.synchronize()
    return y

# Warm-up kernels & GPU
contiguous_row_sum(); noncontiguous_row_sum()

# Run a handful of iterations so Nsight Systems has data
for _ in range(10):
    contiguous_row_sum()
for _ in range(10):
    noncontiguous_row_sum()

Make sure the Nsight Systems (nsys) version of the macOS viewer matches the version that collects the data on Colab/GPU.

In [None]:
# Download the Nsight Systems 2025.2.1 Linux installer, mark it executable, and launch it.
!wget https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_2/NsightSystems-linux-public-2025.2.1.130-3569061.run
!chmod +x NsightSystems-linux-public-2025.2.1.130-3569061.run
!./NsightSystems-linux-public-2025.2.1.130-3569061.run

--2025-07-10 15:45:09--  https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_2/NsightSystems-linux-public-2025.2.1.130-3569061.run
Resolving developer.nvidia.com (developer.nvidia.com)... 23.195.37.69, 23.195.37.70
Connecting to developer.nvidia.com (developer.nvidia.com)|23.195.37.69|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://developer.download.nvidia.com/assets/tools/secure/nsight-systems/2025_2/NsightSystems-linux-public-2025.2.1.130-3569061.run?__token__=exp=1752162910~hmac=71f53bfe8cd53396fe6df0a4fcd72ccddce4403b7eeebe34575f90ae44b36f49 [following]
--2025-07-10 15:45:10--  https://developer.download.nvidia.com/assets/tools/secure/nsight-systems/2025_2/NsightSystems-linux-public-2025.2.1.130-3569061.run?__token__=exp=1752162910~hmac=71f53bfe8cd53396fe6df0a4fcd72ccddce4403b7eeebe34575f90ae44b36f49
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.212.249.133, 23.212.249.151

In [None]:
#demo.py
import torch
from torch.cuda import nvtx

torch.manual_seed(0)
device = "cuda"

# Large matrix (row-major / C-contiguous)
N, M = 8192, 8192
x = torch.randn(N, M, device=device)

def contiguous_row_sum():
    """Row-sum the contiguous matrix ``x`` inside an NVTX range.

    Intended as the fast case. The device is synchronized before the range
    closes so the range spans the kernel's execution, not only its launch.
    """
    # The context manager pops the range even if the reduction raises.
    with nvtx.range("contiguous_row_sum"):
        y = x.sum(dim=1)
        torch.cuda.synchronize()
    return y

def noncontiguous_row_sum():
    """Row-sum the transposed view ``x.t()`` inside an NVTX range.

    Intended as the slow case: the reduction runs over a strided view.
    """
    with nvtx.range("noncontig_row_sum"):
        y = x.t().sum(dim=1)           # x.t() is (8192,8192) with stride (1,8192)
        torch.cuda.synchronize()
    return y

# Warm-up kernels & GPU
contiguous_row_sum(); noncontiguous_row_sum()

# Run a handful of iterations so Nsight Systems has data
for _ in range(10):
    contiguous_row_sum()
for _ in range(10):
    noncontiguous_row_sum()

In [None]:
RUN FROM TERMINAL!!!!
# run from command line in terminal
#license agreement wont show up in jupyter cell
!./NsightSystems-linux-public-2025.3.1.90-3582212.run

Verifying archive integrity...     0%     1%     2%     3%     4%     5%     6%     7%     8%     9%    10%    11%    12%    13%    14%    15%    16%    17%    18%    19%    20%    21%    22%    23%    24%    25%    26%    27%    28%    29%    30%    31%    32%    33%    34%    35%    36%    37%    38%    39%    40%    41%    42%    43%    44%    45%    46%    47%    48%    49%    50%    51%    52%    53%    54%    55%    56%    57%    58%    59%    60%    61%    62%    63%    64%    65%    66%    67%    68%    

In [None]:
# List the binaries of the Nsight Systems 2025.2.1 install (recorded run shows nsys and nsys-ui).
! ls /opt/nvidia/nsight-systems/2025.2.1/bin/

nsys  nsys-ui


In [None]:
!apt-get install emacs

shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
emacs is already the newest version (1:27.1+1-3ubuntu5.2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


**NSYS**


*   Download nsys deb into directory, install and run with full path
*   Pin cuda-toolkit to 12.4 on Google Colab; the version reported by `nvcc --version` must match the CUDA version shown by `nvidia-smi`.




In [None]:
#faster
!apt-get -y install cuda-toolkit-12-4
!rm /etc/alternatives/cuda
!ln -s  /usr/local/cuda-12.4 /etc/alternatives/cuda


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  cuda-cccl-12-4 cuda-command-line-tools-12-4 cuda-compiler-12-4 cuda-crt-12-4
  cuda-cudart-12-4 cuda-cudart-dev-12-4 cuda-cuobjdump-12-4 cuda-cupti-12-4
  cuda-cupti-dev-12-4 cuda-cuxxfilt-12-4 cuda-documentation-12-4
  cuda-driver-dev-12-4 cuda-gdb-12-4 cuda-libraries-12-4
  cuda-libraries-dev-12-4 cuda-nsight-12-4 cuda-nsight-compute-12-4
  cuda-nsight-systems-12-4 cuda-nvcc-12-4 cuda-nvdisasm-12-4
  cuda-nvml-dev-12-4 cuda-nvprof-12-4 cuda-nvprune-12-4 cuda-nvrtc-12-4
  cuda-nvrtc-dev-12-4 cuda-nvtx-12-4 cuda-nvvm-12-4 cuda-nvvp-12-4
  cuda-opencl-12-4 cuda-opencl-dev-12-4 cuda-profiler-api-12-4
  cuda-sanitizer-12-4 cuda-toolkit-12-4-config-common cuda-tools-12-4
  cuda-visual-tools-12-4 default-jre default-jre-headless fonts-dejavu-core
  fonts-dejavu-extra gds-tools-12-4 libatk-wrapper-java
  libatk-wrapper-java-jni libcublas-12-4

In [None]:
# Mount Drive (the recorded run reported it was already mounted) and switch to
# the colab_notebooks folder, where the installer and scripts are kept.
from google.colab import drive
drive.mount('/content/drive')

# Optional editor install, left disabled:
#!apt-get install emacs
%cd /content/drive/MyDrive/colab_notebooks

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/colab_notebooks


In [None]:
# Make the downloaded Nsight Systems installer executable.
!chmod +x NsightSystems-linux-public-2025.2.1.130-3569061.run

In [None]:
# Attempt a non-interactive install. In the recorded run the installer still
# asked for <Enter> to show the license agreement and then exited with
# "Signal caught, cleaning up", i.e. these flags did not skip the prompt here.
!./NsightSystems-linux-public-2025.2.1.130-3569061.run --quiet --accept

Press <Enter> or <Return> to read end user license agreement.
Signal caught, cleaning up


In [None]:
# Profile mlp.py with the nsys binary under the 2023.4.4 install path.
# NOTE(review): other cells use the 2025.2.1 / 2025.3.1 paths - confirm which version is installed.
!/opt/nvidia/nsight-systems/2023.4.4/bin/nsys profile python mlp.py

In [None]:
# Export the report file report2.nsys-rep as JSON, using mlp.json as the output name.
!/opt/nvidia/nsight-systems/2023.4.4/bin/nsys export --type json --output mlp.json report2.nsys-rep



In [None]:
#demo.py
import torch
from torch.cuda import nvtx

torch.manual_seed(0)
device = "cuda"

# Large matrix (row-major / C-contiguous)
N, M = 8192, 8192
x = torch.randn(N, M, device=device)

def contiguous_row_sum():
    """Row-sum the contiguous matrix ``x`` inside an NVTX range.

    Intended as the fast case. The device is synchronized before the range
    closes so the range spans the kernel's execution, not only its launch.
    """
    # The context manager pops the range even if the reduction raises.
    with nvtx.range("contiguous_row_sum"):
        y = x.sum(dim=1)
        torch.cuda.synchronize()
    return y

def noncontiguous_row_sum():
    """Row-sum the transposed view ``x.t()`` inside an NVTX range.

    Intended as the slow case: the reduction runs over a strided view.
    """
    with nvtx.range("noncontig_row_sum"):
        y = x.t().sum(dim=1)           # x.t() is (8192,8192) with stride (1,8192)
        torch.cuda.synchronize()
    return y

# Warm-up kernels & GPU
contiguous_row_sum(); noncontiguous_row_sum()

# Run a handful of iterations so Nsight Systems has data
for _ in range(10):
    contiguous_row_sum()
for _ in range(10):
    noncontiguous_row_sum()

In [None]:
# Profile demo.py with the nsys binary from the 2025.2.1 install.
!/opt/nvidia/nsight-systems/2025.2.1/bin/nsys profile  python demo.py


In [None]:
# Author's note: none of the nsys flags worked here (e.g. --output),
# so nsys is invoked with no options and picks the report name itself.
# In the recorded run the report was written to /content/report1.nsys-rep.
#

!/opt/nvidia/nsight-systems/2025.3.1/bin/nsys profile    python demo.py


Collecting data...
Generating '/tmp/nsys-report-a8c2.qdstrm'
Generated:
	/content/report1.nsys-rep


In [None]:
# Installer download URL (kept as a comment so this code cell stays valid Python):
# https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_2/NsightSystems-linux-public-2025.2.1.130-3569061.run

Can't use the file: the version doesn't match between the macOS install and this install.


"Multiple subscribers" means the PyTorch profiler and nsys are colliding: both try to subscribe to CUPTI in the same process.

WARNING:2025-07-05 22:25:03 62230:62230 init.cpp:180] function cbapi->getCuptiStatus() failed with error CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED (39)
WARNING:2025-07-05 22:25:03 62230:62230 init.cpp:181] CUPTI initialization failed - CUDA profiler activities will be missing
INFO:2025-07-05 22:25:03 62230:62230 init.cpp:183] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  
           aten::sum        99.68%      17.323ms       100.00%      17.379ms     868.965us            20  
    aten::as_strided         0.32%      56.365us         0.32%      56.365us       2.818us            20  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 17.379ms

Processing events...
Generated:
        No reports were generated
/content#

# pass 1 – timeline
nsys profile -o timeline --capture-range=cudaProfilerApi \
    python train.py

Pass 1 still fails with the same error:
/content# /opt/nvidia/nsight-systems/2025.3.1/bin/nsys profile -o timeline
--capture-range=cudaProfilerApi python demo.py
WARNING:2025-07-05 22:29:09 63369:63369 init.cpp:180] function cbapi->getCuptiStatus() failed with error CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED (39)
WARNING:2025-07-05 22:29:09 63369:63369 init.cpp:181] CUPTI initialization failed - CUDA profiler activities will be missing
INFO:2025-07-05 22:29:09 63369:63369 init.cpp:183] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  
           aten::sum        99.63%      16.887ms       100.00%      16.950ms     847.488us            20  
    aten::as_strided         0.37%      62.887us         0.37%      62.887us       3.144us            20  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 16.950ms

Processing events...
Generated:
        No reports were generated




# pass 2 – kernel metrics
ncu --set full --nvtx -o kernels python train.py

In [None]:
#https://github.com/stanford-cs336/spring2025-lectures/blob/main/lecture_06_mlp.py
# makes sure this runs first. if saved as demo.py python demo.py
# should see no error messages
# after running wo errors, run w nsys.

import torch
import torch.nn as nn
from typing import Callable
import torch.cuda.nvtx as nvtx

class MLP(nn.Module):
  """Stack of ``num_layers`` square Linear layers, each followed by GELU."""

  def __init__(self, dim:int, num_layers:int):
    super().__init__()
    linear_layers = [nn.Linear(dim, dim) for _ in range(num_layers)]
    self.layers = nn.ModuleList(linear_layers)

  def forward(self, x:torch.Tensor):
    """Apply every layer in order, wrapping each in an NVTX range named layer_<index>."""
    for index in range(len(self.layers)):
      with nvtx.range(f"layer_{index}"):
        x = torch.nn.functional.gelu(self.layers[index](x))
    return x

def get_device():
  """Return "cuda" when a CUDA device is available, otherwise "cpu"."""
  return "cuda" if torch.cuda.is_available() else "cpu"


def run_mlp(dim:int, num_layers:int, batch_size:int, num_steps:int, use_optimizer: bool = False, num_warmup_steps: int = 10)->None:
  """Run forward and backward passes through an MLP.

    Every step is wrapped in NVTX ranges. On CUDA, the CUDA profiler API is
    started once the warm-up steps are done and stopped after the last step.

    Args:
        dim: Dimension of each layer
        num_layers: Number of linear+GeLU layers
        batch_size: Number of samples to process at once
        num_steps: Number of forward/backward iterations
        use_optimizer: Whether to use Adam optimizer for weight updates
        num_warmup_steps: Number of initial steps that run before the CUDA
            profiler is started

    Returns:
        None.
  """
  device = get_device()
  on_cuda = device == "cuda"

  with nvtx.range("define_model"):
    model = MLP(dim, num_layers).to(device)

  optimizer = torch.optim.Adam(model.parameters()) if use_optimizer else None
  with nvtx.range("define_input"):
    x = torch.randn(batch_size, dim, device=device)

  profiling_started = False
  for step in range(num_steps):
    # Start profiling exactly once, after the warm-up steps. The CUDA runtime
    # is only touched when a GPU is in use, so the CPU path does not fail here.
    if on_cuda and not profiling_started and step >= num_warmup_steps:
      torch.cuda.cudart().cudaProfilerStart()
      profiling_started = True

    nvtx.range_push(f"step_{step}")

    # Zero gradients
    if use_optimizer:
      optimizer.zero_grad()
    else:
      model.zero_grad(set_to_none=True)

    # Forward
    with nvtx.range("forward"):
      y = model(x).mean()

    # Backward
    with nvtx.range("backward"):
      y.backward()

    # Optimizer step if enabled
    if use_optimizer:
      with nvtx.range("optimizer_step"):
        #print(f"Step {step}, loss: {y.item():.6f}")
        optimizer.step()

    nvtx.range_pop()

  if profiling_started:
    # Let queued kernels finish before closing the profiled region.
    torch.cuda.synchronize()
    torch.cuda.cudart().cudaProfilerStop()

def main():
  """Run the MLP workload, sized for whichever device is available."""
  if not torch.cuda.is_available():
    print("Running on CPU")
    run_mlp(dim=128, num_layers=16, batch_size=128, num_steps=15, use_optimizer=True)
    return

  # Run a larger model if GPU is available
  print("Running on GPU")
  #4096 wont run on a100 move to 2048 or 1024
  run_mlp(dim=4096, num_layers=64, batch_size=1024, num_steps=15, use_optimizer=True)

if __name__ == "__main__":
    main()




Running on GPU


Install profiler
cd /content/drive/MyDrive/colab_notebooks
chmod +x  NsightSystems-linux-public-2025.2.1.130-3569061.run

./NsightSystems-linux-public-2025.2.1.130-3569061.run

SCROLL THROUGH response and type ACCEPT

.......

Run profiler
/opt/nvidia/nsight-systems/2025.2.1/bin/nsys profile python demo.py

> Add blockquote




In [None]:
def run_operation1(dim:int, operation: Callable)->Callable:
  """Return a zero-argument callable that applies `operation` to one random dim x dim matrix.

  The matrix is created once, here, so calling the returned function only runs the operation.
  """
  matrix = torch.randn(dim, dim, device=get_device())

  def apply_operation():
    return operation(matrix)

  return apply_operation


def run_operation2(dim:int, operation:Callable)-> Callable:
  """Return a zero-argument callable that applies `operation` to two random dim x dim matrices.

  Both matrices are created once, here, so calling the returned function only runs the operation.
  """
  first = torch.randn(dim, dim, device=get_device())
  second = torch.randn(dim, dim, device=get_device())

  def apply_operation():
    return operation(first, second)

  return apply_operation



In [None]:
from typing import Callable
import profile
import time

def benchmark(description:str, run: Callable, num_warmups: int=1, num_trials: int=3):
  """Time `run` and return its mean wall-clock duration in milliseconds.

  Args:
      description: Label for the benchmark (currently not used in the output).
      run: Zero-argument callable to time.
      num_warmups: Untimed calls made first, so one-time start-up cost is excluded.
      num_trials: Number of timed calls that are averaged.

  Returns:
      Mean time per call in milliseconds. When CUDA is available each timed
      call includes a device synchronize, so queued GPU work is counted.

  Raises:
      ZeroDivisionError: if num_trials is 0.
  """
  # Query once, outside the timed region.
  cuda_available = torch.cuda.is_available()

  # fill pipeline, warmup
  print("benchmark warmup")
  for _ in range(num_warmups):
    run()

  if cuda_available:
    torch.cuda.synchronize()

  #timing cuda
  print("timing cuda")
  if cuda_available:
    # Reported once, before timing starts, so printing never adds to a trial.
    print("cuda available!")

  times:list[float] = []
  for _ in range(num_trials):
    # perf_counter is monotonic and high resolution, unlike time.time().
    start_time = time.perf_counter()

    run()
    if cuda_available:
      torch.cuda.synchronize()
    end_time = time.perf_counter()
    times.append((end_time-start_time)*1000)

  mean_time = sum(times)/len(times)

  return mean_time


def profile(description:str, run: Callable, num_warmups:int=1, with_stack: bool = False):
  """Profile one call of `run` with torch.profiler and return the summary table.

  Args:
      description: Label for the run; used in the stack-trace file name.
      run: Zero-argument callable to profile.
      num_warmups: Unprofiled calls made before profiling starts.
      with_stack: Record stack traces and export them to
          var/stacks_<description>.txt.

  Returns:
      The profiler's key-averages table (top 10 rows sorted by CUDA time) as a
      string. The table is also printed.
  """
  import os  # local import so this cell does not depend on another cell's imports

  # warmup
  for _ in range(num_warmups):
    run()
  if torch.cuda.is_available():
    torch.cuda.synchronize()

  with torch.profiler.profile(
      activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
      #output stack trace for visualization
      with_stack = with_stack,
      #needed to export stack trace for viz
      experimental_config = torch._C._profiler._ExperimentalConfig(verbose=True)) as prof:
    run()
    if torch.cuda.is_available():
      torch.cuda.synchronize()

  table = prof.key_averages().table(sort_by="cuda_time_total", max_name_column_width=80, row_limit=10)

  print(f"table:{table}")

  if with_stack:
    # Interpolate the description so each run gets its own file, and make sure
    # the output directory exists before exporting.
    os.makedirs("var", exist_ok=True)
    text_path = f"var/stacks_{description}.txt"
    prof.export_stacks(text_path, "self_cuda_time_total")
  return table



def profiling():
  """Profile a 50 ms host-side sleep using the `profile` helper."""
  print("running fn profiling...")

  def sleep_function():
    time.sleep(50 / 1000)

  profile("sleep", sleep_function)
profiling()

running fn profiling...
table:-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    cudaDeviceSynchronize       100.00%      21.902us       100.00%      21.902us      10.951us             2  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 21.902us



In [None]:
# Benchmark a 50 ms sleep. time.sleep runs on the host only, so the result is
# CPU wall-clock time (the recorded run returned about 50.3 ms).
def _sleep_50ms():
  time.sleep(50/1000)

benchmark("sleep", _sleep_50ms)

benchmark warmup
timing cuda
cuda available!
cuda available!
cuda available!


50.3235658009847

Add Function

In [None]:
def add_function(a, b):
  """Return the sum of the two operands."""
  return a + b

# Profile the addition on two random 2048 x 2048 matrices.
add_profile = profile("add profile", run_operation2(dim=2048, operation=add_function))



table:--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                       aten::add        92.90%       2.133ms        94.05%       2.159ms       2.159ms     193.567us       100.00%     193.567us     193.567us             1  
void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add...         0.00%       

In [None]:
def matmul_function(a, b):
  """Return the matrix product of the two operands."""
  return a @ b

# Profile the matrix product of two random 2048 x 2048 matrices.
matmul_profile = profile("matmul", run_operation2(dim=2048, operation = matmul_function))

table:-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     aten::matmul         0.15%      13.726us        33.15%       2.949ms       2.949ms       0.000us         0.00%       6.024ms       6.024ms             1  
                                         aten::mm        24.96%       2.220ms        33.00%       2.935ms       2.935ms       6.024ms       100.00%       6.024ms       6.024ms             1  
                            volta

In [None]:
def matmul_function_128(a, b):
  """Return the matrix product of the two operands."""
  return a @ b

# Same product at dim=128; the recorded table shows a different kernel name than at dim=2048.
matmul_profile_128 = profile("matmul_128_different_kernel(dim=128)", run_operation2(dim=128, operation=matmul_function_128))

table:----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::matmul         0.45%      10.230us        99.66%       2.279ms       2.279ms       0.000us         0.00%      14.432us      14.432us             1  
                          aten::mm        71.00%       1.624ms        99.21%       2.269ms       2.269ms      14.432us       100.00%      14.432us      14.432us             1  
    volta_sgemm_32x32_sliced1x4_nn         0.00%       0.000us         0.00%       0.000us       0.000us    

In [None]:
def cdist_function(a, b):
  """Return torch.cdist of the two operands."""
  return torch.cdist(a, b)

# Profile torch.cdist on two random 2048 x 2048 matrices.
cdist_profile = profile("distance", run_operation2(dim=2048, operation=cdist_function))

table:--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                     aten::cdist         1.30%     211.340us        56.56%       9.199ms       9.199ms       0.000us         0.00%       7.640ms       7.640ms             1  
                                                           aten::_euclidean_dist        18.65%       

In [None]:
def gelu_fn(a, b):
  """Return GELU applied to the sum of the two operands."""
  return torch.nn.functional.gelu(a + b)

# Profile add followed by GELU on two random 2048 x 2048 matrices.
gelu_profile = profile("gelu", run_operation2(dim=2048, operation=gelu_fn))

table:--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                       aten::add        71.76%       2.307ms        92.95%       2.988ms       2.988ms     195.477us        58.66%     195.477us     195.477us             1  
void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add...         0.00%       

In [None]:
# An integer list would default to int64 (torch.long); float32 is requested explicitly here.
a = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
print(a.shape)
# Softmax down the columns (dim=0), then along each row (dim=-1).
for softmax_dim in (0, -1):
  print(torch.softmax(a, dim=softmax_dim))


torch.Size([2, 3])
tensor([[0.0474, 0.0474, 0.0474],
        [0.9526, 0.9526, 0.9526]])
tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])


In [None]:
# Values 1..10 as float32.
x = torch.arange(1, 11, dtype=torch.float32)

def foo(x):
  """Return the softmax of `x` over dim 0."""
  return torch.softmax(x, dim=0)

prob = foo(x)

print(f"prob:{prob}")
# The probabilities sum to 1 (shown as the cell output).
prob.sum()

prob:tensor([7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
        3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01])


tensor(1.0000)

In [None]:
# this doesnt work
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


How to measure warp usage

In [None]:
import torch, time
import torch.cuda.nvtx as nvtx

# Simulate large tensor (256MB)
size_mb = 256
num_elements = (size_mb * 1024 * 1024) // 4  # float32 = 4B
# Pin the host buffer: non_blocking=True only gives an asynchronous copy when
# the source tensor is in pinned (page-locked) memory.
x_cpu = torch.randn(num_elements, dtype=torch.float32).pin_memory()

# Warmup (unprofiled)
for _ in range(5):
    _ = x_cpu.to('cuda', non_blocking=True)
torch.cuda.synchronize()

# Profiling with NVTX
repeats = 10
# Context managers close each NVTX range even if a copy raises.
with nvtx.range("DRAM Transfer Benchmark"):
    # perf_counter is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()
    for i in range(repeats):
        with nvtx.range(f"transfer_{i}"):
            _ = x_cpu.to('cuda', non_blocking=True)
    # Wait for every queued copy before stopping the clock.
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

# size_mb counts MiB, so dividing by 1024 gives GiB transferred per second.
bw_gbps = (size_mb * repeats / elapsed) / 1024
print(f"Avg bandwidth: {bw_gbps:.2f} GB/s")