In [65]:
# Following this guide to benchmark PyTorch operations: https://pytorch.org/tutorials/recipes/recipes/benchmark.html#benchmarking-with-torch-utils-benchmark-timer

import torch
import torch.utils.benchmark as benchmark


In [66]:
# Number of random row indices drawn when building the benchmark inputs.
num_selections = 10

# define the functions to compare/benchmark/time
def index_using_gather(tensor, indices):
    """Selects elements from a tensor using gather (for N, 1)."""
    return torch.gather(tensor, dim=0, index=indices.unsqueeze(1))  # Since dim is 1

def index_using_integral_indexing(tensor, indices):
    """Pick rows of a tensor by indexing it with an integer index tensor.

    Args:
        tensor: tensor to select from; intended for shape (N, 1).
        indices: 1-D integer tensor of row positions.

    Returns:
        The rows of ``tensor`` at ``indices`` (selection along dimension 0).
    """
    selected_rows = tensor[indices]
    return selected_rows

def index_using_index_select(tensor, indices):
    """Pick rows of a tensor with ``torch.index_select`` along dimension 0.

    Args:
        tensor: tensor to select from.
        indices: 1-D integer tensor of row positions.

    Returns:
        The rows of ``tensor`` at ``indices``.
    """
    return torch.index_select(tensor, dim=0, index=indices)

# Sample tensor and indices
# A dedicated, seeded generator keeps the inputs identical across
# "Restart Kernel -> Run All" without touching PyTorch's global RNG state.
sample_rng = torch.Generator().manual_seed(0)
tensor = torch.randn(1000, 1, generator=sample_rng)
# Random row positions into `tensor`, one per requested selection.
indices = torch.randint(0, tensor.shape[0], (num_selections,), generator=sample_rng)


In [67]:
# Run on the GPU when PyTorch reports one; otherwise stay on the CPU so the
# remaining cells still execute.
# NOTE: a "CUDA error: unspecified launch failure" raised here is usually left
# over from an earlier failed kernel in this process -- restart the Jupyter
# kernel and run all cells from the top instead of re-running this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tensor = tensor.to(device)
indices = indices.to(device)

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [68]:
# Benchmarking with torch.utils.benchmark

# Ensure outputs are the same *before* timing, so a disagreement between the
# strategies is reported immediately rather than after the long timing runs.
reference = index_using_gather(tensor, indices)
assert torch.allclose(reference, index_using_integral_indexing(tensor, indices))
assert torch.allclose(reference, index_using_index_select(tensor, indices))

# Number of executions per measurement; run many times for better accuracy.
NUM_RUNS = 400000

# Display label -> function under test (insertion order is the print order).
candidates = {
    "Gather": index_using_gather,
    "Integer Indexing": index_using_integral_indexing,
    "Index Select": index_using_index_select,
}

# Each statement times only the selection call: none of the three functions
# modifies its inputs, so cloning them on every run would only add copy cost
# to every measurement. Names are handed over through `globals`, so the timers
# do not depend on the notebook namespace being importable as `__main__`.
timers = {
    label: benchmark.Timer(
        stmt=f"{fn.__name__}(tensor, indices)",
        globals={fn.__name__: fn, "tensor": tensor, "indices": indices},
    )
    for label, fn in candidates.items()
}

# Keep the individual timer names available for any later cell that uses them.
t_gather = timers["Gather"]
t_indexing = timers["Integer Indexing"]
t_index_select = timers["Index Select"]

# Repeatedly run the timers for more accurate measurements
for label, timer in timers.items():
    print(f"{label}:")
    print(timer.timeit(number=NUM_RUNS))


Gather:


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [62]:
# Show which device currently holds `tensor`.
tensor.device

device(type='cuda', index=0)