# Test & Benchmark GPU

## Query CUDA support

In [11]:
import torch
# Ask PyTorch whether CUDA is available; the boolean result is the cell output.
torch.cuda.is_available()

True

## Benchmark

In [None]:
# Devices to benchmark, in the order they will be timed: GPU first, then CPU.
device1, device2 = torch.device('cuda'), torch.device('cpu')
devs = [device1, device2]

def batched_dot_mul_sum(a, b):
    '''Batched dot product via element-wise multiply followed by a sum.

    The two operands are multiplied element-wise and the products are
    summed over the last dimension, so that dimension is reduced away.
    '''
    products = a.mul(b)
    return products.sum(dim=-1)


def batched_dot_bmm(a, b):
    '''Computes batched dot by reducing to bmm.

    Each vector along the last dimension of ``a`` is treated as a 1 x D
    row matrix and the matching vector of ``b`` as a D x 1 column matrix;
    ``torch.bmm`` then yields one 1 x 1 product per pair.

    Args:
        a: Tensor whose last dimension holds the vectors.
        b: Tensor with the same number of elements and the same last
            dimension size as ``a``.

    Returns:
        Tensor of shape ``a.shape[:-1]`` holding one dot product per
        vector pair, i.e. the same shape ``a.mul(b).sum(-1)`` produces for
        equally shaped inputs.
    '''
    # Remember the leading dimensions so the result can be restored to
    # them; bmm itself only accepts a single flat batch dimension.
    batch_shape = a.shape[:-1]
    rows = a.reshape(-1, 1, a.shape[-1])
    cols = b.reshape(-1, b.shape[-1], 1)
    # bmm returns (N, 1, 1); fold it back into the original batch layout.
    return torch.bmm(rows, cols).reshape(batch_shape)


# Benchmark input: a random 10000 x 64 tensor allocated on device1
x = torch.randn((10000, 64), device=device1)

# Sanity check before timing: both implementations must agree on this input
reference = batched_dot_mul_sum(x, x)
candidate = batched_dot_bmm(x, x)
assert reference.allclose(candidate)

In [None]:
import torch.utils.benchmark as benchmark

# Time both implementations on every device in `devs`.
for device in devs:
    # Move the shared input onto the device under test.
    x = x.to(device)

    t0 = benchmark.Timer(
        globals={'x': x},
        setup='from __main__ import batched_dot_mul_sum',
        stmt='batched_dot_mul_sum(x, x)',
    )
    t1 = benchmark.Timer(
        globals={'x': x},
        setup='from __main__ import batched_dot_bmm',
        stmt='batched_dot_bmm(x, x)',
    )

    # Measure in the same order as the timers were built: mul/sum, then bmm.
    mul_sum_measurement = t0.timeit(100)
    bmm_measurement = t1.timeit(100)
    print('Device: ', device, mul_sum_measurement, bmm_measurement)