In [19]:
import torch
import math
import time
from LLTMpython.LLTM import LLTM as LLTMpython
from LLTMcpp.LLTM import LLTM as LLTMcpp
from LLTMcuda.LLTM import LLTM as LLTMcuda

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU
cpu_device = torch.device("cpu")


batch_size = 16
input_features = 32
state_size = 128
test_iters = 10000

X = torch.randn(batch_size, input_features, device=cpu_device)
h = torch.randn(batch_size, state_size, device=cpu_device)
C = torch.randn(batch_size, state_size, device=cpu_device)

# Note the device=cuda_device arguments here
X_gpu = torch.randn(batch_size, input_features, device=cuda_device)
h_gpu = torch.randn(batch_size, state_size, device=cuda_device)
C_gpu = torch.randn(batch_size, state_size, device=cuda_device)


In [22]:
# ======================================
# test basic python lltm on cpu

lltm_python = LLTMpython(input_features, state_size).to(cpu_device)

python_cpu_forward = 0
python_cpu_backward = 0

for _ in range(test_iters):
    start = time.time()
    new_h, new_C = lltm_python(X, (h, C))
    python_cpu_forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    python_cpu_backward += time.time() - start

print('lltm python on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_cpu_forward * 1e6/test_iters, python_cpu_backward * 1e6/test_iters))

lltm python on cpu: Forward: 148.375 us | Backward 185.603 us


In [21]:
# ======================================
# test basic python lltm on gpu

lltm_python = LLTMpython(input_features, state_size).to(cuda_device)

python_gpu_forward = 0
python_gpu_backward = 0

for _ in range(test_iters):
    start = time.time()
    new_h, new_C = lltm_python(X_gpu, (h_gpu, C_gpu))
    python_gpu_forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    python_gpu_backward += time.time() - start

print('lltm python on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_gpu_forward * 1e6/test_iters, python_gpu_backward * 1e6/test_iters))

lltm python on gpu: Forward: 177.758 us | Backward 336.158 us


In [23]:
# ======================================
# test cpp extended lltm on cpu

lltm_cpp = LLTMcpp(input_features, state_size).to(cpu_device)

cpp_cpu_forward = 0
cpp_cpu_backward = 0
for _ in range(test_iters):
    start = time.time()
    new_h, new_C = lltm_cpp(X, (h, C))
    cpp_cpu_forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    cpp_cpu_backward += time.time() - start

print('lltm cpp on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_cpu_forward * 1e6/test_iters, cpp_cpu_backward * 1e6/test_iters))

lltm cpp Forward: 129.469 us | Backward 332.626 us


In [24]:
# ======================================
# test cpp extended lltm on gpu

lltm_cpp = LLTMcpp(input_features, state_size).to(cuda_device)

cpp_gpu_forward = 0
cpp_gpu_backward = 0
for _ in range(test_iters):
    start = time.time()
    new_h, new_C = lltm_cpp(X_gpu, (h_gpu, C_gpu))
    cpp_gpu_forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    cpp_gpu_backward += time.time() - start

print('lltm cpp on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_gpu_forward * 1e6/test_iters, cpp_gpu_backward * 1e6/test_iters))

lltm cpp on gpu Forward: 150.908 us | Backward 575.817 us


In [25]:
# ========================================
# test cuda lltm on gpu

lltm_cuda = LLTMcuda(input_features, state_size).to(cuda_device)

cuda_gpu_forward = 0
cuda_gpu_backward = 0
for _ in range(test_iters):
    start = time.time()
    new_h, new_C = lltm_cuda(X_gpu, (h_gpu, C_gpu))
    torch.cuda.synchronize()
    cuda_gpu_forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    torch.cuda.synchronize()
    cuda_gpu_backward += time.time() - start

print('lltm cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cuda_gpu_forward * 1e6/test_iters, cuda_gpu_backward * 1e6/test_iters))

Forward: 113.235 us | Backward 308.208 us


In [27]:
# print all results
print('lltm python on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_cpu_forward * 1e6/test_iters, python_cpu_backward * 1e6/test_iters))
print('lltm python on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_gpu_forward * 1e6/test_iters, python_gpu_backward * 1e6/test_iters))
print('lltm cpp on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_cpu_forward * 1e6/test_iters, cpp_cpu_backward * 1e6/test_iters))
print('lltm cpp on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_gpu_forward * 1e6/test_iters, cpp_gpu_backward * 1e6/test_iters))
print('lltm cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cuda_gpu_forward * 1e6/test_iters, cuda_gpu_backward * 1e6/test_iters))

lltm python on cpu: Forward: 148.375 us | Backward 185.603 us
lltm python on gpu: Forward: 177.758 us | Backward 336.158 us
lltm cpp on cpu: Forward: 129.469 us | Backward 332.626 us
lltm cpp on gpu: Forward: 150.908 us | Backward 575.817 us
lltm cuda on gpu: Forward: 113.235 us | Backward 308.208 us
