In [1]:
import torch
import math
import time
from LLTMpython.LLTM import LLTM as LLTMpython
from LLTMcpp.LLTM import LLTM as LLTMcpp
from LLTMcuda.LLTM import LLTM as LLTMcuda


# Benchmark configuration read by the timing cells in this notebook.
test_iters = 10000      # number of timed forward/backward rounds per benchmark

# Tensor dimensions: X is (batch_size, input_features); h and C are
# (batch_size, state_size).
batch_size = 16
input_features = 32
state_size = 128

In [3]:
# ======================================
# test basic python lltm
#
# Benchmarks the LLTMpython module: every iteration runs one forward pass and
# one backward pass, and the elapsed time of each phase is accumulated
# separately. Tensors are created without a device argument, i.e. on the
# default device.

X = torch.randn(batch_size, input_features)
h = torch.randn(batch_size, state_size)
C = torch.randn(batch_size, state_size)

lltm_python = LLTMpython(input_features, state_size)

forward = 0.0   # total seconds spent in forward passes
backward = 0.0  # total seconds spent in backward passes

for _ in range(test_iters):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() may have coarse resolution and can jump when the system
    # clock is adjusted, which distorts microsecond-scale measurements.
    start = time.perf_counter()
    new_h, new_C = lltm_python(X, (h, C))
    forward += time.perf_counter() - start

    start = time.perf_counter()
    # Fold both outputs into one scalar so a single backward() covers both.
    (new_h.sum() + new_C.sum()).backward()
    backward += time.perf_counter() - start

# Report the mean time per iteration, converted from seconds to microseconds.
print('lltm python Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/test_iters, backward * 1e6/test_iters))



lltm python Forward: 151.484 us | Backward 187.823 us


In [9]:
# ======================================
# test cpp extended lltm
#
# Benchmarks the LLTMcpp module with the same procedure as the other timing
# cells: one forward and one backward pass per iteration, timed separately.
# Tensors are created without a device argument, i.e. on the default device.

X = torch.randn(batch_size, input_features)
h = torch.randn(batch_size, state_size)
C = torch.randn(batch_size, state_size)

lltm_cpp = LLTMcpp(input_features, state_size)

forward = 0.0   # total seconds spent in forward passes
backward = 0.0  # total seconds spent in backward passes
for _ in range(test_iters):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() may have coarse resolution and can jump when the system
    # clock is adjusted, which distorts microsecond-scale measurements.
    start = time.perf_counter()
    new_h, new_C = lltm_cpp(X, (h, C))
    forward += time.perf_counter() - start

    start = time.perf_counter()
    # Fold both outputs into one scalar so a single backward() covers both.
    (new_h.sum() + new_C.sum()).backward()
    backward += time.perf_counter() - start

# Report the mean time per iteration, converted from seconds to microseconds.
print('lltm cpp Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/test_iters, backward * 1e6/test_iters))

lltm cpp Forward: 130.727 us | Backward 327.048 us


In [12]:
# ========================================
# test cpp extended lltm on gpu
#
# Same benchmark as the CPU cells, but the inputs and the LLTMcpp module are
# placed on the CUDA device, and each timed phase ends with an explicit
# synchronize so the measurement includes the GPU work it launched.

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU

# Note the device=cuda_device arguments here
X = torch.randn(batch_size, input_features, device=cuda_device)
h = torch.randn(batch_size, state_size, device=cuda_device)
C = torch.randn(batch_size, state_size, device=cuda_device)

lltm_cpp = LLTMcpp(input_features, state_size).to(cuda_device)

# CUDA work is queued asynchronously; wait for the setup above to finish so
# it is not charged to the first timed forward pass.
torch.cuda.synchronize()

forward = 0.0   # total seconds spent in forward passes
backward = 0.0  # total seconds spent in backward passes
for _ in range(test_iters):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() may have coarse resolution and can jump when the system
    # clock is adjusted, which distorts microsecond-scale measurements.
    start = time.perf_counter()
    new_h, new_C = lltm_cpp(X, (h, C))
    torch.cuda.synchronize()  # block until the forward kernels have finished
    forward += time.perf_counter() - start

    start = time.perf_counter()
    # Fold both outputs into one scalar so a single backward() covers both.
    (new_h.sum() + new_C.sum()).backward()
    torch.cuda.synchronize()  # block until the backward kernels have finished
    backward += time.perf_counter() - start

# Report the mean time per iteration, converted from seconds to microseconds.
print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/test_iters, backward * 1e6/test_iters))

Forward: 138.314 us | Backward 530.430 us


In [2]:
# ========================================
# test cuda lltm on gpu
#
# Benchmarks the LLTMcuda module on the CUDA device: one forward and one
# backward pass per iteration, each followed by an explicit synchronize so
# the measurement includes the GPU work it launched.

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU

# Note the device=cuda_device arguments here
X = torch.randn(batch_size, input_features, device=cuda_device)
h = torch.randn(batch_size, state_size, device=cuda_device)
C = torch.randn(batch_size, state_size, device=cuda_device)

lltm_cuda = LLTMcuda(input_features, state_size).to(cuda_device)

# CUDA work is queued asynchronously; wait for the setup above to finish so
# it is not charged to the first timed forward pass.
torch.cuda.synchronize()

forward = 0.0   # total seconds spent in forward passes
backward = 0.0  # total seconds spent in backward passes
for _ in range(test_iters):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() may have coarse resolution and can jump when the system
    # clock is adjusted, which distorts microsecond-scale measurements.
    start = time.perf_counter()
    new_h, new_C = lltm_cuda(X, (h, C))
    torch.cuda.synchronize()  # block until the forward kernels have finished
    forward += time.perf_counter() - start

    start = time.perf_counter()
    # Fold both outputs into one scalar so a single backward() covers both.
    (new_h.sum() + new_C.sum()).backward()
    torch.cuda.synchronize()  # block until the backward kernels have finished
    backward += time.perf_counter() - start

# Report the mean time per iteration, converted from seconds to microseconds.
print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/test_iters, backward * 1e6/test_iters))

Forward: 92.028 us | Backward 259.984 us
