In [58]:
import torch
import math
import time
from LLTMpython.LLTM import LLTM as LLTMpython
from LLTMcpp.LLTM import LLTM as LLTMcpp
from LLTMcuda.LLTM import LLTM as LLTMcuda

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU
cpu_device = torch.device("cpu")


batch_size = 32
input_features = 300
state_size = 64
input_seq_len = 2000

X = torch.randn(batch_size, input_features, input_seq_len, device=cpu_device)
h = torch.randn(batch_size, state_size, device=cpu_device)
C = torch.randn(batch_size, state_size, device=cpu_device)

# Note the device=cuda_device arguments here
X_gpu = torch.randn(batch_size, input_features, input_seq_len, device=cuda_device)
h_gpu = torch.randn(batch_size, state_size, device=cuda_device)
C_gpu = torch.randn(batch_size, state_size, device=cuda_device)


In [59]:
# ======================================
# test basic python lltm on cpu

lltm_python = LLTMpython(input_features, state_size).to(cpu_device)

python_cpu_forward = 0
python_cpu_backward = 0

xx = X
hh = h
cc = C

for t in range(input_seq_len):
    start = time.time()
    new_hh, new_cc = lltm_python(xx[:,:,t].squeeze(), (hh, cc))
    python_cpu_forward += time.time() - start
    
    hh = new_hh
    cc = new_cc

start = time.time()
(hh.sum() + cc.sum()).backward()
python_cpu_backward += time.time() - start

print('lltm python on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_cpu_forward * 1e6/test_iters, python_cpu_backward * 1e6/test_iters))

lltm python on cpu: Forward: 387.046 us | Backward 285.247 us


In [60]:
# ======================================
# test basic python lltm on gpu

lltm_python = LLTMpython(input_features, state_size).to(cuda_device)

python_gpu_forward = 0
python_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

for t in range(input_seq_len):
    start = time.time()
    new_hh, new_cc = lltm_python(xx[:,:,t].squeeze(), (hh, cc))
    python_gpu_forward += time.time() - start
    
    hh = new_hh
    cc = new_cc

start = time.time()
(hh.sum() + cc.sum()).backward()
python_gpu_backward += time.time() - start

print('lltm python on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_gpu_forward * 1e6/test_iters, python_gpu_backward * 1e6/test_iters))

lltm python on gpu: Forward: 303.913 us | Backward 308.658 us


In [61]:
# ======================================
# test cpp extended lltm on cpu

lltm_cpp = LLTMcpp(input_features, state_size).to(cpu_device)

cpp_cpu_forward = 0
cpp_cpu_backward = 0

xx = X
hh = h
cc = C

for t in range(input_seq_len):
    start = time.time()
    new_hh, new_cc = lltm_cpp(xx[:,:,t].squeeze(), (hh, cc))
    cpp_cpu_forward += time.time() - start
    
    hh = new_hh
    cc = new_cc

start = time.time()
(hh.sum() + cc.sum()).backward()
cpp_cpu_backward += time.time() - start

print('lltm cpp on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_cpu_forward * 1e6/test_iters, cpp_cpu_backward * 1e6/test_iters))

lltm cpp on cpu: Forward: 333.872 us | Backward 443.482 us


In [62]:
# ======================================
# test cpp extended lltm on gpu

lltm_cpp = LLTMcpp(input_features, state_size).to(cuda_device)

cpp_gpu_forward = 0
cpp_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

for t in range(input_seq_len):
    start = time.time()
    new_hh, new_cc = lltm_cpp(xx[:,:,t].squeeze(), (hh, cc))
    cpp_gpu_forward += time.time() - start
    
    hh = new_hh
    cc = new_cc

start = time.time()
(hh.sum() + cc.sum()).backward()
cpp_gpu_backward += time.time() - start

print('lltm cpp on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_gpu_forward * 1e6/test_iters, cpp_gpu_backward * 1e6/test_iters))

lltm cpp on gpu: Forward: 229.018 us | Backward 688.809 us


In [63]:
# ========================================
# test cuda lltm on gpu

lltm_cuda = LLTMcuda(input_features, state_size).to(cuda_device)

cuda_gpu_forward = 0
cuda_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

for t in range(input_seq_len):
    start = time.time()
    new_hh, new_cc = lltm_cuda(xx[:,:,t].squeeze().contiguous(), (hh, cc))
    cuda_gpu_forward += time.time() - start
    
    hh = new_hh
    cc = new_cc

start = time.time()
(hh.sum() + cc.sum()).backward()
cuda_gpu_backward += time.time() - start

print('lltm cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cuda_gpu_forward * 1e6/test_iters, cuda_gpu_backward * 1e6/test_iters))

lltm cuda on gpu: Forward: 214.389 us | Backward 207.433 us


In [64]:
# print all results
print('lltm python on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_cpu_forward * 1e6/test_iters, python_cpu_backward * 1e6/test_iters))
print('lltm python on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_gpu_forward * 1e6/test_iters, python_gpu_backward * 1e6/test_iters))
print('lltm cpp on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_cpu_forward * 1e6/test_iters, cpp_cpu_backward * 1e6/test_iters))
print('lltm cpp on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_gpu_forward * 1e6/test_iters, cpp_gpu_backward * 1e6/test_iters))
print('lltm cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cuda_gpu_forward * 1e6/test_iters, cuda_gpu_backward * 1e6/test_iters))

lltm python on cpu: Forward: 387.046 us | Backward 285.247 us
lltm python on gpu: Forward: 303.913 us | Backward 308.658 us
lltm cpp on cpu: Forward: 333.872 us | Backward 443.482 us
lltm cpp on gpu: Forward: 229.018 us | Backward 688.809 us
lltm cuda on gpu: Forward: 214.389 us | Backward 207.433 us
