In [14]:
import torch
import math
import time
from LLTMpython.LLTM import LLTM as LLTMpython
from LLTMcpp.LLTM import LLTM as LLTMcpp
from LLTMcuda.LLTM import LLTM as LLTMcuda
from LLTMfast.LLTM import LLTM as LLTMfast
from LLTMfastseq.LLTM import LLTM as LLTMfastseq

# Every GPU benchmark below needs a CUDA device, so stop right away if none is present.
assert torch.cuda.is_available()
cpu_device = torch.device("cpu")
cuda_device = torch.device("cuda")  # handle for the default GPU

# Problem size shared by all benchmarks.
batch_size, input_features, state_size, input_seq_len = 32, 300, 64, 2000

# Shapes of the input sequence and of the recurrent state tensors.
sequence_shape = (batch_size, input_seq_len, input_features)
state_shape = (batch_size, state_size)

# CPU copies of the benchmark inputs: sequence X, initial hidden state h, initial cell state C.
X = torch.randn(*sequence_shape, device=cpu_device)
h = torch.randn(*state_shape, device=cpu_device)
C = torch.randn(*state_shape, device=cpu_device)

# Independently sampled inputs created directly on the GPU (note device=cuda_device).
X_gpu = torch.randn(*sequence_shape, device=cuda_device)
h_gpu = torch.randn(*state_shape, device=cuda_device)
C_gpu = torch.randn(*state_shape, device=cuda_device)


In [16]:
# ======================================
# test basic python lltm on cpu
#
# Steps the LLTMpython module over every time step of X on the CPU, then
# backpropagates from the final (h, C) pair. Forward and backward wall time are
# reported as the average number of microseconds per time step.

lltm_python = LLTMpython(input_features, state_size).to(cpu_device)

python_cpu_forward = 0
python_cpu_backward = 0

xx = X
hh = h
cc = C

# forward
# perf_counter() is monotonic and high resolution; time.time() follows the
# system clock and may jump if that clock is adjusted mid-measurement.
start = time.perf_counter()
for t in range(input_seq_len):
    # Indexing with an integer t already yields a (batch, features) tensor.
    # A bare .squeeze() here would also drop the batch axis when batch_size == 1.
    new_hh, new_cc = lltm_python(xx[:, t, :], (hh, cc))
    hh = new_hh
    cc = new_cc
python_cpu_forward += time.perf_counter() - start

# backprop
start = time.perf_counter()
(hh.sum() + cc.sum()).backward()
python_cpu_backward += time.perf_counter() - start

print('lltm python on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_cpu_forward * 1e6/input_seq_len, python_cpu_backward * 1e6/input_seq_len))

lltm python on cpu: Forward: 138.787 us | Backward 144.174 us


In [18]:
# ======================================
# test basic python lltm on gpu
#
# Steps the LLTMpython module over every time step of X_gpu on the GPU, then
# backpropagates from the final (h, C) pair. Forward and backward wall time are
# reported as the average number of microseconds per time step.

lltm_python = LLTMpython(input_features, state_size).to(cuda_device)

python_gpu_forward = 0
python_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

# forward
# CUDA work is queued asynchronously: drain anything still pending (e.g. the
# parameter transfer above) so it is not billed to the forward pass.
torch.cuda.synchronize()
# perf_counter() is monotonic and high resolution, unlike time.time().
start = time.perf_counter()
for t in range(input_seq_len):
    # Indexing with an integer t already yields a (batch, features) tensor.
    # A bare .squeeze() here would also drop the batch axis when batch_size == 1.
    new_hh, new_cc = lltm_python(xx[:, t, :], (hh, cc))
    hh = new_hh
    cc = new_cc
# Wait for the queued kernels to finish before reading the clock.
torch.cuda.synchronize()
python_gpu_forward += time.perf_counter() - start

# backprop
start = time.perf_counter()
(hh.sum() + cc.sum()).backward()
torch.cuda.synchronize()
python_gpu_backward += time.perf_counter() - start

print('lltm python on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(python_gpu_forward * 1e6/input_seq_len, python_gpu_backward * 1e6/input_seq_len))

lltm python on gpu: Forward: 145.556 us | Backward 149.473 us


In [21]:
# ======================================
# test cpp extended lltm on cpu
#
# Steps the LLTMcpp module over every time step of X on the CPU, then
# backpropagates from the final (h, C) pair. Forward and backward wall time are
# reported as the average number of microseconds per time step.

lltm_cpp = LLTMcpp(input_features, state_size).to(cpu_device)

cpp_cpu_forward = 0
cpp_cpu_backward = 0

xx = X
hh = h
cc = C

# forward
# perf_counter() is monotonic and high resolution; time.time() follows the
# system clock and may jump if that clock is adjusted mid-measurement.
start = time.perf_counter()
for t in range(input_seq_len):
    # Indexing with an integer t already yields a (batch, features) tensor.
    # A bare .squeeze() here would also drop the batch axis when batch_size == 1.
    new_hh, new_cc = lltm_cpp(xx[:, t, :], (hh, cc))
    hh = new_hh
    cc = new_cc
cpp_cpu_forward += time.perf_counter() - start

# backprop
start = time.perf_counter()
(hh.sum() + cc.sum()).backward()
cpp_cpu_backward += time.perf_counter() - start

print('lltm cpp on cpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_cpu_forward * 1e6/input_seq_len, cpp_cpu_backward * 1e6/input_seq_len))

lltm cpp on cpu: Forward: 125.665 us | Backward 222.874 us


In [23]:
# ======================================
# test cpp extended lltm on gpu
#
# Steps the LLTMcpp module over every time step of X_gpu on the GPU, then
# backpropagates from the final (h, C) pair. Forward and backward wall time are
# reported as the average number of microseconds per time step.

lltm_cpp = LLTMcpp(input_features, state_size).to(cuda_device)

cpp_gpu_forward = 0
cpp_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

# forward
# CUDA work is queued asynchronously: drain anything still pending (e.g. the
# parameter transfer above) so it is not billed to the forward pass.
torch.cuda.synchronize()
# perf_counter() is monotonic and high resolution, unlike time.time().
start = time.perf_counter()
for t in range(input_seq_len):
    # Indexing with an integer t already yields a (batch, features) tensor.
    # A bare .squeeze() here would also drop the batch axis when batch_size == 1.
    new_hh, new_cc = lltm_cpp(xx[:, t, :], (hh, cc))
    hh = new_hh
    cc = new_cc
# Wait for the queued kernels to finish before reading the clock.
torch.cuda.synchronize()
cpp_gpu_forward += time.perf_counter() - start

# backprop
start = time.perf_counter()
(hh.sum() + cc.sum()).backward()
torch.cuda.synchronize()
cpp_gpu_backward += time.perf_counter() - start

print('lltm cpp on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cpp_gpu_forward * 1e6/input_seq_len, cpp_gpu_backward * 1e6/input_seq_len))

lltm cpp on gpu: Forward: 117.089 us | Backward 316.190 us


In [25]:
# ========================================
# test cuda lltm on gpu
#
# Steps the LLTMcuda module over every time step of X_gpu on the GPU, then
# backpropagates from the final (h, C) pair. Forward and backward wall time are
# reported as the average number of microseconds per time step.

lltm_cuda = LLTMcuda(input_features, state_size).to(cuda_device)

cuda_gpu_forward = 0
cuda_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

# forward
# CUDA work is queued asynchronously: drain anything still pending (e.g. the
# parameter transfer above) so it is not billed to the forward pass.
torch.cuda.synchronize()
# perf_counter() is monotonic and high resolution, unlike time.time().
start = time.perf_counter()
for t in range(input_seq_len):
    # Indexing with an integer t already yields a (batch, features) tensor, so no
    # .squeeze() is needed (it would drop the batch axis when batch_size == 1).
    # The slice is a strided view; .contiguous() passes a densely packed copy
    # (presumably what the custom kernel expects - confirm against LLTMcuda).
    new_hh, new_cc = lltm_cuda(xx[:, t, :].contiguous(), (hh, cc))
    hh = new_hh
    cc = new_cc
# Wait for the queued kernels to finish before reading the clock.
torch.cuda.synchronize()
cuda_gpu_forward += time.perf_counter() - start

# backprop
start = time.perf_counter()
(hh.sum() + cc.sum()).backward()
torch.cuda.synchronize()
cuda_gpu_backward += time.perf_counter() - start

print('lltm cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(cuda_gpu_forward * 1e6/input_seq_len, cuda_gpu_backward * 1e6/input_seq_len))

lltm cuda on gpu: Forward: 100.728 us | Backward 98.394 us


In [27]:
# ========================================
# test fast cuda lltm on gpu
#
# Steps the LLTMfast module over every time step of X_gpu on the GPU, then
# backpropagates from the final (h, C) pair. Forward and backward wall time are
# reported as the average number of microseconds per time step.

lltm_fast = LLTMfast(input_features, state_size).to(cuda_device)

fast_gpu_forward = 0
fast_gpu_backward = 0

xx = X_gpu
hh = h_gpu
cc = C_gpu

# forward
# CUDA work is queued asynchronously: drain anything still pending (e.g. the
# parameter transfer above) so it is not billed to the forward pass.
torch.cuda.synchronize()
# perf_counter() is monotonic and high resolution, unlike time.time().
start = time.perf_counter()
for t in range(input_seq_len):
    # Indexing with an integer t already yields a (batch, features) tensor, so no
    # .squeeze() is needed (it would drop the batch axis when batch_size == 1).
    # The slice is a strided view; .contiguous() passes a densely packed copy
    # (presumably what the custom kernel expects - confirm against LLTMfast).
    new_hh, new_cc = lltm_fast(xx[:, t, :].contiguous(), (hh, cc))
    hh = new_hh
    cc = new_cc
# Wait for the queued kernels to finish before reading the clock.
torch.cuda.synchronize()
fast_gpu_forward += time.perf_counter() - start

# backprop
start = time.perf_counter()
(hh.sum() + cc.sum()).backward()
torch.cuda.synchronize()
fast_gpu_backward += time.perf_counter() - start

print('lltm fast cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us'.format(fast_gpu_forward * 1e6/input_seq_len, fast_gpu_backward * 1e6/input_seq_len))

lltm fast cuda on gpu: Forward: 136.870 us | Backward 95.622 us


In [30]:
# ========================================
# test fastseq cuda lltm on gpu
#
# Hands the whole X_gpu sequence to the LLTMfastseq module in a single call and
# times that call. Only the forward pass is measured here; the total is divided
# by input_seq_len so the figure is comparable with the per-step loops above.

lltm_fastseq = LLTMfastseq(input_features, state_size).to(cuda_device)

fastseq_gpu_forward = 0
fastseq_gpu_backward = 0  # never updated: no backward pass is timed in this cell

xx = X_gpu
hh = h_gpu
cc = C_gpu

# forward
# CUDA work is queued asynchronously: drain anything still pending (e.g. the
# parameter transfer above) so it is not billed to the forward pass.
torch.cuda.synchronize()
# perf_counter() is monotonic and high resolution, unlike time.time().
start = time.perf_counter()
new_hh, new_cc = lltm_fastseq(xx, (hh, cc))
# Wait for the queued kernels to finish before reading the clock.
torch.cuda.synchronize()
fastseq_gpu_forward += time.perf_counter() - start



print('lltm fastseq cuda on gpu: Forward: {:.3f} us '.format(fastseq_gpu_forward * 1e6/input_seq_len))

lltm fastseq cuda on gpu: Forward: 88.870 us 


In [31]:
# Summary: replay every benchmark line in one place.
# Each row pairs a report template with the accumulated totals (in seconds) that
# fill it; totals are converted to microseconds per time step when printed.
summary_rows = [
    ('lltm python on cpu: Forward: {:.3f} us | Backward {:.3f} us', (python_cpu_forward, python_cpu_backward)),
    ('lltm python on gpu: Forward: {:.3f} us | Backward {:.3f} us', (python_gpu_forward, python_gpu_backward)),
    ('lltm cpp on cpu: Forward: {:.3f} us | Backward {:.3f} us', (cpp_cpu_forward, cpp_cpu_backward)),
    ('lltm cpp on gpu: Forward: {:.3f} us | Backward {:.3f} us', (cpp_gpu_forward, cpp_gpu_backward)),
    ('lltm cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us', (cuda_gpu_forward, cuda_gpu_backward)),
    ('lltm fast cuda on gpu: Forward: {:.3f} us | Backward {:.3f} us', (fast_gpu_forward, fast_gpu_backward)),
    ('lltm fastseq cuda on gpu: Forward: {:.3f} us ', (fastseq_gpu_forward,)),
]
for template, totals in summary_rows:
    print(template.format(*[total * 1e6/input_seq_len for total in totals]))


lltm python on cpu: Forward: 138.787 us | Backward 144.174 us
lltm python on gpu: Forward: 145.556 us | Backward 149.473 us
lltm cpp on cpu: Forward: 125.665 us | Backward 222.874 us
lltm cpp on gpu: Forward: 117.089 us | Backward 316.190 us
lltm cuda on gpu: Forward: 100.728 us | Backward 98.394 us
lltm fast cuda on gpu: Forward: 136.870 us | Backward 95.622 us
lltm fastseq cuda on gpu: Forward: 88.870 us 
