# Peephole LSTM Test & Performance Comparison (Speed & Memory)

* [Imports](#Importing-necessary-modules)
* [Load & Definition](#Loading-and-defining-modules)
    * [Autograd Functions](#Autograd-Functions)
    * [Module Classes](#Module-classes-(C++,-CUDA,-PyTorch))
* [Models](#Defining-models)
    * [Definition](#Definition)
    * [Instantiation](#Instantiation)
    * [Parameter Synchronization](#Parameter-Synchronization)
* [Fake Dataset](#Creating-a-fake-dataset)
* [Sanity Check](#Sanity-check:-output-comparison)
    * [Forward Outputs](#Forward-Outputs)
    * [Backward Gradients](#Backward-Gradients)
* [Forward Performance](#Forward-time-comparison)
* [+Backward Performance](#+Backward-time-comparison)

---

## Importing necessary modules
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [1]:
# Import everything and create the `initialized` flag list only once per kernel
# session: once `initialized` exists as a global, re-running this cell does nothing.
if 'initialized' not in globals():
    import torch
    from torch import nn
    from torch.utils.cpp_extension import load
    from torch.nn import functional as F
    from torch.utils.data import TensorDataset, DataLoader

    import math
    from collections import OrderedDict
    from time import sleep

    # Guard flags checked by later cells (`if not initialized[k]:`).
    # NOTE(review): presumably one flag per guarded cell; only index 0 is used in the
    # part of the notebook visible here -- confirm that 7 matches the number of guarded cells.
    initialized = [False] * 7
    print(torch.__version__)

1.0.0


---

## Loading and defining modules
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

### Autograd Functions
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [2]:
if not initialized[0]:
    # Build and load the three extension variants with torch.utils.cpp_extension.load.
    # Source paths are relative to the notebook's working directory.
    # NOTE: the output recorded under this cell shows the third build
    # ('ln_peephole_lstm_layer_cuda_less_mem') failing with compile errors reported in
    # ln_peephole_lstm_layer_cuda_kernel_less_mem.cu.
    _ln_peephole_lstm_layer_cpp = load(
        name='ln_peephole_lstm_layer',
        sources=['./ln_peephole_lstm_layer.cpp'],
    )
    _ln_peephole_lstm_layer_cuda = load(
        name='ln_peephole_lstm_layer_cuda',
        sources=[
            './ln_peephole_lstm_layer_cuda.cpp',
            './ln_peephole_lstm_layer_cuda_kernel.cu',
        ],
    )
    _ln_peephole_lstm_layer_cuda_less_mem = load(
        name='ln_peephole_lstm_layer_cuda_less_mem',
        sources=[
            './ln_peephole_lstm_layer_cuda_less_mem.cpp',
            './ln_peephole_lstm_layer_cuda_kernel_less_mem.cu',
        ],
    )

    ########################################################################################################################

    class LNPeepholeLSTMFunctionCPP(torch.autograd.Function):
        """Autograd wrapper around the `_ln_peephole_lstm_layer_cpp` extension.

        Both passes are delegated to the extension; this class only routes
        tensors between autograd and the extension's `forward` / `backward`.
        """

        @staticmethod
        def forward(ctx, input, weight_ih, weight_hh, weight_ch, bias,
                    gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell, beta_cell,
                    hidden, cell,
                    epsilon, dropout_p,
                    dropout_output, training):
            """Call the extension's forward and save what backward will need.

            Returns:
                The first three extension results as `(out, new_h, new_cell)`.
            """
            results = _ln_peephole_lstm_layer_cpp.forward(
                input, weight_ih, weight_hh, weight_ch, bias,
                gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell, beta_cell,
                hidden, cell,
                epsilon, dropout_p,
                dropout_output, training)

            # The leading three results are handed back to the caller.
            out, new_h, new_cell = results[:3]

            # Every remaining result is kept for backward, followed by the weights
            # and gammas listed here (bias and beta_cell are not saved by this variant).
            parameters = [weight_ih, weight_hh, weight_ch,
                          gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell]
            ctx.save_for_backward(*(results[3:] + parameters))

            return out, new_h, new_cell

        @staticmethod
        def backward(ctx, grad_output, grad_h, grad_cell):
            """Call the extension's backward and return one gradient per forward input."""
            extension_backward = _ln_peephole_lstm_layer_cpp.backward
            # The incoming gradients are made contiguous before they reach the extension.
            upstream = [grad.contiguous() for grad in (grad_output, grad_h, grad_cell)]
            grads = extension_backward(*upstream, *ctx.saved_tensors)

            # Exactly 13 gradients are expected. They are named after the forward()
            # argument whose slot they are returned in, because autograd assigns
            # gradients to inputs by position.
            # NOTE(review): the order in which the extension emits the gamma_g / gamma_o
            # gradients is not visible here -- confirm it matches forward()'s order (g, then o).
            (g_input, g_weight_ih, g_weight_hh, g_weight_ch, g_bias,
             g_gamma_f, g_gamma_i, g_gamma_g, g_gamma_o, g_gamma_cell, g_beta_cell,
             g_hidden, g_cell) = grads

            tensor_grads = (g_input, g_weight_ih, g_weight_hh, g_weight_ch, g_bias,
                            g_gamma_f, g_gamma_i, g_gamma_g, g_gamma_o, g_gamma_cell, g_beta_cell,
                            g_hidden, g_cell)
            # No gradient for epsilon, dropout_p, dropout_output and training.
            return tensor_grads + (None, None, None, None)
        
    ########################################################################################################################
    
    class LNPeepholeLSTMFunctionCUDA(torch.autograd.Function):
        """Autograd wrapper around the `_ln_peephole_lstm_layer_cuda` extension.

        Both passes are delegated to the extension; this class only routes
        tensors between autograd and the extension's `forward` / `backward`.
        """

        @staticmethod
        def forward(ctx, input, weight_ih, weight_hh, weight_ch, bias,
                    gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell, beta_cell,
                    hidden, cell,
                    epsilon, dropout_p,
                    dropout_output, training):
            """Call the extension's forward and save what backward will need.

            Returns:
                The first three extension results as `(out, new_h, new_cell)`.
            """
            results = _ln_peephole_lstm_layer_cuda.forward(
                input, weight_ih, weight_hh, weight_ch, bias,
                gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell, beta_cell,
                hidden, cell,
                epsilon, dropout_p,
                dropout_output, training)

            # The leading three results are handed back to the caller.
            out, new_h, new_cell = results[:3]

            # Every remaining result is kept for backward, followed by the weights
            # and gammas listed here (bias and beta_cell are not saved by this variant).
            parameters = [weight_ih, weight_hh, weight_ch,
                          gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell]
            ctx.save_for_backward(*(results[3:] + parameters))

            return out, new_h, new_cell

        @staticmethod
        def backward(ctx, grad_output, grad_h, grad_cell):
            """Call the extension's backward and return one gradient per forward input."""
            extension_backward = _ln_peephole_lstm_layer_cuda.backward
            # The incoming gradients are made contiguous before they reach the extension.
            upstream = [grad.contiguous() for grad in (grad_output, grad_h, grad_cell)]
            grads = extension_backward(*upstream, *ctx.saved_tensors)

            # Exactly 13 gradients are expected. They are named after the forward()
            # argument whose slot they are returned in, because autograd assigns
            # gradients to inputs by position.
            # NOTE(review): the order in which the extension emits the gamma_g / gamma_o
            # gradients is not visible here -- confirm it matches forward()'s order (g, then o).
            (g_input, g_weight_ih, g_weight_hh, g_weight_ch, g_bias,
             g_gamma_f, g_gamma_i, g_gamma_g, g_gamma_o, g_gamma_cell, g_beta_cell,
             g_hidden, g_cell) = grads

            tensor_grads = (g_input, g_weight_ih, g_weight_hh, g_weight_ch, g_bias,
                            g_gamma_f, g_gamma_i, g_gamma_g, g_gamma_o, g_gamma_cell, g_beta_cell,
                            g_hidden, g_cell)
            # No gradient for epsilon, dropout_p, dropout_output and training.
            return tensor_grads + (None, None, None, None)
        
    ########################################################################################################################
    
    class LNPeepholeLSTMFunctionCUDALM(torch.autograd.Function):
        """Autograd wrapper around the `_ln_peephole_lstm_layer_cuda_less_mem` extension.

        Both passes are delegated to the extension; this class only routes
        tensors between autograd and the extension's `forward` / `backward`.
        Unlike the other two wrappers in this cell, `bias` is also saved for backward.
        """

        @staticmethod
        def forward(ctx, input, weight_ih, weight_hh, weight_ch, bias,
                    gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell, beta_cell,
                    hidden, cell,
                    epsilon, dropout_p,
                    dropout_output, training):
            """Call the extension's forward and save what backward will need.

            Returns:
                The first three extension results as `(out, new_h, new_cell)`.
            """
            results = _ln_peephole_lstm_layer_cuda_less_mem.forward(
                input, weight_ih, weight_hh, weight_ch, bias,
                gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell, beta_cell,
                hidden, cell,
                epsilon, dropout_p,
                dropout_output, training)

            # The leading three results are handed back to the caller.
            out, new_h, new_cell = results[:3]

            # Every remaining result is kept for backward, followed by the weights,
            # the bias and the gammas listed here (beta_cell is not saved).
            parameters = [weight_ih, weight_hh, weight_ch, bias,
                          gamma_f, gamma_i, gamma_g, gamma_o, gamma_cell]
            ctx.save_for_backward(*(results[3:] + parameters))

            return out, new_h, new_cell

        @staticmethod
        def backward(ctx, grad_output, grad_h, grad_cell):
            """Call the extension's backward and return one gradient per forward input."""
            extension_backward = _ln_peephole_lstm_layer_cuda_less_mem.backward
            # The incoming gradients are made contiguous before they reach the extension.
            upstream = [grad.contiguous() for grad in (grad_output, grad_h, grad_cell)]
            grads = extension_backward(*upstream, *ctx.saved_tensors)

            # Exactly 13 gradients are expected. They are named after the forward()
            # argument whose slot they are returned in, because autograd assigns
            # gradients to inputs by position.
            # NOTE(review): the order in which the extension emits the gamma_g / gamma_o
            # gradients is not visible here -- confirm it matches forward()'s order (g, then o).
            (g_input, g_weight_ih, g_weight_hh, g_weight_ch, g_bias,
             g_gamma_f, g_gamma_i, g_gamma_g, g_gamma_o, g_gamma_cell, g_beta_cell,
             g_hidden, g_cell) = grads

            tensor_grads = (g_input, g_weight_ih, g_weight_hh, g_weight_ch, g_bias,
                            g_gamma_f, g_gamma_i, g_gamma_g, g_gamma_o, g_gamma_cell, g_beta_cell,
                            g_hidden, g_cell)
            # No gradient for epsilon, dropout_p, dropout_output and training.
            return tensor_grads + (None, None, None, None)
        
    # Record that this cell completed, so the `if not initialized[0]` guard at the top
    # skips the extension builds and class definitions when the cell is run again.
    initialized[0] = True



RuntimeError: Error building extension 'ln_peephole_lstm_layer_cuda_less_mem': [1/3] C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin\nvcc -DTORCH_EXTENSION_NAME=ln_peephole_lstm_layer_cuda_less_mem -DTORCH_API_INCLUDE_EXTENSION_H -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\torch\csrc\api\include -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\TH -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\THC "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include" -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\Include -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -c C:\Users\0107w\Anaconda3\envs\pytorch\vs_project\Pytorch_Project\Pytorch_Project\ln_peephole_lstm_layer_cuda_kernel_less_mem.cu -o ln_peephole_lstm_layer_cuda_kernel_less_mem.cuda.o
FAILED: ln_peephole_lstm_layer_cuda_kernel_less_mem.cuda.o 
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin\nvcc -DTORCH_EXTENSION_NAME=ln_peephole_lstm_layer_cuda_less_mem -DTORCH_API_INCLUDE_EXTENSION_H -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\torch\csrc\api\include -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\TH -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\THC "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include" -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\Include -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -c C:\Users\0107w\Anaconda3\envs\pytorch\vs_project\Pytorch_Project\Pytorch_Project\ln_peephole_lstm_layer_cuda_kernel_less_mem.cu -o ln_peephole_lstm_layer_cuda_kernel_less_mem.cuda.o
C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/Exception.h(27): warning: base class dllexport/dllimport specification differs from that of the derived class

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/Exception.h(28): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/Exception.h(29): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/Exception.h(34): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/Exception.h(35): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/core/Allocator.h(126): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/core/TensorTypeIdRegistration.h(32): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/core/TensorTypeIdRegistration.h(45): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/core/TensorTypeIdRegistration.h(46): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(449): warning: dllexport/dllimport conflict with "caffe2::TypeMeta::_typeMetaDataInstance [with T=caffe2::detail::_Uninitialized]"
(445): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(560): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=uint8_t]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(561): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=int8_t]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(562): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=int16_t]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(563): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=int]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(564): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=int64_t]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(565): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=c10::Half]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(566): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=float]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(567): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=double]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(568): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=c10::ComplexHalf]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(569): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::complex<float>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(570): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::complex<double>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(573): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::string]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(574): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=__nv_bool]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(575): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=uint16_t]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(576): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=char]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(577): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::unique_ptr<std::mutex, std::default_delete<std::mutex>>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(578): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::unique_ptr<std::atomic<__nv_bool>, std::default_delete<std::atomic<__nv_bool>>>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(579): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::vector<int32_t, std::allocator<int32_t>>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(580): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::vector<int64_t, std::allocator<int64_t>>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(581): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=std::vector<unsigned long, std::allocator<unsigned long>>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(582): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=__nv_bool *]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(583): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=char *]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(584): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=int *]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(604): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=caffe2::detail::_guard_long_unique<long>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(605): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=caffe2::detail::_guard_long_unique<std::vector<long, std::allocator<long>>>]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/typeid.h(609): warning: dllexport/dllimport conflict with "caffe2::TypeIdentifier::Get [with T=caffe2::_CaffeHighestPreallocatedTypeId]"
(83): here; dllexport assumed

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/intrusive_ptr.h(58): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/intrusive_ptr.h(59): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/core/StorageImpl.h(215): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/core/Storage.h(184): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\c10/util/logging_is_not_google_glog.h(47): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/core/TensorImpl.h(115): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/core/TensorImpl.h(1434): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/core/TensorImpl.h(1435): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/core/Tensor.h(692): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/core/Tensor.h(720): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/Context.h(145): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/Context.h(146): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/Context.h(147): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/Context.h(151): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/Context.h(152): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/Context.h(153): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/TensorGeometry.h(56): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch1.0/lib/site-packages/torch/lib/include\ATen/TensorGeometry.h(57): warning: field of class type without a DLL interface used in a class with a DLL interface

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(937): error: identifier "gates_fig" is undefined

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(939): error: identifier "tanh_new_cells" is undefined

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: identifier "gates_o" is undefined

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: identifier "gates_o" is undefined

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: type name is not allowed

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(963): error: expected an expression

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(448): error: identifier "d_sigmoid_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=double]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(461): error: identifier "d_sigmoid_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=double]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(475): error: identifier "d_tanh_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=double]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(487): error: identifier "d_sigmoid_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=double]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(505): error: identifier "d_tanh_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=double]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(448): error: identifier "d_sigmoid_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=float]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(461): error: identifier "d_sigmoid_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=float]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(475): error: identifier "d_tanh_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=float]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(487): error: identifier "d_sigmoid_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=float]" 
(963): here

C:/Users/0107w/Anaconda3/envs/pytorch/vs_project/Pytorch_Project/Pytorch_Project/ln_peephole_lstm_layer_cuda_kernel_less_mem.cu(505): error: identifier "d_tanh_with_output" is undefined
          detected during instantiation of "void backward_preparation(scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, const scalar_t *, scalar_t *, scalar_t *, scalar_t *, scalar_t *, const scalar_t *, scalar_t *, int64_t, int64_t, int64_t, int64_t, int64_t) [with scalar_t=float]" 
(963): here

30 errors detected in the compilation of "C:/Users/0107w/AppData/Local/Temp/tmpxft_00001ec8_00000000-10_ln_peephole_lstm_layer_cuda_kernel_less_mem.cpp1.ii".
ln_peephole_lstm_layer_cuda_kernel_less_mem.cu
[2/3] cl /showIncludes -DTORCH_EXTENSION_NAME=ln_peephole_lstm_layer_cuda_less_mem -DTORCH_API_INCLUDE_EXTENSION_H -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\torch\csrc\api\include -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\TH -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\THC "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include" -IC:\Users\0107w\Anaconda3\envs\pytorch1.0\Include -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++11 -c C:\Users\0107w\Anaconda3\envs\pytorch\vs_project\Pytorch_Project\Pytorch_Project\ln_peephole_lstm_layer_cuda_less_mem.cpp /Foln_peephole_lstm_layer_cuda_less_mem.o
Microsoft (R) C/C++ Optimizing Compiler Version 19.11.25548.2 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.

cl : Command line warning D9002 : ignoring unknown option '-fPIC'
cl : Command line warning D9002 : ignoring unknown option '-std=c++11'
C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\include\xlocale(314): warning C4530: C++ exception handler used, but unwind semantics are not enabled. Specify /EHsc
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/typeid.h(433): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/typeid.h(434): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/typeid.h(435): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/typeid.h(436): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/typeid.h(438): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(75): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(76): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(77): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(78): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(79): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(86): warning C4068: unknown pragma
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(80): warning C4297: 'c10::intrusive_ptr_target::~intrusive_ptr_target': function assumed not to throw an exception but does
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(80): note: destructor or deallocator has a (possibly implicit) non-throwing exception specification
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(83): warning C4297: 'c10::intrusive_ptr_target::~intrusive_ptr_target': function assumed not to throw an exception but does
C:\Users\0107w\Anaconda3\envs\pytorch1.0\lib\site-packages\torch\lib\include\c10/util/intrusive_ptr.h(83): note: destructor or deallocator has a (possibly implicit) non-throwing exception specification
Including torch/torch.h for C++ extensions is deprecated. Please include torch/extension.h
C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\include\iosfwd(292): warning C4577: 'noexcept' used with no exception handling mode specified; termination on exception is not guaranteed. Specify /EHsc
ninja: build stopped: subcommand failed.


### Module classes (C++, CUDA, PyTorch)
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [14]:
if not initialized[1]:
    class LNPeepholeLSTMTorch(nn.Module):
        """Single layer-normalized peephole LSTM layer written with plain PyTorch ops.

        The ``4 * hidden_size`` axis of ``weight_ih``, ``weight_hh`` and ``bias`` is laid out
        in the gate order forget (f), input (i), candidate (g), output (o). ``weight_ch`` holds
        the three diagonal peephole weights (cell -> f, cell -> i, cell -> o). Layer
        normalization is applied to every gate pre-activation and to the new cell state.

        Args:
            input_size: number of features of each input step.
            hidden_size: number of features of the hidden and cell states.
            batch_first: if True, `input` and the returned outputs are laid out as
                (batch, seq, feature) instead of (seq, batch, feature).
            dropout: dropout probability in [0, 1]; applied to the candidate cell and,
                when `dropout_on_output` is True, to the returned outputs.
            dropout_on_output: whether to apply dropout to the returned output sequence.
            eps: epsilon handed to every layer normalization.

        Raises:
            ValueError: if `dropout` is outside [0, 1].
        """

        def __init__(self, input_size, hidden_size, batch_first=False, dropout=0., dropout_on_output=True, eps=1e-05):
            if not 0 <= dropout <= 1:
                raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
            super(LNPeepholeLSTMTorch, self).__init__()

            self.input_size = input_size
            self.hidden_size = hidden_size
            self.batch_first = bool(batch_first)
            self.dropout = float(dropout)
            self.dropout_on_output = bool(dropout_on_output)
            self.eps = eps

            self.register_parameter('weight_ih', nn.Parameter(torch.empty(4 * hidden_size, input_size)))
            self.register_parameter('weight_hh', nn.Parameter(torch.empty(4 * hidden_size, hidden_size)))
            self.register_parameter('weight_ch', nn.Parameter(torch.empty(3 * hidden_size)))
            self.register_parameter('bias', nn.Parameter(torch.empty(4 * hidden_size)))

            # Layer-norm gains: one per gate plus gain/offset for the cell state.
            self.register_parameter('gamma_f', nn.Parameter(torch.empty(hidden_size)))
            self.register_parameter('gamma_i', nn.Parameter(torch.empty(hidden_size)))
            self.register_parameter('gamma_g', nn.Parameter(torch.empty(hidden_size)))
            self.register_parameter('gamma_o', nn.Parameter(torch.empty(hidden_size)))
            self.register_parameter('gamma_cell', nn.Parameter(torch.empty(hidden_size)))
            self.register_parameter('beta_cell', nn.Parameter(torch.empty(hidden_size)))

            self.reset_parameters()

        def reset_parameters(self):
            """Re-initialize all parameters in place.

            Weights are drawn uniformly from ``[-stdv, stdv]`` with
            ``stdv = 1 / sqrt(input_size + 2 * hidden_size)``; the bias is zero except for
            the first ``hidden_size`` entries (forget gate), which are set to 1; the gains
            are drawn from U[0, 1) and ``beta_cell`` is zero.
            """
            stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
            self.weight_ih.data.uniform_(-stdv, +stdv)
            self.weight_hh.data.uniform_(-stdv, +stdv)
            self.weight_ch.data.uniform_(-stdv, +stdv)

            self.bias.data.zero_()
            self.bias.data[:self.hidden_size].fill_(1.)

            self.gamma_f.data.uniform_()
            self.gamma_i.data.uniform_()
            self.gamma_g.data.uniform_()
            self.gamma_o.data.uniform_()
            self.gamma_cell.data.uniform_()
            self.beta_cell.data.zero_()

        def forward(self, input, states):
            """Run the layer over a whole sequence.

            Args:
                input: 3-D tensor, (seq, batch, input_size), or (batch, seq, input_size)
                    when `batch_first` is True.
                states: ``(hidden, cell)`` pair, each of shape (batch, hidden_size).

            Returns:
                ``(outputs, (hidden, cell))`` where `outputs` has the same layout as `input`
                with `hidden_size` features and `hidden` / `cell` are the final states.

            Raises:
                AssertionError: if `input` is not 3-D, `states` is not a pair, or a state
                    does not have shape (batch, hidden_size).
            """
            assert input.dim() == 3, "expected a 3 dimensional tensor as `input`, but the given tensor has {} dimension(s)".format(input.dim())
            assert len(states) == 2, "expected a (hidden, cell) pair as `states`, but the length of the given states is {}".format(len(states))
            if self.batch_first:
                input = input.transpose(0, 1).contiguous()
            # Messages report the expected shape first, then the shape actually received.
            assert states[0].size() == (input.size(1), self.hidden_size), "expected a hidden state tensor with dimensionality {}, but the given tensor has dimensionality {}".format([input.size(1), self.hidden_size], list(states[0].size()))
            assert states[1].size() == (input.size(1), self.hidden_size), "expected a cell state tensor with dimensionality {}, but the given tensor has dimensionality {}".format([input.size(1), self.hidden_size], list(states[1].size()))

            hidden, cell = states

            hidden_size = self.hidden_size
            hidden_size_2 = 2 * hidden_size
            hidden_size_3 = hidden_size_2 + hidden_size

            norm_shape = torch.Size((hidden_size,))

            outputs = input.new_empty((input.size(0), input.size(1), hidden_size))

            # Input contribution for all time steps at once.
            ih = input.matmul(self.weight_ih.t())

            # (2 * hidden_size, 4 * hidden_size) matrix applied to cat(hidden, cell): the top half
            # is weight_hh^T, the bottom half places the diagonal f/i peepholes in the first two
            # gate blocks and zeros in the g/o blocks (the o peephole uses the *new* cell, below).
            weight_hc_h = torch.cat((self.weight_hh.t(),
                                     torch.cat((self.weight_ch[:hidden_size].diag(),
                                                self.weight_ch[hidden_size:hidden_size_2].diag(),
                                                self.weight_ch.new_zeros(hidden_size_2, hidden_size))).t()))
            weight_co = self.weight_ch[hidden_size_2:]

            gamma_fig = torch.stack((self.gamma_f, self.gamma_i, self.gamma_g))

            bias_fig = torch.stack(self.bias[:hidden_size_3].chunk(3, dim=0))
            bias_o = self.bias[hidden_size_3:]

            for i in range(input.size(0)):
                gates = torch.addmm(ih[i], torch.cat((hidden, cell), dim=1), weight_hc_h).view(-1, 4, hidden_size)
                gates_fig = gates[:, :3]

                # Normalize f, i, g separately over the feature axis, then scale and shift.
                gates_fig = F.layer_norm(gates_fig, norm_shape, eps=self.eps)
                gates_fig = torch.addcmul(bias_fig, gates_fig, gamma_fig)
                forget_input_gates = gates_fig[:, :2].sigmoid()
                candidate_cell = F.dropout(gates_fig[:, 2].tanh(), p=self.dropout, training=self.training)

                # cell = LN(f * cell + i * candidate)
                cell = F.layer_norm(torch.addcmul(forget_input_gates[:, 0] * cell,
                                                  forget_input_gates[:, 1], candidate_cell),
                                    norm_shape, self.gamma_cell, self.beta_cell, self.eps)

                # Output-gate peephole looks at the freshly updated cell state.
                output_gate = torch.addcmul(gates[:, 3], cell, weight_co)

                output_gate = F.layer_norm(output_gate, norm_shape, self.gamma_o, bias_o, self.eps).sigmoid()

                hidden = output_gate * cell.tanh()

                outputs[i] = hidden

            if self.dropout_on_output:
                outputs = F.dropout(outputs, p=self.dropout, training=self.training)

            if self.batch_first:
                outputs = outputs.transpose(0, 1).contiguous()

            return outputs, (hidden, cell)

        def __repr__(self):
            return f"LNPeepholeLSTMTorch(input_size={self.input_size}, hidden_size={self.hidden_size}, batch_first={self.batch_first}, dropout={self.dropout}, dropout_on_output={self.dropout_on_output}, eps={self.eps})"

    ########################################################################################################################

    class LNPeepholeLSTMCPP(nn.Module):
        """Layer-normalized peephole LSTM layer that delegates to `LNPeepholeLSTMFunctionCPP`.

        Owns the same ten parameters as the other layer variants in this notebook
        (`weight_ih`, `weight_hh`, `weight_ch`, `bias`, four gate gains, `gamma_cell`,
        `beta_cell`) and forwards them, together with the runtime options, to the
        autograd function.
        """

        def __init__(self, input_size, hidden_size, batch_first=False, dropout=0., dropout_on_output=True, eps=1e-05):
            if not (0 <= dropout <= 1):
                raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")

            super().__init__()

            self.input_size = input_size
            self.hidden_size = hidden_size
            self.batch_first = bool(batch_first)
            self.dropout = float(dropout)
            self.dropout_on_output = bool(dropout_on_output)
            self.eps = eps

            # Attribute assignment registers the parameters in the same order as before.
            gate_rows = 4 * hidden_size
            self.weight_ih = nn.Parameter(torch.empty(gate_rows, input_size))
            self.weight_hh = nn.Parameter(torch.empty(gate_rows, hidden_size))
            self.weight_ch = nn.Parameter(torch.empty(3 * hidden_size))
            self.bias = nn.Parameter(torch.empty(gate_rows))

            for vector_name in ('gamma_f', 'gamma_i', 'gamma_g', 'gamma_o', 'gamma_cell', 'beta_cell'):
                self.register_parameter(vector_name, nn.Parameter(torch.empty(hidden_size)))

            self.reset_parameters()

        def reset_parameters(self):
            """Initialize weights uniformly, the bias to 0 (first `hidden_size` entries to 1),
            the gains from U[0, 1) and `beta_cell` to 0."""
            bound = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
            with torch.no_grad():
                for weight in (self.weight_ih, self.weight_hh, self.weight_ch):
                    weight.uniform_(-bound, +bound)

                self.bias.zero_()
                self.bias[:self.hidden_size].fill_(1.)

                for gain in (self.gamma_f, self.gamma_i, self.gamma_g, self.gamma_o, self.gamma_cell):
                    gain.uniform_()
                self.beta_cell.zero_()

        def forward(self, input, state):
            """Apply the C++ autograd function; `state` is indexed as (hidden, cell).

            When `batch_first` is set, `input` is transposed to sequence-first before the
            call and the output is transposed back afterwards.
            """
            sequence_first = input.transpose(0, 1).contiguous() if self.batch_first else input

            output, new_h, new_cell = LNPeepholeLSTMFunctionCPP.apply(
                sequence_first,
                self.weight_ih, self.weight_hh, self.weight_ch, self.bias,
                self.gamma_f, self.gamma_i, self.gamma_g, self.gamma_o,
                self.gamma_cell, self.beta_cell,
                state[0], state[1],
                self.eps, self.dropout, self.dropout_on_output, self.training)

            if self.batch_first:
                output = output.transpose(0, 1).contiguous()

            return output, (new_h, new_cell)

        def __repr__(self):
            return f"LNPeepholeLSTMCPP(input_size={self.input_size}, hidden_size={self.hidden_size}, batch_first={self.batch_first}, dropout={self.dropout}, dropout_on_output={self.dropout_on_output}, eps={self.eps})"
    
    ########################################################################################################################
    
    class LNPeepholeLSTMCUDA(nn.Module):
        """Layer-normalized peephole LSTM layer that delegates to `LNPeepholeLSTMFunctionCUDA`.

        Owns the same ten parameters as the other layer variants in this notebook
        (`weight_ih`, `weight_hh`, `weight_ch`, `bias`, four gate gains, `gamma_cell`,
        `beta_cell`) and forwards them, together with the runtime options, to the
        autograd function.
        """

        def __init__(self, input_size, hidden_size, batch_first=False, dropout=0., dropout_on_output=True, eps=1e-05):
            if not (0 <= dropout <= 1):
                raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")

            super().__init__()

            self.input_size = input_size
            self.hidden_size = hidden_size
            self.batch_first = bool(batch_first)
            self.dropout = float(dropout)
            self.dropout_on_output = bool(dropout_on_output)
            self.eps = eps

            # Attribute assignment registers the parameters in the same order as before.
            gate_rows = 4 * hidden_size
            self.weight_ih = nn.Parameter(torch.empty(gate_rows, input_size))
            self.weight_hh = nn.Parameter(torch.empty(gate_rows, hidden_size))
            self.weight_ch = nn.Parameter(torch.empty(3 * hidden_size))
            self.bias = nn.Parameter(torch.empty(gate_rows))

            for vector_name in ('gamma_f', 'gamma_i', 'gamma_g', 'gamma_o', 'gamma_cell', 'beta_cell'):
                self.register_parameter(vector_name, nn.Parameter(torch.empty(hidden_size)))

            self.reset_parameters()

        def reset_parameters(self):
            """Initialize weights uniformly, the bias to 0 (first `hidden_size` entries to 1),
            the gains from U[0, 1) and `beta_cell` to 0."""
            bound = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
            with torch.no_grad():
                for weight in (self.weight_ih, self.weight_hh, self.weight_ch):
                    weight.uniform_(-bound, +bound)

                self.bias.zero_()
                self.bias[:self.hidden_size].fill_(1.)

                for gain in (self.gamma_f, self.gamma_i, self.gamma_g, self.gamma_o, self.gamma_cell):
                    gain.uniform_()
                self.beta_cell.zero_()

        def forward(self, input, state):
            """Apply the CUDA autograd function; `state` is indexed as (hidden, cell).

            When `batch_first` is set, `input` is transposed to sequence-first before the
            call and the output is transposed back afterwards.
            """
            sequence_first = input.transpose(0, 1).contiguous() if self.batch_first else input

            output, new_h, new_cell = LNPeepholeLSTMFunctionCUDA.apply(
                sequence_first,
                self.weight_ih, self.weight_hh, self.weight_ch, self.bias,
                self.gamma_f, self.gamma_i, self.gamma_g, self.gamma_o,
                self.gamma_cell, self.beta_cell,
                state[0], state[1],
                self.eps, self.dropout, self.dropout_on_output, self.training)

            if self.batch_first:
                output = output.transpose(0, 1).contiguous()

            return output, (new_h, new_cell)

        def __repr__(self):
            return f"LNPeepholeLSTMCUDA(input_size={self.input_size}, hidden_size={self.hidden_size}, batch_first={self.batch_first}, dropout={self.dropout}, dropout_on_output={self.dropout_on_output}, eps={self.eps})"
    
    initialized[1] = True

---

## Defining models
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

### Definition
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [15]:
if not initialized[2]:
    class LNPeepholeTorch(nn.Module):
        """`n_layers` stacked, batch-first `LNPeepholeLSTMTorch` layers plus a linear read-out.

        The layers are registered as `lstm0`, `lstm1`, ...; `fc` maps `hidden_size`
        features to `output_size`.
        """

        def __init__(self, input_size, hidden_size, output_size, n_layers=1, dropout=0, dropout_on_output=True, eps=1e-05):
            super().__init__()
            assert isinstance(n_layers, int)
            assert n_layers > 0

            # Only layer 0 consumes the raw input width; deeper layers map hidden -> hidden.
            for n in range(n_layers):
                layer_input_size = input_size if n == 0 else hidden_size
                self.add_module("lstm{}".format(n),
                                LNPeepholeLSTMTorch(layer_input_size, hidden_size, True, dropout, dropout_on_output, eps))
            self.fc = nn.Linear(hidden_size, output_size)

            self.input_size = input_size
            self.hidden_size = hidden_size
            self.output_size = output_size
            self.n_layers = n_layers

        def forward(self, x, states):
            """Run `x` (batch on dim 0) through every layer and the read-out.

            `states` is a (hidden, cell) pair, each shaped (n_layers, batch, hidden_size).
            Returns the projected outputs and the new (hidden, cell) pair in that same shape.
            """
            hidden_in, cell_in = states[0], states[1]
            assert hidden_in.dim() == 3
            assert hidden_in.size(0) == self.n_layers
            assert hidden_in.size(1) == x.size(0)
            assert hidden_in.size(2) == self.hidden_size
            assert hidden_in.size() == cell_in.size()

            new_hidden = torch.empty_like(hidden_in)
            new_cell = torch.empty_like(cell_in)

            for n in range(self.n_layers):
                layer = getattr(self, "lstm{}".format(n))
                x, (new_hidden[n], new_cell[n]) = layer(x, (hidden_in[n], cell_in[n]))

            return self.fc(x), (new_hidden, new_cell)

    ########################################################################################################################

    class LNPeepholeCPP(nn.Module):
        """`n_layers` stacked, batch-first `LNPeepholeLSTMCPP` layers plus a linear read-out.

        The layers are registered as `lstm0`, `lstm1`, ...; `fc` maps `hidden_size`
        features to `output_size`.
        """

        def __init__(self, input_size, hidden_size, output_size, n_layers=1, dropout=0, dropout_on_output=True, eps=1e-05):
            super().__init__()
            assert isinstance(n_layers, int)
            assert n_layers > 0

            # Only layer 0 consumes the raw input width; deeper layers map hidden -> hidden.
            for n in range(n_layers):
                layer_input_size = input_size if n == 0 else hidden_size
                self.add_module("lstm{}".format(n),
                                LNPeepholeLSTMCPP(layer_input_size, hidden_size, True, dropout, dropout_on_output, eps))
            self.fc = nn.Linear(hidden_size, output_size)

            self.input_size = input_size
            self.hidden_size = hidden_size
            self.output_size = output_size
            self.n_layers = n_layers

        def forward(self, x, states):
            """Run `x` (batch on dim 0) through every layer and the read-out.

            `states` is a (hidden, cell) pair, each shaped (n_layers, batch, hidden_size).
            Returns the projected outputs and the new (hidden, cell) pair in that same shape.
            """
            hidden_in, cell_in = states[0], states[1]
            assert hidden_in.dim() == 3
            assert hidden_in.size(0) == self.n_layers
            assert hidden_in.size(1) == x.size(0)
            assert hidden_in.size(2) == self.hidden_size
            assert hidden_in.size() == cell_in.size()

            new_hidden = torch.empty_like(hidden_in)
            new_cell = torch.empty_like(cell_in)

            for n in range(self.n_layers):
                layer = getattr(self, "lstm{}".format(n))
                x, (new_hidden[n], new_cell[n]) = layer(x, (hidden_in[n], cell_in[n]))

            return self.fc(x), (new_hidden, new_cell)
    
    ########################################################################################################################
    
    class LNPeepholeCUDA(nn.Module):
        """`n_layers` stacked, batch-first `LNPeepholeLSTMCUDA` layers plus a linear read-out.

        The layers are registered as `lstm0`, `lstm1`, ...; `fc` maps `hidden_size`
        features to `output_size`.
        """

        def __init__(self, input_size, hidden_size, output_size, n_layers=1, dropout=0, dropout_on_output=True, eps=1e-05):
            super().__init__()
            assert isinstance(n_layers, int)
            assert n_layers > 0

            # Only layer 0 consumes the raw input width; deeper layers map hidden -> hidden.
            for n in range(n_layers):
                layer_input_size = input_size if n == 0 else hidden_size
                self.add_module("lstm{}".format(n),
                                LNPeepholeLSTMCUDA(layer_input_size, hidden_size, True, dropout, dropout_on_output, eps))
            self.fc = nn.Linear(hidden_size, output_size)

            self.input_size = input_size
            self.hidden_size = hidden_size
            self.output_size = output_size
            self.n_layers = n_layers

        def forward(self, x, states):
            """Run `x` (batch on dim 0) through every layer and the read-out.

            `states` is a (hidden, cell) pair, each shaped (n_layers, batch, hidden_size).
            Returns the projected outputs and the new (hidden, cell) pair in that same shape.
            """
            hidden_in, cell_in = states[0], states[1]
            assert hidden_in.dim() == 3
            assert hidden_in.size(0) == self.n_layers
            assert hidden_in.size(1) == x.size(0)
            assert hidden_in.size(2) == self.hidden_size
            assert hidden_in.size() == cell_in.size()

            new_hidden = torch.empty_like(hidden_in)
            new_cell = torch.empty_like(cell_in)

            for n in range(self.n_layers):
                layer = getattr(self, "lstm{}".format(n))
                x, (new_hidden[n], new_cell[n]) = layer(x, (hidden_in[n], cell_in[n]))

            return self.fc(x), (new_hidden, new_cell)
    
    initialized[2] = True

### Instantiation
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [16]:
if not initialized[3]:
    # Index 1 selects 'cuda'; change it to 0 to place the models on the CPU.
    device = ('cpu', 'cuda')[1]

    # The `#TEST` values appear to be the small sizes used for the sanity-check
    # outputs shown further down in the notebook -- TODO confirm.
    input_size = 200 #TEST 5
    hidden_size = 500 #TEST 8
    output_size = 100 #TEST 6
    n_layers = 4 #TEST 3
    dropout = 0. #TEST 0
    eps = 1e-05 #TEST 1e-05

    # `eps` must be passed by keyword: the sixth positional parameter of the model
    # constructors is `dropout_on_output`, so a positional `eps` would be swallowed
    # by that flag and the layers would silently use the default epsilon.
    model_torch = LNPeepholeTorch(input_size, hidden_size, output_size, n_layers, dropout, eps=eps)
    model_cpp = LNPeepholeCPP(input_size, hidden_size, output_size, n_layers, dropout, eps=eps)
    model_cuda = LNPeepholeCUDA(input_size, hidden_size, output_size, n_layers, dropout, eps=eps)

    model_torch.to(device)
    model_cpp.to(device)
    model_cuda.to(device)

    models = (model_torch, model_cpp, model_cuda)
    
    initialized[3] = True

### Parameter Synchronization
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [17]:
if not initialized[4]:
    # One {parameter name: parameter} mapping per model; index 0 (the PyTorch model) is the reference.
    named_parameter_dicts = [
        dict(model_torch.named_parameters()),
        dict(model_cpp.named_parameters()),
        dict(model_cuda.named_parameters())
    ]

    param_name_sets = [set(npd.keys()) for npd in named_parameter_dicts]
    common_param_names = set.intersection(*param_name_sets)
    exclusive_param_names = set.union(*param_name_sets) - common_param_names

    print("Synchronized Parameters:\n")
    # Sorted so that the listing is identical on every run (set order is not stable for strings).
    for common_param_name in sorted(common_param_names):
        print("\t{}".format(common_param_name))
        reference_param = named_parameter_dicts[0][common_param_name]
        for i in range(1, len(named_parameter_dicts)):
            target_param = named_parameter_dicts[i][common_param_name]
            if target_param.size() == reference_param.size():
                # Rebinds `.data` to the reference tensor itself; no clone is made here.
                target_param.data = reference_param.data
            else:
                # All fields are positional: a named `{i}` field here would make
                # str.format raise KeyError instead of reporting the mismatch.
                raise RuntimeError("Size mismatch\n0:{}\n{}:{}".format(reference_param.size(),
                                                                       i,
                                                                       target_param.size()))
    print()
    print("Exclusive Parameters (Not Synchronized):\n")
    for exclusive_param_name in sorted(exclusive_param_names):
        print("\t{}".format(exclusive_param_name))
        
    initialized[4] = True

---

## Creating a fake dataset
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [18]:
if not initialized[5]:
    def create_fake_loader(dataset_size, sequence_length, batch_size, drop_last=True):
        """Build a DataLoader over random inputs and random integer class targets.

        Inputs are standard-normal with shape (dataset_size, sequence_length, input_size);
        targets are int64 values in [0, output_size) with shape (dataset_size, sequence_length).
        `input_size` and `output_size` are read from the notebook's global namespace.
        """
        random_inputs = torch.randn(dataset_size, sequence_length, input_size)
        random_targets = torch.randint(high=output_size,
                                       size=(dataset_size, sequence_length),
                                       dtype=torch.int64)

        return DataLoader(TensorDataset(random_inputs, random_targets),
                          batch_size=batch_size,
                          drop_last=drop_last)
    
    initialized[5] = True

In [19]:
if not initialized[6]:
    dataset_size = 1000
    sequence_length = 20 #TEST 20
    batch_size = 8 #TEST 8

    fake_loader = create_fake_loader(dataset_size, sequence_length, batch_size)
    # Show the (inputs, targets) shapes of the first batch as a quick check.
    print(*(batch_part.size() for batch_part in next(iter(fake_loader))))
    
    initialized[6] = True

---

## Sanity check: output comparison
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

### Forward Outputs
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [9]:
# Zero initial (hidden, cell) states shared by all three models.
hidden = tuple(torch.zeros(n_layers, batch_size, hidden_size, device=device) for _ in range(2))

# Inputs of the first batch only; the targets are not needed for the forward comparison.
inputs = next(iter(fake_loader))[0].to(device)

# Training mode for every model (swap in `model.eval()` to compare in evaluation mode).
for model in models:
    model.train()
#     model.eval()
del model # Removing temporary variable

In [11]:
# Forward pass of the pure-PyTorch model; no_grad() skips building the autograd graph.
with torch.no_grad():
    print("[model_torch]", "\n{partial output}", sep="\n")
    out0 = model_torch(inputs, hidden)
    print(out0)

[model_torch]

{partial output}
(tensor([[[ 2.1202e-01,  2.8098e-01, -1.6853e-01,  1.5387e-01,  1.8006e-01,
          -1.8520e-01],
         [ 3.6610e-01,  2.3259e-01, -9.6601e-02,  1.8165e-01,  1.6299e-01,
          -2.6788e-01],
         [ 4.3691e-01,  1.9658e-01, -6.7583e-02,  2.0776e-01,  1.5565e-01,
          -2.6871e-01],
         [ 4.6849e-01,  1.7438e-01, -4.7811e-02,  2.3025e-01,  1.5583e-01,
          -2.6347e-01],
         [ 4.6763e-01,  1.6628e-01, -4.6389e-02,  2.3983e-01,  1.6276e-01,
          -2.5412e-01],
         [ 4.6391e-01,  1.6435e-01, -4.9311e-02,  2.4512e-01,  1.7111e-01,
          -2.4879e-01],
         [ 4.6622e-01,  1.6598e-01, -4.9185e-02,  2.4759e-01,  1.7508e-01,
          -2.4846e-01],
         [ 4.5785e-01,  1.6126e-01, -5.4186e-02,  2.4772e-01,  1.7651e-01,
          -2.4020e-01],
         [ 4.6395e-01,  1.6431e-01, -4.9389e-02,  2.4981e-01,  1.7715e-01,
          -2.4493e-01],
         [ 4.4696e-01,  1.5937e-01, -5.8403e-02,  2.4697e-01,  1.7707e-01,
 

In [10]:
# Forward pass of the C++-extension model; no_grad() skips building the autograd graph.
with torch.no_grad():
    print("[model_cpp]", "\n{partial output}", sep="\n")
    out1 = model_cpp(inputs, hidden)
    print(out1)

[model_cpp]

{partial output}
(tensor([[[ 2.1202e-01,  2.8098e-01, -1.6853e-01,  1.5387e-01,  1.8006e-01,
          -1.8520e-01],
         [ 3.6610e-01,  2.3259e-01, -9.6601e-02,  1.8165e-01,  1.6299e-01,
          -2.6788e-01],
         [ 4.3691e-01,  1.9658e-01, -6.7583e-02,  2.0776e-01,  1.5565e-01,
          -2.6871e-01],
         [ 4.6849e-01,  1.7438e-01, -4.7811e-02,  2.3025e-01,  1.5583e-01,
          -2.6347e-01],
         [ 4.6763e-01,  1.6628e-01, -4.6389e-02,  2.3983e-01,  1.6276e-01,
          -2.5412e-01],
         [ 4.6391e-01,  1.6435e-01, -4.9311e-02,  2.4512e-01,  1.7111e-01,
          -2.4879e-01],
         [ 4.6622e-01,  1.6598e-01, -4.9184e-02,  2.4759e-01,  1.7508e-01,
          -2.4846e-01],
         [ 4.5785e-01,  1.6126e-01, -5.4186e-02,  2.4772e-01,  1.7651e-01,
          -2.4020e-01],
         [ 4.6395e-01,  1.6431e-01, -4.9389e-02,  2.4981e-01,  1.7715e-01,
          -2.4493e-01],
         [ 4.4696e-01,  1.5937e-01, -5.8403e-02,  2.4697e-01,  1.7707e-01,
   

In [12]:
# Summed absolute difference between the PyTorch and C++ outputs; near zero means they agree.
with torch.no_grad():
    print(torch.sum(torch.abs(out0[0] - out1[0])))

tensor(2.2938e-05, device='cuda:0')


In [13]:
# Forward pass of the CUDA-extension model; no_grad() skips building the autograd graph.
with torch.no_grad():
    print("[model_cuda]", "\n{partial output}", sep="\n")
    out2 = model_cuda(inputs, hidden)
    print(out2)

[model_cuda]

{partial output}
(tensor([[[ 2.1202e-01,  2.8098e-01, -1.6853e-01,  1.5387e-01,  1.8006e-01,
          -1.8520e-01],
         [ 3.6610e-01,  2.3259e-01, -9.6601e-02,  1.8165e-01,  1.6299e-01,
          -2.6788e-01],
         [ 4.3691e-01,  1.9658e-01, -6.7583e-02,  2.0776e-01,  1.5565e-01,
          -2.6871e-01],
         [ 4.6849e-01,  1.7438e-01, -4.7811e-02,  2.3025e-01,  1.5583e-01,
          -2.6347e-01],
         [ 4.6763e-01,  1.6628e-01, -4.6389e-02,  2.3983e-01,  1.6276e-01,
          -2.5412e-01],
         [ 4.6391e-01,  1.6435e-01, -4.9311e-02,  2.4512e-01,  1.7111e-01,
          -2.4879e-01],
         [ 4.6622e-01,  1.6598e-01, -4.9184e-02,  2.4759e-01,  1.7508e-01,
          -2.4846e-01],
         [ 4.5785e-01,  1.6126e-01, -5.4186e-02,  2.4772e-01,  1.7651e-01,
          -2.4020e-01],
         [ 4.6395e-01,  1.6431e-01, -4.9389e-02,  2.4981e-01,  1.7715e-01,
          -2.4493e-01],
         [ 4.4696e-01,  1.5937e-01, -5.8403e-02,  2.4697e-01,  1.7707e-01,
  

In [14]:
# Summed absolute difference between the PyTorch and CUDA outputs; near zero means they agree.
with torch.no_grad():
    print(torch.sum(torch.abs(out0[0] - out2[0])))

tensor(2.4626e-05, device='cuda:0')


### Backward Gradients
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [23]:
criterion = nn.CrossEntropyLoss()

# Leaf (hidden, cell) states that require grad, so their gradients can be compared too.
hidden = tuple(
    torch.zeros(n_layers, batch_size, hidden_size, device=device, requires_grad=True)
    for _ in range(2)
)

inputs, targets = (batch_part.to(device) for batch_part in next(iter(fake_loader)))
inputs.requires_grad_()

# One entry per model, in the order of `models`.
inputs_grads, hidden_grads = [], []
for model in models:
    model.train()
#     model.eval()
    model.zero_grad()
    predictions = model(inputs, hidden)[0]
    loss = criterion(predictions.flatten(0, 1), targets.flatten(0, 1))
    print(loss)
    loss.backward()
    # Snapshot the gradients, then clear them so the next model starts from zero.
    inputs_grads.append(inputs.grad.clone())
    hidden_grads.append(tuple(state.grad.clone() for state in hidden))
    for leaf in (inputs, *hidden):
        leaf.grad.zero_()

tensor(1.8188, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8188, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8188, device='cuda:0', grad_fn=<NllLossBackward>)


In [24]:
# Gradients recorded for the pure-PyTorch model: inputs, initial states, then two
# first-layer parameters.
print("model_torch",
      inputs_grads[0],
      hidden_grads[0],
      model_torch.lstm0.bias.grad,
      model_torch.lstm0.gamma_cell.grad,
      sep="\n")
# Optional: per-gate slices of the first layer's input-to-hidden weight gradient.
# for gate in range(4):
#     print(model_torch.lstm0.weight_ih.grad[gate * hidden_size:(gate + 1) * hidden_size])

model_torch
tensor([[[-2.3392e-03, -2.0866e-03,  4.1625e-03,  1.3977e-03, -6.6969e-04],
         [ 3.0949e-06, -6.4528e-05, -4.0021e-05, -2.5677e-05, -4.5529e-05],
         [-9.9979e-06, -4.5025e-05, -3.8840e-05, -9.1775e-06, -3.5822e-05],
         [-1.3572e-05,  3.8747e-06, -3.8844e-05,  2.1861e-05, -4.7407e-05],
         [-8.0598e-06, -1.4133e-05, -2.3156e-05,  1.3458e-05, -2.8517e-05],
         [-1.5293e-05, -1.5984e-05, -4.8014e-05,  2.4379e-05, -3.9248e-05],
         [ 6.6737e-06,  1.6129e-05, -6.1013e-05,  2.5314e-05, -2.6347e-05],
         [ 6.0097e-06, -8.5343e-06, -1.9613e-05, -3.6052e-06, -2.2111e-05],
         [ 6.9568e-06, -1.2979e-05, -3.1409e-05,  8.0041e-06, -2.7936e-05],
         [ 2.1220e-05,  2.2263e-05, -7.3463e-06, -6.5337e-06, -3.5929e-05],
         [ 1.8278e-05,  2.1283e-05, -1.6896e-05,  7.5421e-06, -1.5575e-05],
         [ 3.8402e-05,  4.7894e-05,  1.6105e-05,  2.6976e-05,  2.1125e-05],
         [ 9.9178e-06,  8.8279e-06, -1.7362e-05, -1.7223e-06, -7.8249e-06],


(tensor([[[ 2.1951e-03, -2.2889e-03,  3.8057e-03,  5.8254e-05, -4.5365e-03,
          -1.5063e-03,  2.5015e-03,  1.9255e-03],
         [ 2.8545e-04, -3.3685e-05, -1.3611e-04, -3.4830e-05, -4.3562e-04,
          -4.0867e-04,  3.3420e-04, -7.6087e-04],
         [-2.6688e-04,  1.8594e-04, -1.4710e-04, -2.4395e-04,  6.4024e-04,
           3.5496e-04, -5.6990e-04,  8.4313e-04],
         [-7.2026e-03,  4.9180e-03,  2.7979e-03,  4.6659e-03,  6.4111e-03,
           1.3186e-02, -1.3843e-02,  6.3364e-03],
         [ 4.7449e-05, -9.9123e-04,  1.7606e-03,  1.0013e-03, -1.7604e-03,
           1.8536e-03, -1.3488e-03,  2.9641e-04],
         [ 8.5704e-04,  1.8461e-03, -5.3673e-05, -7.1377e-04, -5.2805e-04,
          -1.9856e-03,  3.3572e-03, -4.1086e-03],
         [ 8.4167e-04,  9.4750e-04,  4.8508e-04,  9.2901e-04, -1.4106e-03,
          -6.6229e-05,  8.8119e-05, -7.2879e-04],
         [-8.0065e-03,  6.4596e-03,  9.7765e-04, -9.0176e-04,  6.7004e-03,
           4.5004e-03,  4.5472e-03, -3.1279e-03]]

In [25]:
# Gradients recorded for the C++-extension model: inputs, initial states, then two
# first-layer parameters.
print("model_cpp",
      inputs_grads[1],
      hidden_grads[1],
      model_cpp.lstm0.bias.grad,
      model_cpp.lstm0.gamma_cell.grad,
      sep="\n")
# Optional: per-gate slices of the first layer's input-to-hidden weight gradient.
# for gate in range(4):
#     print(model_cpp.lstm0.weight_ih.grad[gate * hidden_size:(gate + 1) * hidden_size])

model_cpp
tensor([[[-2.3392e-03, -2.0866e-03,  4.1625e-03,  1.3977e-03, -6.6969e-04],
         [ 3.0949e-06, -6.4528e-05, -4.0021e-05, -2.5677e-05, -4.5529e-05],
         [-9.9977e-06, -4.5025e-05, -3.8840e-05, -9.1774e-06, -3.5822e-05],
         [-1.3572e-05,  3.8746e-06, -3.8844e-05,  2.1861e-05, -4.7407e-05],
         [-8.0598e-06, -1.4133e-05, -2.3156e-05,  1.3458e-05, -2.8517e-05],
         [-1.5293e-05, -1.5984e-05, -4.8014e-05,  2.4379e-05, -3.9248e-05],
         [ 6.6738e-06,  1.6129e-05, -6.1014e-05,  2.5314e-05, -2.6347e-05],
         [ 6.0098e-06, -8.5342e-06, -1.9613e-05, -3.6052e-06, -2.2111e-05],
         [ 6.9569e-06, -1.2978e-05, -3.1409e-05,  8.0042e-06, -2.7936e-05],
         [ 2.1220e-05,  2.2263e-05, -7.3463e-06, -6.5337e-06, -3.5929e-05],
         [ 1.8278e-05,  2.1283e-05, -1.6896e-05,  7.5421e-06, -1.5575e-05],
         [ 3.8402e-05,  4.7894e-05,  1.6105e-05,  2.6976e-05,  2.1125e-05],
         [ 9.9178e-06,  8.8280e-06, -1.7362e-05, -1.7223e-06, -7.8249e-06],
  

(tensor([[[ 2.1952e-03, -2.2889e-03,  3.8057e-03,  5.8256e-05, -4.5365e-03,
          -1.5063e-03,  2.5015e-03,  1.9255e-03],
         [ 2.8545e-04, -3.3685e-05, -1.3611e-04, -3.4831e-05, -4.3562e-04,
          -4.0867e-04,  3.3420e-04, -7.6087e-04],
         [-2.6688e-04,  1.8594e-04, -1.4710e-04, -2.4395e-04,  6.4024e-04,
           3.5496e-04, -5.6990e-04,  8.4313e-04],
         [-7.2026e-03,  4.9180e-03,  2.7979e-03,  4.6659e-03,  6.4111e-03,
           1.3186e-02, -1.3843e-02,  6.3364e-03],
         [ 4.7450e-05, -9.9123e-04,  1.7606e-03,  1.0013e-03, -1.7604e-03,
           1.8536e-03, -1.3488e-03,  2.9641e-04],
         [ 8.5704e-04,  1.8461e-03, -5.3675e-05, -7.1377e-04, -5.2805e-04,
          -1.9856e-03,  3.3572e-03, -4.1086e-03],
         [ 8.4167e-04,  9.4750e-04,  4.8508e-04,  9.2901e-04, -1.4106e-03,
          -6.6229e-05,  8.8119e-05, -7.2879e-04],
         [-8.0064e-03,  6.4596e-03,  9.7765e-04, -9.0175e-04,  6.7004e-03,
           4.5003e-03,  4.5472e-03, -3.1279e-03]]

In [26]:
# Sum of absolute element-wise differences between the weight_ih gradients
# of model_torch and model_cpp; a value near zero means the two agree.
print(
    torch.sum(
        torch.abs(
            torch.sub(
                model_torch.lstm0.weight_ih.grad,
                model_cpp.lstm0.weight_ih.grad,
            )
        )
    )
)

tensor(1.3039e-06, device='cuda:0')


In [27]:
# Dump the gradients recorded for model_cuda: the input gradients, the
# hidden-state gradients, and the first LSTM layer's bias / gamma_cell
# parameter gradients, one item per print line.
# To inspect weight_ih gradients chunk by chunk, slice
# model_cuda.lstm0.weight_ih.grad into four consecutive hidden_size-row
# blocks (presumably one per gate -- confirm against the layer definition).
print(
    "model_cuda",
    inputs_grads[2],
    hidden_grads[2],
    model_cuda.lstm0.bias.grad,
    model_cuda.lstm0.gamma_cell.grad,
    sep="\n",
)

model_cuda
tensor([[[-2.3392e-03, -2.0866e-03,  4.1625e-03,  1.3977e-03, -6.6969e-04],
         [ 3.0949e-06, -6.4528e-05, -4.0021e-05, -2.5677e-05, -4.5529e-05],
         [-9.9979e-06, -4.5025e-05, -3.8840e-05, -9.1775e-06, -3.5822e-05],
         [-1.3572e-05,  3.8747e-06, -3.8844e-05,  2.1861e-05, -4.7407e-05],
         [-8.0598e-06, -1.4133e-05, -2.3156e-05,  1.3458e-05, -2.8517e-05],
         [-1.5293e-05, -1.5984e-05, -4.8014e-05,  2.4379e-05, -3.9248e-05],
         [ 6.6737e-06,  1.6129e-05, -6.1013e-05,  2.5314e-05, -2.6347e-05],
         [ 6.0097e-06, -8.5343e-06, -1.9613e-05, -3.6052e-06, -2.2111e-05],
         [ 6.9569e-06, -1.2979e-05, -3.1409e-05,  8.0042e-06, -2.7936e-05],
         [ 2.1220e-05,  2.2263e-05, -7.3463e-06, -6.5337e-06, -3.5929e-05],
         [ 1.8278e-05,  2.1283e-05, -1.6896e-05,  7.5421e-06, -1.5575e-05],
         [ 3.8402e-05,  4.7894e-05,  1.6105e-05,  2.6976e-05,  2.1125e-05],
         [ 9.9178e-06,  8.8279e-06, -1.7362e-05, -1.7223e-06, -7.8249e-06],
 

          -1.2462e-03,  2.8181e-04,  1.5558e-06]]], device='cuda:0'))
tensor([-1.7625e-03,  5.5826e-04,  5.4326e-03,  1.3602e-04, -3.3164e-04,
        -1.6660e-03, -4.3980e-05, -1.6902e-03, -2.6029e-03,  4.1292e-03,
         5.0831e-03, -3.5170e-03, -1.5854e-03, -1.6174e-03,  3.0620e-04,
        -1.2531e-03, -1.4330e-02, -3.6512e-03, -4.0917e-03, -2.1584e-03,
         6.6673e-04,  7.7368e-02, -1.7839e-02, -2.2400e-02, -2.4683e-03,
         1.3155e-03,  3.0737e-03, -1.7665e-03,  1.2936e-03, -3.4525e-04,
        -9.2460e-04, -1.0026e-04], device='cuda:0')
tensor([-0.0162,  0.0099,  0.0270, -0.0138,  0.0018, -0.0085, -0.0032, -0.0068],
       device='cuda:0')


In [28]:
# Sum of absolute element-wise differences between the weight_ih gradients
# of model_torch and model_cuda; a value near zero means the two agree.
print(
    torch.sum(
        torch.abs(
            torch.sub(
                model_torch.lstm0.weight_ih.grad,
                model_cuda.lstm0.weight_ih.grad,
            )
        )
    )
)

tensor(2.0159e-06, device='cuda:0')


---

## Forward time comparison
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [11]:
# Workload used by the timing cells below.
dataset_size = 1000  # Test 1000
sequence_length = 500  # Test 20
batch_size = 32  # Test 32

# Switch every model to training mode, then remove the loop variable so no
# stray `model` name is left behind in the notebook namespace.
for model in models:
    model.train()
del model

# drop_last=True is passed through to the loader factory -- presumably so
# every batch has exactly `batch_size` rows and matches `hidden` below.
fake_loader = create_fake_loader(
    dataset_size,
    sequence_length,
    batch_size,
    drop_last=True,
)

# Pair of zero tensors, each (n_layers, batch_size, hidden_size) on `device`,
# passed as the initial state to every model call in the benchmarks.
hidden = tuple(
    torch.zeros(n_layers, batch_size, hidden_size, device=device)
    for _ in range(2)
)

In [18]:
%%timeit -n 1 -r 20
with torch.no_grad():
    for inputs, _ in fake_loader:
        inputs = inputs.to(device)
        model_torch(inputs, hidden)

3.58 s ± 203 ms per loop (mean ± std. dev. of 20 runs, 1 loop each)


In [27]:
%%timeit -n 1 -r 20
with torch.no_grad():
    for inputs, _ in fake_loader:
        inputs = inputs.to(device)
        model_cpp(inputs, hidden)

5.43 s ± 301 ms per loop (mean ± std. dev. of 20 runs, 1 loop each)


In [28]:
%%timeit -n 1 -r 20
with torch.no_grad():
    for inputs, _ in fake_loader:
        inputs = inputs.to(device)
        model_cuda(inputs, hidden)

1.39 s ± 88.6 ms per loop (mean ± std. dev. of 20 runs, 1 loop each)


---

## +Backward time comparison
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [20]:
# Loss used by the forward+backward timing cells below.
criterion = torch.nn.CrossEntropyLoss()
# Bare string as the cell's last expression: its repr is shown as the cell
# output, confirming that this cell has been run.
"executed"

'executed'

In [46]:
%%timeit -r 20
for inputs, targets in fake_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    model_torch.zero_grad()
    criterion(model_torch(inputs, hidden)[0].flatten(0, 1), targets.flatten(0, 1)).backward()

14.1 s ± 470 ms per loop (mean ± std. dev. of 20 runs, 1 loop each)


In [55]:
%%timeit -r 20
for inputs, targets in fake_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    model_cpp.zero_grad()
    criterion(model_cpp(inputs, hidden)[0].flatten(0, 1), targets.flatten(0, 1)).backward()

10 s ± 414 ms per loop (mean ± std. dev. of 20 runs, 1 loop each)


In [21]:
%%timeit -r 20
for inputs, targets in fake_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    model_cuda.zero_grad()
    criterion(model_cuda(inputs, hidden)[0].flatten(0, 1), targets.flatten(0, 1)).backward()

KeyboardInterrupt: 

---