# Customizing Deep500

In this tutorial, we will create custom implementations of three components of Deep500 training: [Operators](#Operators), [Datasets](#Datasets), and [Optimizers](#Optimizers).


After setting up the environment and building these components, we show how to [combine all components](#Putting-it-All-Together).


In [1]:
import deep500 as d5
from deep500.frameworks import pytorch as d5fw



## Operators

### Python

Creating a Python custom operator is easy - just fill in the `forward` and `backward` functions:

In [2]:
import numpy as np  # for data types

class IPowOp(d5.CustomPythonOp):
    """Element-wise integral power operator: ``y = x ** power``.

    The operator declares exactly one float32 input and one float32 output,
    both with shapes determined at runtime.

    Args:
        power: Exponent to raise the input to. Must have an integral value
            (e.g. ``3`` or ``3.0``).

    Raises:
        ValueError: If ``power`` is not integral.
    """

    def __init__(self, power):
        # Validate explicitly rather than with ``assert``: assertions are
        # removed when Python runs with -O, which would silently accept
        # non-integral powers.
        if int(power) != power:
            raise ValueError('power must be integral, got %r' % (power,))

        # Supply input and output tensor types in constructor
        super().__init__([d5.tensordesc.runtime_shape(np.float32)],
                         [d5.tensordesc.runtime_shape(np.float32)])

        self.power = power

    def forward(self, inputs):
        """Return the input raised to ``self.power``.

        Despite the plural name, ``inputs`` is used directly as the single
        input tensor of this operator.
        """
        return inputs ** self.power

    def backward(self, grads, fwd_inputs, fwd_outputs):
        """Return the gradient with respect to the input.

        Applies the chain rule to ``x ** p``: the incoming gradient
        ``grads[0]`` is multiplied by ``p * x ** (p - 1)``, where ``x`` is
        ``fwd_inputs[0]``. ``fwd_outputs`` is not needed and is ignored.
        """
        return (grads[0] * self.power *
                (fwd_inputs[0] ** (self.power - 1)))

Now, wrap the operator so that the executor can use it as-is:

In [3]:
# d5fw is the PyTorch backend imported at the top of the notebook; custom_op
# wraps the Deep500 operator (here: cube, power=3.0) so it can be called on
# PyTorch tensors.
fw_pow = d5fw.custom_op(IPowOp(3.0))

In [4]:
import torch
t = torch.tensor([1,2,3,4,5], dtype=torch.float32)
# The wrapped operator returns a tuple of outputs (one entry here); with
# power=3 every element of t is cubed.
fw_pow(t)

(tensor([  1.,   8.,  27.,  64., 125.]),)

Same for gradients:

In [5]:
from torch.autograd import Variable
# NOTE(review): torch.autograd.Variable is deprecated in current PyTorch;
# tensors created with requires_grad=True serve the same purpose. Consider
# migrating - confirm against the PyTorch version this tutorial targets.
v = Variable(t, requires_grad=True)
output, = fw_pow(v)  # unpack the single-element output tuple
result = torch.sum(output)  # reduce to a scalar so backward() needs no explicit gradient
result.backward()
v.grad  # d/dx sum(x**3) = 3 * x**2

tensor([ 3., 12., 27., 48., 75.])

### C++

Operators can also be defined in C++ almost as easily as in Python, but some extra steps need to be taken to compile the code.

First, we define the C++ source code as an inline string (files are also supported):

In [6]:
# C++ source of the custom operator, kept as an inline string and compiled
# later with d5.compile_custom_cppop_inline. DPOWER is not defined in this
# source; it is supplied at compile time through `additional_definitions`.
# The operator works on a flat buffer of m_len elements, where m_len is the
# product of the first input's dimensions (computed in create_new_op).
cpp_src = """
#include <deep500/deep500.h>   // Main include file
#include <cmath>

// Operator class definition
template<typename T>
class ipowop : public deep500::CustomOperator {
protected:
    int m_len;
public:
    ipowop(size_t len) : m_len((int)len) {}
    virtual ~ipowop() {}
    virtual bool supports_cuda() {return false;}

    // Forward part
    void forward(const T *input, T *output) {
        #pragma omp parallel for
        for (int i = 0; i < m_len; ++i)
            output[i] = std::pow(input[i], DPOWER);
    }

    // Backward part
    void backward(const T *nextop_grad,
                  const T *fwd_input_tensor,
                  const T *fwd_output_tensor,
                  T *input_tensor_grad) {
        #pragma omp parallel for
        for (int i = 0; i < m_len; ++i) {
            input_tensor_grad[i] = nextop_grad[i] * DPOWER * 
                std::pow(fwd_input_tensor[i], DPOWER - 1);
        }
    }
};

// Function called with runtime sizes to create the operator
D500_EXPORTED void *create_new_op(deep500::tensor_t *input_descriptors, int num_inputs,
                                  deep500::tensor_t *output_descriptors, int num_outputs) {
    size_t totalsz = 1;
    for (int i = 0; i < input_descriptors[0].dims; ++i)
        totalsz *= input_descriptors[0].sizes[i];
    return new ipowop<float>(totalsz);
}

// Register the operator
D500_REGISTER_OP(ipowop<float>);
"""

In [7]:
# Define the operator descriptor
opdesc = d5.compile_custom_cppop_inline('ipowop', cpp_src,
                                        # Input tensor shapes
                                        [d5.tensordesc(np.float32, [5])],
                                        # Output tensor shapes
                                        [d5.tensordesc(np.float32, [5])],
                                        live_output=True,
                                        # Running on CPU
                                        is_cuda=False,
                                        additional_definitions={'DPOWER': '3'})
# Compile and get a handle to the operator
fwcpp_op = d5fw.custom_op(opdesc)

-- Selecting Windows SDK version 10.0.17763.0 to target Windows 10.0.18362.
-- Configuring done
-- Generating done
-- Build files have been written to: C:/Users/XL/Desktop/university/eurompi19/ipowop_pytorch_build

Microsoft (R) Build Engine version 15.9.21+g9802d43bc3 for .NET Framework
Copyright (C) Microsoft Corporation. All rights reserved.
  pytorch.cpp
  C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\include\vcruntime_exception.h(44): note: see declaration of 'std::exception'
  C:\Users\XL\miniconda3\envs\denv\lib\site-packages\torch\include\c10/util/Exception.h(28): note: see declaration of 'c10::Error'
          with
          [
              _Ty=std::string
          ]
  C:\Users\XL\miniconda3\envs\denv\lib\site-packages\torch\include\c10/util/Exception.h(29): note: see declaration of 'std::vector<std::string,std::allocator<_Ty>>'
          with
          [
              _Ty=std::string
          ]
  C:\Program Files (x86)\Microsoft Vis




Similarly to Python, the function is now a native PyTorch operator and can be used normally:

In [8]:
# The input has 5 elements, matching the [5] shape the operator was compiled for
fwcpp_op(torch.tensor([5., 4., 3., 2., 1.], dtype=torch.float32))

(tensor([125.,  64.,  27.,   8.,   1.]),)

### CUDA

Works exactly the same as the C++ version, but with a different flag set (`is_cuda=True`, and `supports_cuda()` returning `true`) and additional `forward_cuda`/`backward_cuda` functions:

In [9]:
# CUDA variant of the operator source. Besides the CPU forward/backward it adds
# forward_cuda/backward_cuda, which launch one thread per element in blocks of
# 32 threads. The forward kernel deliberately adds 1 to its result so that the
# output shows the CUDA path (not the CPU path) was executed.
# DPOWER is not defined in this source; it is supplied at compile time through
# `additional_definitions`.
cuda_src = """
#include <deep500/deep500.h>   // Main include file
#include <cmath>

// Kernels
template<typename T>
__global__ void fwd_kernel(const T* inp, T *out, int n) {
    int tid = blockIdx.x * 32 + threadIdx.x;
    if (tid >= n) return;
    out[tid] = std::pow(inp[tid], DPOWER) + T(1); // Adding term to CUDA kernel to ensure it ran
}

template<typename T>
__global__ void bwd_kernel(const T* gradin, const T* fwdin, T *gradout, int n) {
    int tid = blockIdx.x * 32 + threadIdx.x;
    if (tid >= n) return;
    gradout[tid] = gradin[tid] * DPOWER * std::pow(fwdin[tid], DPOWER - 1);
}

// Operator class definition
template<typename T>
class ipowop2 : public deep500::CustomOperator {
protected:
    int m_len;
public:
    ipowop2(size_t len) : m_len((int)len) {}
    virtual ~ipowop2() {}
    virtual bool supports_cuda() {return true;}   // NOTE: This is now true!

    // Forward part (CPU)
    void forward(const T *input, T *output) {
        #pragma omp parallel for
        for (int i = 0; i < m_len; ++i)
            output[i] = std::pow(input[i], DPOWER);
    }

    // Backward part (CPU)
    void backward(const T *nextop_grad,
                  const T *fwd_input_tensor,
                  const T *fwd_output_tensor,
                  T *input_tensor_grad) {
        #pragma omp parallel for
        for (int i = 0; i < m_len; ++i) {
            input_tensor_grad[i] = nextop_grad[i] * DPOWER * 
                std::pow(fwd_input_tensor[i], DPOWER - 1);
        }
    }

    // Forward part (GPU, invokes a kernel)
    void forward_cuda(const T *input, T *output) {
        fwd_kernel<T> <<<(m_len + 32 - 1) / 32, 32>>>(input, output, m_len);
    }

    // Backward part (GPU, invokes a kernel)
    void backward_cuda(const T *nextop_grad,
                  const T *fwd_input_tensor,
                  const T *fwd_output_tensor,
                  T *input_tensor_grad) {
        bwd_kernel<T> <<<(m_len + 32 - 1) / 32, 32>>>(nextop_grad, fwd_input_tensor, 
                                                      input_tensor_grad, m_len);
    }
};

// Function called with runtime sizes to create the operator
D500_EXPORTED void *create_new_op(deep500::tensor_t *input_descriptors, int num_inputs,
                                  deep500::tensor_t *output_descriptors, int num_outputs) {
    size_t totalsz = 1;
    for (int i = 0; i < input_descriptors[0].dims; ++i)
        totalsz *= input_descriptors[0].sizes[i];
    return new ipowop2<float>(totalsz);
}

// Register the operator
D500_REGISTER_OP(ipowop2<float>);
"""

In [10]:
# Define the operator descriptor
opdesc = d5.compile_custom_cppop_inline('ipowop2', cuda_src,
                                        # Input tensor shapes
                                        [d5.tensordesc(np.float32, [5])],
                                        # Output tensor shapes
                                        [d5.tensordesc(np.float32, [5])],
                                        live_output=True,
                                        # Running on GPU
                                        is_cuda=True,
                                        additional_definitions={'DPOWER': '3'})
# Compile and get a handle to the operator
fwcuda_op = d5fw.custom_op(opdesc)

-- Selecting Windows SDK version 10.0.17763.0 to target Windows 10.0.18362.
-- Configuring done
-- Generating done
-- Build files have been written to: C:/Users/XL/Desktop/university/eurompi19/ipowop2_pytorch_build

Microsoft (R) Build Engine version 15.9.21+g9802d43bc3 for .NET Framework
Copyright (C) Microsoft Corporation. All rights reserved.
  Building NVCC (Device) object CMakeFiles/ipowop2.dir/RelWithDebInfo/ipowop2_generated_pytorch.cu.obj
  pytorch.cu
  
  
  
  
  
  
  
  
  
  
  
  
  
  (454): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83): here; dllexport assumed
  
  (83)




In [11]:
# .cuda() places the input on the GPU; the expected result is x**3 + 1 because
# the CUDA forward kernel adds 1 to mark that it ran.
fwcuda_op(torch.tensor([5., 4., 3., 2., 1.], dtype=torch.float32).cuda())

(tensor([126.,  65.,  28.,   9.,   2.], device='cuda:0'),)

If the result is correct, the output should be $x^3 + 1$ (the CUDA forward kernel adds 1 so that we can tell it actually ran).

## Datasets

The dataset interface consists only of indexed access (`__getitem__(index)`, as implemented below) and `__len__`. Implementing a new dataset format, or a synthetic dataset, is possible by providing the functionality at that level. If a `Sampler` class loads using multiple threads or processes, the dataset will be called multiple times. It only needs to take care of decoding/generating data, rather than loading it.

In [12]:
import random

class MyDataset(d5.Dataset):
    """Synthetic dataset mapping a random number ``x`` to ``3 * x ** 3``.

    Samples are generated on every access; the index only determines how many
    samples are produced, so repeated reads of the same index return different
    data.

    Args:
        input_node: Key under which the generated inputs are returned.
        label_node: Key under which the matching labels are returned.
    """

    def __init__(self, input_node, label_node):
        super().__init__()
        self.input_node = input_node
        self.label_node = label_node

    # There are three possible inputs to __getitem__: Integer element, a slice (e.g., 0:N:2), or a list of elements
    def __getitem__(self, index):
        """Return a dict ``{input_node: x, label_node: 3 * x ** 3}``.

        An integer index (Python ``int`` or NumPy integer scalar) yields a
        single sample of shape ``(1,)``; a slice or a sequence of indices
        yields a minibatch of shape ``(length, 1)``. All arrays are float32.
        """
        # NumPy integer scalars (e.g. produced by index arrays) are not
        # instances of ``int`` and have no ``len()``, so they must be
        # recognized here as single-element indices.
        if isinstance(index, (int, np.integer)):  # One element
            num = np.array([random.random()], np.float32)
        else:  # Minibatch
            if isinstance(index, slice):  # Slice
                length = len(range(*index.indices(len(self))))
            else:  # List of elements
                length = len(index)
            num = np.random.rand(length, 1).astype(np.float32)

        # In any case, return a mapping (random number -> 3 * (random number ^ 3))
        return {self.input_node: num, self.label_node: 3 * (num ** 3)}

    def __len__(self):
        return 1000  # Bogus number to determine epoch length

Example usage:

In [13]:
ds = MyDataset('dummy_input', 'dummy_target')
print('Element 0:', ds[0])
# Samples are generated randomly on each access, so the same index yields
# different data the second time.
print('Element 0 (again):', ds[0])
# A slice returns a minibatch: 1:50:10 covers indices 1, 11, 21, 31, 41 -> 5 rows
print('Elements 1 to 50, skipping 10:', ds[1:50:10])

Element 0: {'dummy_input': array([0.80738384], dtype=float32), 'dummy_target': array([1.5789247], dtype=float32)}
Element 0 (again): {'dummy_input': array([0.44469085], dtype=float32), 'dummy_target': array([0.26381278], dtype=float32)}
Elements 1 to 50, skipping 10: {'dummy_input': array([[0.6502744 ],
       [0.9222002 ],
       [0.54256546],
       [0.50194806],
       [0.7920214 ]], dtype=float32), 'dummy_target': array([[0.82491887],
       [2.3528643 ],
       [0.47915685],
       [0.37940025],
       [1.4905001 ]], dtype=float32)}


## Optimizers

Creating a custom optimizer only requires providing an implementation of a `deep500.Optimizer` subclass. In this case, we extend the reference SGD optimizer with a "Stochastic Learning Rate", in which the learning rate is simply scaled by a random factor at every step:

In [14]:
import numpy as np
from deep500.frameworks import reference as d5ref

class StochasticUpdateRule(d5ref.GradientDescent):
    """Gradient descent whose learning rate is randomly scaled on every update."""

    def update_rule(self, grad, old_param, param_name):
        """Return the updated parameter for one gradient step.

        The step size is ``self.lr`` multiplied by a fresh sample from
        ``np.random.random_sample()``. ``param_name`` is not used.
        """
        step_size = np.random.random_sample() * self.lr
        return old_param - step_size * grad

## Putting it All Together

We can now run our custom model in a native PyTorch environment, reading from the synthetic dataset, and optimizing using the Stochastic Update Rule:

In [15]:
# Define our custom native PyTorch model
from torch import nn
class MyNet(nn.Module):
    """Custom power operator followed by a 1-to-1 linear layer."""

    def __init__(self):
        super().__init__()
        self.pow_op = fwcuda_op
        self.linear = nn.Linear(1, 1)

    def forward(self, inp):
        """Apply the power operator to ``inp``, then the linear layer."""
        # The custom operator returns a one-element tuple of outputs
        (powered,) = self.pow_op(inp)
        # Keep only as many leading entries as the batch has and restore the
        # input's shape
        batch = inp.shape[0]
        powered = powered[:batch].reshape(inp.shape)
        # Subtracting 1 undoes the +1 that the CUDA forward kernel adds
        return self.linear(powered - 1)

model = MyNet()

# Set a native graph executor with our module and MSE loss.
# The model wraps the CUDA operator, so it is executed on the GPU device.
executor = d5fw.PyTorchNativeGraphExecutor(module=model, loss=nn.MSELoss(), device=d5.GPUDevice())

In [16]:
# Create the data necessary for recipe-less training.
# MyDataset returns dicts keyed by the given node names, so the executor's
# input and label node names are used as keys. Both sets are generated
# randomly by the same class.
train_set = MyDataset(executor.innode, executor.labelnode)
val_set = MyDataset(executor.innode, executor.labelnode)
# Custom optimizer with a randomly scaled learning rate, minimizing the executor's loss node
optimizer = StochasticUpdateRule(executor, lr=0.1, loss=executor.lossnode)

In [22]:
# Reset parameters to a known starting point (weight = 1, bias = 1) so the
# cell can be re-run; no_grad keeps the in-place writes out of autograd.
with torch.no_grad():
    model.linear.weight.data[:] = 1
    model.linear.bias.data[:] = 1

# Train (batch size must match compiled kernel): the CUDA operator was compiled
# for tensors of shape [5], hence batch_size=5.
d5.test_training(executor, train_set, val_set, optimizer, epochs=5, batch_size=5, metrics=[])

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




[]

We can now inspect the parameters (the weight should be close to $3$ and the bias close to $0$):

In [23]:
# The labels are 3 * x**3 and the linear layer is applied to x**3, so training
# should drive the weight towards 3 and the bias towards 0.
model.linear.weight, model.linear.bias

(Parameter containing:
 tensor([[2.9990]], device='cuda:0', requires_grad=True), Parameter containing:
 tensor([0.0003], device='cuda:0', requires_grad=True))