In [83]:
import time
import torch
import tensor_comprehensions as tc
from torch import nn
from tc_composer.func.function_with_params import FunctionWithParams
from tc_composer.settings import TYPE_NAME

In [2]:
class MLP3(FunctionWithParams):
    """Three fully connected layers with ReLU, written as one TC function.

    The parameters are created with `torch.rand` and stored in the order
    (weight0, bias0, weight1, bias1, weight2, bias2), which is the order
    in which they appear after `input` in the TC signature.
    """

    def __init__(self, in_n: int, hidden0: int, hidden1: int, out_n: int, out_buffers = ()):
        """Create random weights and biases for the three layers.

        Args:
            in_n: width of the input features.
            hidden0: width of the first layer's output.
            hidden1: width of the second layer's output.
            out_n: width of the final output.
            out_buffers: forwarded unchanged to `FunctionWithParams.__init__`.
        """
        super().__init__(out_buffers=out_buffers)

        # (fan_in, fan_out) of each layer; weights are laid out (fan_out, fan_in).
        layer_dims = ((in_n, hidden0), (hidden0, hidden1), (hidden1, out_n))

        # All weights are drawn before any bias so the sequence of random
        # draws is weight0, weight1, weight2, bias0, bias1, bias2.
        weights = [torch.rand(fan_out, fan_in) for fan_in, fan_out in layer_dims]
        biases = [torch.rand(fan_out) for _, fan_out in layer_dims]

        # Interleave into (weight0, bias0, weight1, bias1, weight2, bias2).
        self._params = tuple(tensor
                             for layer in zip(weights, biases)
                             for tensor in layer)

    @property
    def params(self):
        """Parameters, in the same order in which the TC definition lists them."""
        return self._params

    @property
    def tc_def(self):
        """Source text of the TC function.

        The function input comes first in the signature, followed by the
        weights and biases in the same order as `self.params`.
        """
        signature = ",\n".join((
            f"def MLP3({TYPE_NAME}(batch_size, in_n) input",
            f"  {TYPE_NAME}(hidden0, in_n) weight0",
            f"  {TYPE_NAME}(hidden0) bias0",
            f"  {TYPE_NAME}(hidden1, hidden0) weight1",
            f"  {TYPE_NAME}(hidden1) bias1",
            f"  {TYPE_NAME}(out_n, hidden1) weight2",
            f"  {TYPE_NAME}(out_n) bias2",
        ))
        statements = "\n".join((
            "   output0(b, n) +=! input(b, i) * weight0(n, i)",
            # The bias add and ReLU of each layer are fused into the
            # reduction of the following linear transformation.
            "   output1(b, n) +=! fmax(output0(b, i) + bias0(i), 0) * weight1(n, i)",
            "   output2(b, n) +=! fmax(output1(b, i) + bias1(i), 0) * weight2(n, i)",
            "   output2(b, n) = fmax(output2(b, n) + bias2(n), 0)",
        ))
        return (signature
                + "\n) -> (output0, output1, output2)\n{\n"
                + statements
                + "\n}")


In [175]:
# Problem size: rows per batch, input feature width, and the width shared by
# every layer output in this notebook.
batch_size, in_n, hidden_dim = 32, 128, 64

# Random input batch of shape (batch_size, in_n).
image = torch.rand(batch_size, in_n)

In [4]:
# Build the TC-backed MLP; every layer output uses the same width (`hidden_dim`).
tc_mlp3 = MLP3(in_n=in_n, hidden0=hidden_dim, hidden1=hidden_dim, out_n=hidden_dim)
# Show the generated TC source so the signature and statements can be inspected.
print(tc_mlp3.tc_def)

def MLP3(float(batch_size, in_n) input,
  float(hidden0, in_n) weight0,
  float(hidden0) bias0,
  float(hidden1, hidden0) weight1,
  float(hidden1) bias1,
  float(out_n, hidden1) weight2,
  float(out_n) bias2
) -> (output0, output1, output2)
{
   output0(b, n) +=! input(b, i) * weight0(n, i)
   output1(b, n) +=! fmax(output0(b, i) + bias0(i), 0) * weight1(n, i)
   output2(b, n) +=! fmax(output1(b, i) + bias1(i), 0) * weight2(n, i)
   output2(b, n) = fmax(output2(b, n) + bias2(n), 0)
}


# Autotune

In [None]:
# Autotune mapping options for the shape of `image`.
#
# The original cell passed `start_options=option`, i.e. it read `option`
# before any cell had assigned it, so it only worked when re-run in a kernel
# that had already tuned once. Warm-start from a previous result when one
# exists in this kernel; otherwise begin from TC's generic 'naive' mapping so
# the cell also runs after Restart & Run All.
try:
    start_options = option
except NameError:
    start_options = tc.MappingOptions('naive')

tuner_config = tc.TunerConfig().number_elites(20).generations(40)
option = tc_mlp3.tune_options([image], tuner_config=tuner_config, start_options=start_options)

[INFO] MLP3 - Appending results to /home/ubuntu/tc_composer/options/MLP3_Tesla_K80


In [176]:
# Recompile for the shape of `image` using `option`, the value returned by the
# autotuning cell above.
tc_mlp3.recompile(image, option=option)

[INFO] MLP3 - Compiling for input shape - [(32, 128)].


# Correctness

In [140]:
from torch import nn, matmul, relu_ as relu_inplace


def torch_mlp3(inp, params):
    """Reference three-layer MLP built from plain PyTorch ops.

    Args:
        inp: input tensor; it is multiplied on the right by each weight.
        params: exactly six tensors ordered
            (weight0, bias0, weight1, bias1, weight2, bias2). Each weight is
            used as `matmul(x, weight)`, so its first dimension must match
            the last dimension of the tensor flowing into that layer.

    Returns:
        The output of the third layer after bias add and ReLU.
    """
    # Unpacking enforces that exactly six parameters were supplied.
    weight0, bias0, weight1, bias1, weight2, bias2 = params

    activation = inp
    for weight, bias in ((weight0, bias0), (weight1, bias1), (weight2, bias2)):
        # `matmul` allocates a fresh tensor, so the in-place bias add and
        # ReLU never modify the caller's tensors.
        activation = relu_inplace(matmul(activation, weight).add_(bias))

    return activation

In [149]:
# The TC module stores each weight as (out, in), whereas `torch_mlp3` computes
# `matmul(x, weight)` and so needs (in, out). Transpose every tensor with more
# than one dimension and leave the 1-D biases as they are.
params = [
    tensor.transpose(0, 1).contiguous() if tensor.dim() > 1 else tensor
    for tensor in tc_mlp3.params
]

In [150]:
import numpy as np

# The last element returned by the TC module is compared against the PyTorch
# reference; both are moved to host memory as NumPy arrays first.
tc_result = tc_mlp3(image)[-1].cpu().detach().numpy()
torch_result = torch_mlp3(image, params).cpu().detach().numpy()

np.testing.assert_allclose(tc_result, torch_result, rtol=1e-4)

# Benchmark

In [98]:
def _report_latencies(template, prepend, samples):
    """Print min / p50 / p90 / max of `samples` (seconds) as whole microseconds."""
    ordered = sorted(samples)
    count = len(ordered)
    print(template.format(
        prepend,
        int(ordered[0] * 1e6),
        int(ordered[count // 2] * 1e6),
        int(ordered[(count * 9) // 10] * 1e6),
        int(ordered[-1] * 1e6),
    ))


def mytime(iters, prepend, runFun, *args):
    """Time `runFun(*args)` over `iters` runs and print latency percentiles.

    Two durations are recorded per iteration, both measured from the same
    start point:
      * the time until `runFun` returns (the host-side launch cost), and
      * the time until the following device synchronisation completes
        (launch plus kernel execution).

    Args:
        iters: number of timed iterations.
        prepend: label placed at the start of each printed summary line.
        runFun: callable under test.
        *args: positional arguments forwarded to `runFun`.

    Returns:
        None. A banner line and two summary lines are printed.
    """
    # `time.clock()` was removed in Python 3.8 and, on Unix, measured process
    # CPU time rather than elapsed time. `perf_counter` is the monotonic
    # wall-clock timer intended for benchmarking.
    clock = time.perf_counter

    # Synchronise only when a CUDA device is present so the helper can also be
    # used on CPU-only hosts, where `torch.cuda.synchronize()` would raise.
    if torch.cuda.is_available():
        synchronize = torch.cuda.synchronize
    else:
        def synchronize():
            return None

    timesCPU = []
    timesCPUAndGPU = []
    for _ in range(iters):
        # Drain previously queued work so it is not charged to this iteration.
        synchronize()
        start = clock()
        runFun(*args)
        timesCPU.append(clock() - start)
        synchronize()
        timesCPUAndGPU.append(clock() - start)

    print("#################################################################")
    _report_latencies(
        "{} Total CPU time to launch kernel: min {}us, p50 {}us, p90 {}us, max {}us",
        prepend,
        timesCPU,
    )
    _report_latencies(
        "{} Total CPU launch + GPU kernel time: min {}us, p50 {}us, p90 {}us, max {}us",
        prepend,
        timesCPUAndGPU,
    )

In [177]:
# Kernel inputs: the data tensor followed by the module's parameters, matching
# the argument order of the TC signature.
inp = (image, *tc_mlp3.params)
# Name-mangled access to the private `__compilation_cache` attribute of
# FunctionWithParams, so the kernel can be launched directly.
cache = tc_mlp3._FunctionWithParams__compilation_cache


# NOTE(review): `out_buffers` is not assigned in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. It presumably
# should hold the pre-allocated output tensors for 'MLP3' — confirm and define
# it explicitly before this call.
mytime(
    10000,
    "raw tuned options\t",
    lambda: cache.unchecked_run('MLP3', inp, out_buffers)
)

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 88us, p50 90us, p90 92us, max 667us
raw tuned options	 Total CPU launch + GPU kernel time: min 130us, p50 133us, p90 135us, max 716us


In [178]:
# Baseline: time the plain PyTorch implementation on the same input, using the
# transposed copies of the parameters built for the correctness check.
mytime(
    10000,
    "Pytorch\t",
    lambda: torch_mlp3(image, params)
)

#################################################################
Pytorch	 Total CPU time to launch kernel: min 83us, p50 86us, p90 87us, max 1468us
Pytorch	 Total CPU launch + GPU kernel time: min 99us, p50 115us, p90 151us, max 1488us
