In [1]:
import time
import torch
from tc_composer.func.function_with_params import FunctionWithParams
from tc_composer.unique_name import TensorName

[INFO] tc_composer.settings - Setting default tensor type: torch.cuda.FloatTensor
[INFO] tc_composer.settings - Setting epsilon: 1e-16
[INFO] tc_composer.settings - Input tensor shape checking: False
[INFO] tc_composer.settings - Saving compiled options in: /home/ubuntu/tc_composer/options


In [2]:
class MLP3(FunctionWithParams):
    """Three-layer ReLU MLP written as a single Tensor Comprehensions function.

    The generated TC body chains three linear layers. The bias add and ReLU of
    each layer are folded into the reduction of the following layer, and a
    final statement applies the last bias and ReLU to the kept output.
    """

    def __init__(self, in_n: int, hidden0: int, hidden1: int, out_n: int):
        """Declare the input/output tensor names and record the layer widths.

        Args:
            in_n: Width of the input, i.e. its second dimension.
            hidden0: Number of output features of the first linear layer.
            hidden1: Number of output features of the second linear layer.
            out_n: Number of output features of the last linear layer.
        """
        in_name = TensorName(dim=2, sizes='B N'.split(), prefix='input')
        output0 = TensorName(dim=2, sizes='B H'.split(), prefix='output')
        output1 = TensorName(dim=2, sizes='B H'.split(), prefix='output')
        output2 = TensorName(dim=2, sizes='B O'.split(), prefix='output')
        # Pin the input width to a concrete number; the batch size stays symbolic.
        in_name.sizes[1] = in_n
        super().__init__(
            # The order in which names are listed does not matter, as long as
            # they are retrieved correctly in self.def_body
            in_names=[in_name],
            outs_to_keep=[output2],
            outs_to_discard=(output0, output1)
        )
        self.in_n = in_n
        self.hidden0 = hidden0
        self.hidden1 = hidden1
        self.out_n = out_n

    @property
    def named_params(self):
        """Return the weight and bias pairs of the three layers, in layer order.

        Every entry is whatever ``TensorName.make_pair`` returns; ``def_body``
        relies on the first element of each pair being the tensor name.
        """
        # The order in which names are listed does not matter, as long as
        # they are retrieved correctly in self.def_body
        #
        # The first weight must use self.in_n: reading a module-level ``in_n``
        # only works when the caller happens to define a global of that name
        # with the same value.
        return (
            TensorName.make_pair(sizes=(self.hidden0, self.in_n), prefix='weight'),
            TensorName.make_pair(sizes=(self.hidden0,), prefix='bias'),
            TensorName.make_pair(sizes=(self.hidden1, self.hidden0), prefix='weight'),
            TensorName.make_pair(sizes=(self.hidden1,), prefix='bias'),
            TensorName.make_pair(sizes=(self.out_n, self.hidden1), prefix='weight'),
            TensorName.make_pair(sizes=(self.out_n,), prefix='bias'),
        )

    @property
    def def_body(self):
        """Return the TC statements that compute the three layers."""
        # ``inp`` rather than ``input`` so the builtin is not shadowed.
        inp, = self.in_names
        output0, output1, output2 = *self.outs_to_discard, *self.outs_to_keep
        (weight0, bias0,
         weight1, bias1,
         weight2, bias2) = tuple(name for name, _ in self.named_params)

        return (
            f"{output0}(b, n) +=! {inp}(b, i) * {weight0}(n, i)\n"
            # Combining point-wise operations with the next linear transformation
            f"{output1}(b, n) +=! fmax({output0}(b, i) + {bias0}(i), 0) * {weight1}(n, i)\n"
            f"{output2}(b, n) +=! fmax({output1}(b, i) + {bias1}(i), 0) * {weight2}(n, i)\n"
            f"{output2}(b, n) = fmax({output2}(b, n) + {bias2}(n), 0)")


In [3]:
# Problem sizes shared by the TC function, the PyTorch reference and the benchmark.
batch_size = 32
in_n = 128
hidden_dim = 64

# Seed the generator so the random input, and therefore the correctness check
# and the benchmark input, are identical on every Restart & Run All.
torch.manual_seed(0)
image = torch.rand(batch_size, in_n)

In [4]:
# Build the TC function with both hidden layers and the output layer all
# `hidden_dim` wide, then print the generated TC definition for inspection.
tc_mlp3 = MLP3(in_n=in_n, hidden0=hidden_dim, hidden1=hidden_dim, out_n=hidden_dim)
print(tc_mlp3.tc_def)

def MLP3(
    float(B,128) input,
    float(64,128) weight,
    float(64) bias,
    float(64,64) weight1,
    float(64) bias1,
    float(64,64) weight2,
    float(64) bias2
) -> (output, output1, output2)
{
    output(b, n) +=! input(b, i) * weight(n, i)
    output1(b, n) +=! fmax(output(b, i) + bias(i), 0) * weight1(n, i)
    output2(b, n) +=! fmax(output1(b, i) + bias1(i), 0) * weight2(n, i)
    output2(b, n) = fmax(output2(b, n) + bias2(n), 0)
}


# Autotune

In [None]:
# Autotune for this input. Takes a long time; the resulting `option` is
# passed to `recompile` in the next cell, so this cell must run first.
option = tc_mlp3.tune_options([image])

[INFO] MLP3 - Loading start options from file - /home/ubuntu/tc_composer/options/MLP3_Tesla_K80


In [13]:
# Recompile for `image` using the autotuned `option`.
# NOTE(review): `option` only exists once the autotune cell above has been run.
tc_mlp3.recompile(image, option=option)

[INFO] MLP3 - Compiling for input shape - [(32, 128)].


# Correctness

In [14]:
from torch import nn, matmul, relu_ as relu_inplace


def torch_mlp3(inp, params):
    """Plain PyTorch reference for the three-layer ReLU MLP.

    Args:
        inp: Input tensor fed to the first layer.
        params: Exactly six tensors ordered as
            ``(weight0, bias0, weight1, bias1, weight2, bias2)``. Each weight
            is used as the right-hand operand of ``matmul``.

    Returns:
        The activation of the last layer.
    """
    w0, b0, w1, b1, w2, b2 = params

    activation = inp
    for weight, bias in ((w0, b0), (w1, b1), (w2, b2)):
        # matmul allocates a fresh tensor, so the in-place bias add and ReLU
        # never touch the caller's input.
        activation = relu_inplace(matmul(activation, weight).add_(bias))

    return activation

In [15]:
# The TC body indexes weights as weight(n, i), while torch_mlp3 right-multiplies
# with matmul(inp, weight), so every tensor with more than one dimension is
# transposed (and made contiguous) for the reference; biases pass through as-is.
params = [
    tensor.transpose(0, 1).contiguous() if tensor.dim() > 1 else tensor
    for tensor in tc_mlp3.params
]

In [16]:
import numpy as np

# The last TC output must match the PyTorch reference within a relative
# tolerance of 1e-4; both are copied to the host as NumPy arrays first.
tc_result = tc_mlp3(image)[-1].cpu().detach().numpy()
torch_result = torch_mlp3(image, params).cpu().detach().numpy()
np.testing.assert_allclose(tc_result, torch_result, rtol=1e-4)

# Benchmark

In [17]:
def mytime(iters, prepend, runFun, *args):
    """Time ``runFun(*args)`` repeatedly and print latency percentiles.

    Two durations are recorded per iteration, both starting just before the
    call: one taken as soon as ``runFun`` returns (launch cost on the host)
    and one taken after a device synchronization (launch plus kernel time).
    The min, p50, p90 and max of each series are printed in microseconds.

    Args:
        iters: Number of timed iterations; must be at least 1.
        prepend: Label printed at the start of each report line.
        runFun: Callable to benchmark.
        *args: Positional arguments forwarded to ``runFun``.

    Raises:
        ValueError: If ``iters`` is smaller than 1 (there would be no samples
            to report).
    """
    if iters < 1:
        raise ValueError("iters must be at least 1, got {!r}".format(iters))

    # time.clock() was removed in Python 3.8. perf_counter() is the documented
    # replacement and measures wall-clock time, which is what a latency
    # benchmark needs.
    clock = time.perf_counter
    # Only synchronize when a CUDA device exists so CPU-only callables can be
    # timed as well; resolved once to keep the check out of the timed loop.
    sync = torch.cuda.synchronize if torch.cuda.is_available() else (lambda: None)

    timesCPU = []
    timesCPUAndGPU = []
    for _ in range(iters):
        sync()
        start = clock()
        runFun(*args)
        timesCPU.append(clock() - start)
        sync()
        timesCPUAndGPU.append(clock() - start)

    def report(label, samples):
        # Print min / p50 / p90 / max of ``samples`` (seconds) in microseconds.
        ordered = sorted(samples)
        count = len(ordered)
        print(
            "{} {}: min {}us, p50 {}us, p90 {}us, max {}us".
            format(
                prepend,
                label,
                int(ordered[0] * 1e6),
                int(ordered[count // 2] * 1e6),
                int(ordered[(count * 9) // 10] * 1e6),
                int(ordered[-1] * 1e6),
            ))

    print("#################################################################")
    report("Total CPU time to launch kernel", timesCPU)
    report("Total CPU launch + GPU kernel time", timesCPUAndGPU)

In [18]:
# Benchmark the tuned TC kernel; `image` is forwarded to `tc_mlp3` via *args.
mytime(10000, "raw tuned options\t", tc_mlp3, image)

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 117us, p50 119us, p90 121us, max 679us
raw tuned options	 Total CPU launch + GPU kernel time: min 149us, p50 153us, p90 154us, max 694us


In [19]:
# Benchmark the PyTorch reference; `image` and `params` are forwarded via *args.
mytime(10000, "Pytorch\t", torch_mlp3, image, params)

#################################################################
Pytorch	 Total CPU time to launch kernel: min 90us, p50 94us, p90 99us, max 1800us
Pytorch	 Total CPU launch + GPU kernel time: min 103us, p50 120us, p90 152us, max 1817us
