In [1]:
import tc_composer
import torch
from torch import Tensor
from torch import nn
import tensor_comprehensions as tc
from tc_composer.func.affine_transform import FusedAffineTransform, AffineTransform
from tc_composer.func.function_with_params import Composition
from tc_composer.func.activation import Activation

[INFO] tc_composer.settings - Setting default tensor type: torch.cuda.FloatTensor
[INFO] tc_composer.settings - Setting epsilon: 1e-16
[INFO] tc_composer.settings - Input tensor shape checking: False
[INFO] tc_composer.settings - Saving compiled options in: /home/ubuntu/tc_composer/options


In [2]:
# Problem definition shared by every implementation below.
SEED = 0
torch.manual_seed(SEED)  # make the random input reproducible across kernel restarts

batch_size = 2
in_n = 128
# Output widths of the successive layers: 60, 61, ..., 66.
hiddens = tuple(range(60, 67))
# One 'relu' entry per layer.
activations = ('relu',) * len(hiddens)
inp = torch.randn(batch_size, in_n)

### Three different implementations

In [3]:
# Implementation 1: the whole layer stack expressed as one FusedAffineTransform.
fused_aff = FusedAffineTransform(
    in_n=in_n,
    hiddens=hiddens,
    activations=activations,
)

In [4]:
# Print the Tensor Comprehensions definition that fused_aff produces for `inp`.
print(fused_aff.tc_def(inp))

def FusedAffineTransform(
    float(2, 128) input,
    float(60, 128) weight,
    float(60) bias,
    float(61, 60) weight1,
    float(61) bias1,
    float(62, 61) weight2,
    float(62) bias2,
    float(63, 62) weight3,
    float(63) bias3,
    float(64, 63) weight4,
    float(64) bias4,
    float(65, 64) weight5,
    float(65) bias5,
    float(66, 65) weight6,
    float(66) bias6
) -> (
    float(2, 60) output,
    float(2, 61) output1,
    float(2, 62) output2,
    float(2, 63) output3,
    float(2, 64) output4,
    float(2, 65) output5,
    float(2, 66) output6
)
{
    output(b, n) +=! input(b, i) * weight(n, i)
    output1(b, n) +=! fmax((output(b, i) + bias(i)), 0) * weight1(n, i)
    output2(b, n) +=! fmax((output1(b, i) + bias1(i)), 0) * weight2(n, i)
    output3(b, n) +=! fmax((output2(b, i) + bias2(i)), 0) * weight3(n, i)
    output4(b, n) +=! fmax((output3(b, i) + bias3(i)), 0) * weight4(n, i)
    output5(b, n) +=! fmax((output4(b, i) + bias4(i)), 0) * weight5(n, i)
    outp

In [5]:
def aff_yielder():
    """Yield an AffineTransform followed by its Activation for every layer.

    Layer k maps from the previous layer's width (``in_n`` for the first
    layer) to ``hiddens[k]`` and is followed by ``activations[k]``.
    """
    layer_inputs = (in_n, *hiddens[:-1])
    for n_in, n_out, act_name in zip(layer_inputs, hiddens, activations):
        yield AffineTransform(n_in, n_out)
        yield Activation(act_name)


# Implementation 2: the same stack built as a Composition of separate pieces.
tc_aff = Composition(*aff_yielder())

In [6]:
# Print the Tensor Comprehensions definition that tc_aff produces for `inp`.
print(tc_aff.tc_def(inp))

def Composition(
    float(2, 128) input1,
    float(60, 128) weight7,
    float(60) bias7,
    float(61, 60) weight8,
    float(61) bias8,
    float(62, 61) weight9,
    float(62) bias9,
    float(63, 62) weight10,
    float(63) bias10,
    float(64, 63) weight11,
    float(64) bias11,
    float(65, 64) weight12,
    float(65) bias12,
    float(66, 65) weight13,
    float(66) bias13
) -> (
    float(2, 60) output7,
    float(2, 60) output8,
    float(2, 61) output9,
    float(2, 61) output10,
    float(2, 62) output11,
    float(2, 62) output12,
    float(2, 63) output13,
    float(2, 63) output14,
    float(2, 64) output15,
    float(2, 64) output16,
    float(2, 65) output17,
    float(2, 65) output18,
    float(2, 66) output19,
    float(2, 66) output20
)
{
    output7(b, n) +=! input1(b, i) * weight7(n, i)
    output7(b, n) = output7(b, n) + bias7(n)
    
    output8(b, i) = fmax(output7(b, i), 0)
    
    output9(b, n) +=! output8(b, i) * weight8(n, i)
    output9(b, n) = output9

In [7]:
# Parameters for the plain-PyTorch version: anything with more than one
# dimension is transposed (and made contiguous) so torch_aff can compute
# `activation @ weight`; the remaining parameters are used unchanged.
params = [
    p.transpose(0, 1).contiguous() if p.dim() > 1 else p
    for p in tc_aff.params
]

In [8]:
from torch import nn, matmul, relu_ as relu_inplace

def torch_aff(inp, params=params):
    """Plain-PyTorch reference: matmul, bias add and ReLU for each layer.

    Args:
        inp: Input tensor, right-multiplied by the first weight.
        params: Alternating sequence ``weight, bias, weight, bias, ...``
            (even positions are weights, odd positions are biases).
            Defaults to the module-level ``params`` as bound when this
            function is defined.

    Returns:
        The activation produced by the last layer.
    """
    flat = tuple(params)
    activation = inp
    for weight, bias in zip(flat[0::2], flat[1::2]):
        pre_activation = matmul(activation, weight)
        pre_activation.add_(bias)  # in place on the fresh matmul result
        activation = relu_inplace(pre_activation)

    return activation

In [9]:
# Give fused_aff the same parameter values as tc_aff so their outputs can be
# compared: each fused parameter's data becomes a view (shaped like the fused
# parameter) of the corresponding tc_aff parameter's data.
for fused_param, composed_param in zip(fused_aff.params, tc_aff.params):
    fused_param.data = composed_param.data.view_as(fused_param)

### Correctness

In [10]:
# Compile both TC modules for the shape of `inp` before they are called.
tc_aff.recompile(inp)
fused_aff.recompile(inp)

[INFO] Composition - Compiling for input shape - [(2, 128)].
[INFO] Composition - Option loaded from file for input shape - [(2, 128)].
[INFO] FusedAffineTransform - Compiling for input shape - [(2, 128)].
[INFO] FusedAffineTransform - Option loaded from file for input shape - [(2, 128)].


In [12]:
import numpy as np

# All three implementations must agree. Results are copied to host memory
# explicitly because NumPy cannot read CUDA tensors directly; the Composition
# output is computed once and used as the reference for both comparisons.
reference_out = tc_aff(inp).data.cpu().numpy()
np.testing.assert_allclose(fused_aff(inp).data.cpu().numpy(), reference_out, rtol=1e-4)
np.testing.assert_allclose(torch_aff(inp, params).data.cpu().numpy(), reference_out, rtol=1e-4)

### Benchmark

In [16]:
import time


def _summarize_us(samples):
    """Return (min, p50, p90, max) of `samples` (seconds) as whole microseconds."""
    ordered = sorted(samples)
    count = len(ordered)
    picks = (
        ordered[0],
        ordered[count // 2],
        ordered[(count * 9) // 10],
        ordered[count - 1],
    )
    return tuple(int(seconds * 1e6) for seconds in picks)


def mytime(iters, prepend, runFun, *args):
    """Time `runFun(*args)` over `iters` calls and print latency statistics.

    Two durations are recorded per call: the time until `runFun` returns
    (launch time seen by the CPU) and the time until the following CUDA
    synchronization completes (launch plus GPU execution).

    Args:
        iters: Number of timed calls; must be at least 1.
        prepend: Label printed at the start of each statistics line.
        runFun: Callable to benchmark.
        *args: Positional arguments forwarded to `runFun`.

    Raises:
        ValueError: If `iters` is smaller than 1 (there would be no samples
            to summarize).
    """
    if iters < 1:
        raise ValueError("iters must be at least 1, got {!r}".format(iters))

    # Synchronize only when a CUDA device exists, so the helper also runs on
    # CPU-only hosts (both timings then measure the same thing).
    synchronize = torch.cuda.synchronize if torch.cuda.is_available() else (lambda: None)

    timesCPU = []
    timesCPUAndGPU = []
    for _ in range(iters):
        synchronize()
        # perf_counter is a monotonic wall clock; time.clock() no longer
        # exists on Python 3.8+ and measured CPU time on Unix.
        start = time.perf_counter()
        runFun(*args)
        timesCPU.append(time.perf_counter() - start)
        synchronize()
        timesCPUAndGPU.append(time.perf_counter() - start)

    print("#################################################################")
    print(
        "{} Total CPU time to launch kernel: min {}us, p50 {}us, p90 {}us, max {}us".
        format(prepend, *_summarize_us(timesCPU)))
    print(
        "{} Total CPU launch + GPU kernel time: min {}us, p50 {}us, p90 {}us, max {}us".
        format(prepend, *_summarize_us(timesCPUAndGPU)))

In [33]:
# Benchmark section run: 10000 timed calls of the Composition module (tc_aff).
mytime(10000, "raw tuned options\t", lambda: tc_aff(inp))

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 147us, p50 151us, p90 156us, max 298us
raw tuned options	 Total CPU launch + GPU kernel time: min 968us, p50 978us, p90 994us, max 2015us


In [34]:
# Benchmark section run: 10000 timed calls of the fused module (fused_aff).
mytime(10000, "raw tuned options\t", lambda: fused_aff(inp))

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 141us, p50 144us, p90 149us, max 695us
raw tuned options	 Total CPU launch + GPU kernel time: min 967us, p50 979us, p90 984us, max 1261us


In [46]:
# Benchmark section run: 10000 timed calls of the plain-PyTorch version (torch_aff).
mytime(10000, "raw tuned options\t", lambda: torch_aff(inp, params))

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 222us, p50 226us, p90 230us, max 2113us
raw tuned options	 Total CPU launch + GPU kernel time: min 235us, p50 245us, p90 294us, max 2133us


### Tuning

In [18]:
# Autotune the mapping options of both TC modules on `inp`, with the tuner's
# number_elites set to 15. The value returned by the last call is what the
# notebook displays as this cell's output.
fused_aff.tune_options([inp], tuner_config=tc.TunerConfig().number_elites(15))
tc_aff.tune_options([inp], tuner_config=tc.TunerConfig().number_elites(15))

[INFO] FusedAffineTransform - Appending results to /home/ubuntu/tc_composer/options/FusedAffineTransform_Tesla_K80
[INFO] Composition - Loading start options from file - /home/ubuntu/tc_composer/options/Composition_Tesla_K80
[INFO] Composition - Option loaded from file for input shape - [(2, 128)].
[INFO] Composition - Appending results to /home/ubuntu/tc_composer/options/Composition_Tesla_K80


<tensor_comprehensions.tclib.MappingOptions at 0x7fd24005cce0>

In [19]:
# Recompile both TC modules for `inp` now that tuning has run.
tc_aff.recompile(inp)
fused_aff.recompile(inp)

[INFO] Composition - Compiling for input shape - [(2, 128)].
[INFO] Composition - Option loaded from file for input shape - [(2, 128)].
[INFO] FusedAffineTransform - Compiling for input shape - [(2, 128)].
[INFO] FusedAffineTransform - Option loaded from file for input shape - [(2, 128)].


In [47]:
# Tuning section run: 10000 timed calls of the Composition module (tc_aff).
mytime(10000, "raw tuned options\t", lambda: tc_aff(inp))

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 147us, p50 151us, p90 153us, max 625us
raw tuned options	 Total CPU launch + GPU kernel time: min 184us, p50 188us, p90 190us, max 652us


In [50]:
# Tuning section run: 10000 timed calls of the fused module (fused_aff).
mytime(10000, "raw tuned options\t", lambda: fused_aff(inp))

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 141us, p50 144us, p90 147us, max 741us
raw tuned options	 Total CPU launch + GPU kernel time: min 177us, p50 181us, p90 183us, max 788us


In [42]:
# Tuning section run: 10000 timed calls of the plain-PyTorch version (torch_aff).
mytime(10000, "raw tuned options\t", lambda: torch_aff(inp, params))

#################################################################
raw tuned options	 Total CPU time to launch kernel: min 211us, p50 216us, p90 223us, max 2212us
raw tuned options	 Total CPU launch + GPU kernel time: min 223us, p50 242us, p90 302us, max 2246us
