In [1]:
import tensor_comprehensions as tc
import torch
import tc_composer
from torch.nn import Conv2d
from tc_composer.func.conv import Convolution

[INFO] tc_composer.settings - Setting default tensor type: torch.cuda.FloatTensor
[INFO] tc_composer.settings - Setting epsilon: 1e-16
[INFO] tc_composer.settings - Input tensor shape checking: False
[INFO] tc_composer.settings - Saving compiled options in: /home/ubuntu/tc_composer/options


# Grouped convolution (gconv)

In [2]:
def make_convs(N, G, F, C, W, H):
    """Build a grouped TC convolution and a matching stock PyTorch ``Conv2d``.

    Both layers use a kernel of half the input's spatial size
    (``(H // 2, W // 2)``), zero padding and ``G`` groups.

    Args:
        N: Batch size.
        G: Number of groups.
        F: Output channels per group.
        C: Input channels per group.
        W: Input width.
        H: Input height.

    Returns:
        ``(tc_conv, tc_image, torch_conv, torch_image)`` where ``tc_image`` is
        a random ``(N, G, C, H, W)`` tensor and ``torch_image`` is a view of the
        same data shaped ``(N, C * G, H, W)``.
    """
    kernel = (H // 2, W // 2)
    no_padding = (0, 0)
    unit_stride = (1, 1)

    # TC side: the group axis is kept as its own dimension of the input.
    tc_image = torch.randn(N, G, C, H, W)
    tc_conv = Convolution(in_channels=C, out_channels=F, kernel_size=kernel, padding=no_padding, groups=G)

    # PyTorch side: groups are folded into the channel axis.
    torch_image = tc_image.view(N, C * G, H, W)
    torch_conv = Conv2d(C * G, F * G, kernel, padding=no_padding, stride=unit_stride, groups=G)

    # Point the stock layer at reshaped views of the TC layer's parameters so
    # both layers compute with the same values.
    # presumably params[0] is the weight and params[1] the bias -- confirm in tc_composer
    shared_weight = tc_conv.params[0].data.view_as(torch_conv.weight)
    shared_bias = tc_conv.params[1].data.view_as(torch_conv.bias)
    torch_conv.weight.data = shared_weight
    torch_conv.bias.data = shared_bias

    return tc_conv, tc_image, torch_conv, torch_image

# Round 0

In [3]:
# First configuration: batch 32, 32 groups, 16 -> 16 channels per group, 14x14 input.
tc_conv0, tc_image0, torch_conv0, torch_image0 = make_convs(N=32, G=32, F=16, C=16, W=14, H=14)

In [4]:
# Show the TC definition that tc_conv0 produces for this input.
print(tc_conv0.tc_def(tc_image0))

def Convolution(
    float(32, 32, 16, 14, 14) input,
    float(32, 16, 16, 7, 7) weight,
    float(32, 16) bias
) -> (
    float(32, 32, 16, 7, 7) output
)
{
    output(n, g, m, h, w) +=! input(n, g, c, h + kh, w + kw) * weight(g, m, c, kh, kw) 
        where kh in 0:7, kw in 0:7, h in 0:8, w in 0:8
    output(n, g, m, h, w) = output(n, g, m, h, w) + bias(g, m)
         where h in 0:8, w in 0:8
}


In [33]:
# Tune in three consecutive passes, each one warm-started from the result of
# the previous pass (the first starts from None). Only the number of
# generations and the crossover rate differ between passes.
option0 = None
for n_generations, crossover in ((7, 40), (7, 20), (3, 5)):
    option0 = tc_conv0.tune_options(
        [tc_image0],
        start_option=option0,
        tuner_config=(
            tc.TunerConfig()
            .pop_size(150)
            .generations(n_generations)
            .mutation_rate(8)
            .number_elites(30)
            .crossover_rate(crossover)
        ),
    )
tc_conv0.recompile(tc_image0, option=option0)

[INFO] Convolution - Compiling for input shape - [(32, 32, 16, 14, 14)].


In [73]:
# Show the mapping options returned by the tuning passes on tc_conv0.
print(option0)

generic_mapping_options {
  outer_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  intra_tile_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  fix_parameters_before_scheduling: true
  tiling {
    sizes: 1
    sizes: 1
    sizes: 1
    sizes: 32
  }
  unroll: 32
  tile_imperfectly_nested: false
  match_library_calls: true
}
block {
  x: 8
  y: 16
}
grid {
  x: 64
  y: 64
  z: 14
}
use_shared_memory: true
use_private_memory: false
unroll_copy_shared: true
use_readonly_cache: false



# Benchmark

In [42]:
import time
# Define a timing function to print some results
def time_tc(iters, prepend, runFun, *args):
    """Benchmark ``runFun(*args)`` and print launch / end-to-end latency statistics.

    Each iteration records two wall-clock durations, both measured from just
    before the call:

    * until ``runFun`` returns (the host-side cost of launching the work), and
    * until a following ``torch.cuda.synchronize()`` returns (launch plus the
      time for the queued GPU work to finish).

    The min, median (p50), 90th percentile (p90) and max of both series are
    printed in whole microseconds. Nothing is returned.

    Args:
        iters: Number of timed calls; must be at least 1.
        prepend: Label printed at the start of each statistics line.
        runFun: Callable to benchmark.
        *args: Positional arguments forwarded to ``runFun``.

    Raises:
        ValueError: If ``iters`` is smaller than 1.
    """
    if iters < 1:
        # Without samples the percentile lookups below would fail with an
        # uninformative IndexError.
        raise ValueError("iters must be at least 1, got {!r}".format(iters))

    # time.clock() was removed in Python 3.8 and reported process CPU time on
    # Unix; perf_counter() is the monotonic wall-clock timer meant for this.
    timer = time.perf_counter

    # torch.cuda.synchronize() raises on machines without CUDA, so only
    # synchronize when a device is actually present.
    has_cuda = torch.cuda.is_available()

    def sync():
        if has_cuda:
            torch.cuda.synchronize()

    def report(title, samples):
        # Print min / p50 / p90 / max of the samples, converted to microseconds.
        ordered = sorted(samples)
        count = len(ordered)
        print("{} {}: min {}us, p50 {}us, p90 {}us, max {}us".format(
            prepend,
            title,
            int(ordered[0] * 1e6),
            int(ordered[count // 2] * 1e6),
            int(ordered[(count * 9) // 10] * 1e6),
            int(ordered[count - 1] * 1e6),
        ))

    timesCPU = []
    timesCPUAndGPU = []
    for _ in range(iters):
        sync()  # drain earlier GPU work so it is not charged to this iteration
        start = timer()
        runFun(*args)
        timesCPU.append(timer() - start)
        sync()  # wait for the work launched by runFun to complete
        timesCPUAndGPU.append(timer() - start)

    print("#################################################################")
    report("Total CPU time to launch kernel", timesCPU)
    report("Total CPU launch + GPU kernel time", timesCPUAndGPU)

In [82]:
# Time 10000 calls of the tuned TC convolution on tc_image0.
time_tc(10000, "raw unchecked_run tuned options\t", tc_conv0, tc_image0)

#################################################################
raw unchecked_run tuned options	 Total CPU time to launch kernel: min 97us, p50 99us, p90 104us, max 532us
raw unchecked_run tuned options	 Total CPU launch + GPU kernel time: min 723us, p50 736us, p90 740us, max 1211us


In [57]:
# Make sure the stock Conv2d module lives on the GPU before it is timed.
torch_conv0 = torch_conv0.cuda()

In [58]:
# Time 10000 forward passes of the stock PyTorch Conv2d on the same data
# (torch_image0 is tc_image0 viewed with groups folded into channels).
time_tc(10000, "stock\t", torch_conv0.forward, torch_image0)

#################################################################
stock	 Total CPU time to launch kernel: min 871us, p50 881us, p90 901us, max 1758us
stock	 Total CPU launch + GPU kernel time: min 963us, p50 1064us, p90 1084us, max 1889us


# Round 1

In [74]:
# Request a (64, 64, 256) block mapping on option0 and print the result.
# NOTE(review): option0 is reused below as the warm start for the next tuning
# run; presumably mapToBlocks updates option0 in place -- confirm.
print(option0.mapToBlocks((64, 64, 256)))

generic_mapping_options {
  outer_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  intra_tile_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  fix_parameters_before_scheduling: true
  tiling {
    sizes: 1
    sizes: 1
    sizes: 1
    sizes: 32
  }
  unroll: 32
  tile_imperfectly_nested: false
  match_library_calls: true
}
block {
  x: 8
  y: 16
}
grid {
  x: 64
  y: 64
  z: 256
}
use_shared_memory: true
use_private_memory: false
unroll_copy_shared: true
use_readonly_cache: false



In [60]:
# Second configuration: batch 32, 32 groups, 32 -> 32 channels per group, 7x7 input.
tc_conv1, tc_image1, torch_conv1, torch_image1 = make_convs(N=32, G=32, F=32, C=32, W=7, H=7)

In [75]:
# TC Conv: tune for the new shape, warm-starting from the options found for
# the previous configuration, then recompile with the result.
option1 = tc_conv1.tune_options(
    [tc_image1],
    start_option=option0,
    tuner_config=(
        tc.TunerConfig()
        .pop_size(150)
        .generations(5)
        .mutation_rate(8)
        .number_elites(30)
        .crossover_rate(10)
    ),
)
tc_conv1.recompile(tc_image1, option=option1)

[INFO] Convolution - Appending results to /home/ubuntu/tc_composer/options/Convolution_Tesla_V100-SXM2-16GB
[INFO] Convolution - Compiling for input shape - [(32, 32, 32, 7, 7)].


In [76]:
# Time 10000 calls of the tuned TC convolution on tc_image1.
time_tc(10000, "raw unchecked_run tuned options\t", tc_conv1, tc_image1)

#################################################################
raw unchecked_run tuned options	 Total CPU time to launch kernel: min 97us, p50 99us, p90 103us, max 52478us
raw unchecked_run tuned options	 Total CPU launch + GPU kernel time: min 307us, p50 318us, p90 334us, max 52530us


In [63]:
# Time 10000 forward passes of the stock PyTorch Conv2d on torch_image1.
time_tc(10000, "stock\t", torch_conv1.forward, torch_image1)

#################################################################
stock	 Total CPU time to launch kernel: min 281us, p50 288us, p90 297us, max 1212us
stock	 Total CPU launch + GPU kernel time: min 486us, p50 512us, p90 518us, max 1366us


In [77]:
# Show the mapping options returned by the tuning run on tc_conv1.
print(option1)

generic_mapping_options {
  outer_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  intra_tile_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  fix_parameters_before_scheduling: true
  tiling {
    sizes: 16
    sizes: 1
    sizes: 1
    sizes: 32
  }
  unroll: 32
  tile_imperfectly_nested: false
  match_library_calls: true
}
block {
  x: 8
  y: 16
}
grid {
  x: 64
  y: 64
  z: 256
}
use_shared_memory: true
use_private_memory: false
unroll_copy_shared: true
use_readonly_cache: false



# Round 2

In [89]:
# Third configuration: batch 32, 32 groups, 4 -> 4 channels per group, 56x56 input.
tc_conv2, tc_image2, torch_conv2, torch_image2 = make_convs(N=32, G=32, F=4, C=4, W=56, H=56)

# Tune with option1 as the warm start, then recompile with the result.
option2 = tc_conv2.tune_options(
    [tc_image2],
    start_option=option1,
    tuner_config=(
        tc.TunerConfig()
        .pop_size(150)
        .generations(10)
        .mutation_rate(8)
        .number_elites(30)
        .crossover_rate(40)
    ),
)
tc_conv2.recompile(tc_image2, option=option2)

[INFO] Convolution - Appending results to /home/ubuntu/tc_composer/options/Convolution_Tesla_V100-SXM2-16GB
[INFO] Convolution - Compiling for input shape - [(32, 32, 4, 56, 56)].


In [92]:
# Time 10000 calls of the tuned TC convolution on tc_image2.
time_tc(10000, "raw unchecked_run tuned options\t", tc_conv2, tc_image2)

#################################################################
raw unchecked_run tuned options	 Total CPU time to launch kernel: min 104us, p50 106us, p90 109us, max 514us
raw unchecked_run tuned options	 Total CPU launch + GPU kernel time: min 9837us, p50 12439us, p90 12531us, max 14591us


In [93]:
# Time 10000 forward passes of the stock PyTorch Conv2d on torch_image2.
time_tc(10000, "stock\t", torch_conv2.forward, torch_image2)

#################################################################
stock	 Total CPU time to launch kernel: min 24462us, p50 24659us, p90 24754us, max 26766us
stock	 Total CPU launch + GPU kernel time: min 34871us, p50 35171us, p90 35284us, max 36760us


In [90]:
# Show the mapping options returned by the tuning run on tc_conv2.
print(option2)

generic_mapping_options {
  outer_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  intra_tile_schedule_options {
    fusion_strategy: Max
    allow_skewing: false
    positive_orthant: true
  }
  fix_parameters_before_scheduling: false
  tiling {
    sizes: 1
    sizes: 2
    sizes: 1
    sizes: 0
  }
  unroll: 4
  tile_imperfectly_nested: false
  match_library_calls: false
}
block {
  x: 256
}
grid {
  x: 56
  y: 28
  z: 64
}
use_shared_memory: false
use_private_memory: true
unroll_copy_shared: true
use_readonly_cache: true

