Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Support TensorComprehensions #1122

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "third_party/TensorComprehensions"]
path = third_party/TensorComprehensions
url = https://github.com/facebookresearch/TensorComprehensions.git
1 change: 1 addition & 0 deletions cupy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ def isscalar(num):
from cupy.core import ElementwiseKernel # NOQA
from cupy.core import RawKernel # NOQA
from cupy.core import ReductionKernel # NOQA
from cupy.core import TCKernel # NOQA

# -----------------------------------------------------------------------------
# DLPack
Expand Down
1 change: 1 addition & 0 deletions cupy/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,4 @@
from cupy.core.internal import complete_slice # NOQA
from cupy.core.internal import get_size # NOQA
from cupy.core.raw import RawKernel # NOQA
from cupy.core.tc import TCKernel # NOQA
156 changes: 156 additions & 0 deletions cupy/core/tc.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from libcpp cimport bool
from libcpp.functional cimport function
from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.unordered_map cimport unordered_map
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t

from cupy.core.dlpack cimport DLTensor
from cupy.core.dlpack cimport DLManagedTensor


# Exposes the `duration` type found in the C++ scope
# std::chrono::high_resolution_clock (header <chrono>) as an opaque class:
# no members are declared, so Cython code can only hold or pass such values.
# In this file it is used as the return type of ExecutionEngine.run.
cdef extern from "<chrono>" namespace "std::chrono::high_resolution_clock":

cdef cppclass duration:
pass


# Opaque declaration of ExecutorInfo, looked up in the C++ scope
# tc::ExecutionEngine (header tc/core/execution_engine.h). No members are
# exposed; in this file it only appears behind a const pointer in the
# callback signature typedef'd just below.
cdef extern from "tc/core/execution_engine.h" namespace "tc::ExecutionEngine":

cdef cppclass ExecutorInfo:
pass


# `f` is the plain function signature bool(const ExecutorInfo*).
# `prunning_ftype` wraps that signature in std::function (libcpp.functional);
# it is the type of the `prunningFunction` argument of ExecutionEngine.run.
ctypedef bool f(const ExecutorInfo*)
ctypedef function[f] prunning_ftype


cdef extern from "tc/core/execution_engine.h" namespace "tc":

    # Cython binding for the C++ class tc::ExecutionEngine.
    #
    # Every member is declared `except +` so that a C++ exception thrown by
    # the library is converted into a Python exception by Cython instead of
    # propagating through the generated code (previously only the
    # constructor had this).
    cdef cppclass ExecutionEngine:

        ExecutionEngine() except +

        # Takes TC source text in `language`.
        void define(const string& language) except +

        # Returns a size_t; presumably the `handle` later passed to run()
        # -- confirm against the TC headers.
        size_t compile(
            const string& name,
            const vector[const DLTensor*]& inputs,
            const MappingOptions& options) except +

        # Returns DLTensor descriptors for the outputs of the TC `name`
        # given the input descriptors.
        # NOTE(review): ownership/lifetime of the returned pointers is not
        # visible from this declaration -- confirm.
        vector[const DLTensor*] inferOutputTensorInfo(
            const string& name,
            const vector[const DLTensor*]& inTensorPtrs) except +

        # Runs using `handle` with the given inputs/outputs and returns a
        # std::chrono duration. `prunningFunction` is a
        # std::function<bool(const ExecutorInfo*)> callback.
        duration run(
            size_t handle,
            const vector[const DLTensor*]& inputs,
            const vector[DLTensor*]& outputs,
            bool profile,
            prunning_ftype prunningFunction) except +


# Binding for tc::MappingOptionsProto from mapping_options.pb.h.
# Presumably a protobuf-generated message class (the header name ends in
# .pb.h and the method names match the protobuf Message API) -- confirm.
# Only a default constructor and three query/serialization methods are exposed.
cdef extern from "mapping_options.pb.h" namespace "tc":

cdef cppclass MappingOptionsProto:

MappingOptionsProto() except +
size_t ByteSizeLong()
string SerializeAsString()
bool IsInitialized()


# Binding for tc::MappingOptions (header tc/core/mapping_options.h).
# Exposes construction from a string, serialization back to a string, six
# static factory functions that each return a MappingOptions value, and the
# `proto` data member.
cdef extern from "tc/core/mapping_options.h" namespace "tc":

cdef cppclass MappingOptions:

# Builds options from `str`; presumably the serialized form produced by
# toProtobufSerializedString() -- confirm.
# NOTE(review): no default constructor is declared and this constructor is
# not marked `except +`.
MappingOptions(const string& str)
string toProtobufSerializedString()

# Static factories for preset option sets; what each preset contains is
# defined by the library and is not visible here.
@staticmethod
MappingOptions makeNaiveMappingOptions()

@staticmethod
MappingOptions makeSingleThreadMappingOptions()

@staticmethod
MappingOptions makePointwiseMappingOptions()

@staticmethod
MappingOptions makeMlpMappingOptions()

@staticmethod
MappingOptions makeConvolutionMappingOptions()

@staticmethod
MappingOptions makeGroupConvolutionMappingOptions()

# Data member of type MappingOptionsProto (declared above).
MappingOptionsProto proto


# Minimal binding for the class template llvm::Optional<T>
# (header llvm/ADT/Optional.h). Only the two accessors below are exposed;
# in this file it is the return type of GeneticAutotuner.tune.
# NOTE(review): no presence check (such as hasValue) is declared here, so
# callers cannot test for an empty Optional through this binding -- confirm
# how the empty case is handled by the caller.
cdef extern from "llvm/ADT/Optional.h" namespace "llvm":

cdef cppclass Optional[T]:

T* getPointer()
T& getValue()


cdef extern from "tc/autotuner/parameters.h" namespace "tc::autotune":

    # Binding for tc::autotune::TuningParameterFixer; passed by const
    # reference as `fixedParams` to GeneticAutotuner.tune.
    #
    # The method is declared here, on the first definition of the class,
    # because Cython keeps the first definition of a cppclass and ignores
    # any later redefinition. With an empty (`pass`) body here, a later
    # declaration of fromMappingOptions would never become visible.
    cdef cppclass TuningParameterFixer:

        void fromMappingOptions(const MappingOptions& options)


# Aliases for vectors of DLTensor pointers (const and mutable pointees).
# They are used as the mapped value types of the unordered_map arguments
# of GeneticAutotuner.tune.
ctypedef vector[const DLTensor*] ConstDLTensorVec
ctypedef vector[DLTensor*] DLTensorVec


cdef extern from "tc/autotuner/genetic_autotuner.h" namespace "tc::autotune::detail":

    # Cython binding for tc::autotune::detail::GeneticAutotuner.
    #
    # All members are declared `except +` (previously only the constructor
    # was) so that C++ exceptions thrown by the library are converted into
    # Python exceptions instead of propagating through generated code.
    cdef cppclass GeneticAutotuner:

        # Constructed from a string `tc`.
        GeneticAutotuner(const string& tc) except +

        void storeCaches(const string& filename) except +

        # Returns a vector of MappingOptions for `tcName` and `inputs` using
        # `cacheFileName`; `numCandidates` presumably bounds the number of
        # results -- confirm.
        vector[MappingOptions] load(
            const string& cacheFileName,
            const string& tcName,
            const vector[const DLTensor*]& inputs,
            const size_t numCandidates) except +

        # Returns an llvm::Optional<MappingOptions>. `inputs` and `outputs`
        # are maps keyed by size_t (presumably a GPU/device index -- confirm)
        # to vectors of DLTensor pointers.
        Optional[MappingOptions] tune(
            const string& cacheFileName,
            const string& tcName,
            const unordered_map[size_t, ConstDLTensorVec]& inputs,
            unordered_map[size_t, DLTensorVec]& outputs,
            MappingOptions baseMapping,
            vector[MappingOptions] startingPoints,
            const TuningParameterFixer& fixedParams) except +


# Global variables declared in tc/core/flags.h (namespace tc), exposed so
# Cython code can read or assign them directly.
# Presumably gflags-style settings for the autotuner, judging by the
# FLAGS_tuner_ prefix -- confirm; their meanings and defaults are defined by
# the library and are not visible here.
cdef extern from "tc/core/flags.h" namespace "tc":

cdef uint32_t FLAGS_tuner_gen_pop_size
cdef uint32_t FLAGS_tuner_gen_crossover_rate
cdef uint32_t FLAGS_tuner_gen_mutation_rate
cdef uint32_t FLAGS_tuner_gen_generations
cdef uint32_t FLAGS_tuner_gen_number_elites
cdef uint32_t FLAGS_tuner_threads
cdef string FLAGS_tuner_gpus
cdef bool FLAGS_tuner_print_best
cdef string FLAGS_tuner_proto
cdef string FLAGS_tuner_rng_restore
cdef bool FLAGS_tuner_gen_restore_from_proto
cdef uint32_t FLAGS_tuner_gen_restore_number
cdef bool FLAGS_tuner_gen_log_generations
cdef uint64_t FLAGS_tuner_min_launch_total_threads


# Free function tc::autotune::detail::parseGpus from
# tc/autotuner/genetic_tuning_harness.h. Takes no arguments and returns a
# vector of size_t; presumably the GPU indices parsed from FLAGS_tuner_gpus
# -- confirm against the library.
cdef extern from "tc/autotuner/genetic_tuning_harness.h" namespace "tc::autotune::detail":

vector[size_t] parseGpus()


# Declares tc::autotune::TuningParameterFixer with its fromMappingOptions
# method, which takes a MappingOptions by const reference.
# NOTE(review): this class is also declared earlier in this file (just
# before the ConstDLTensorVec typedefs). Cython may keep only the first
# definition of a cppclass and ignore a later one -- confirm, and
# consolidate into a single declaration.
cdef extern from "tc/autotuner/parameters.h" namespace "tc::autotune":

cdef cppclass TuningParameterFixer:

void fromMappingOptions(const MappingOptions& options)
Loading