Skip to content

Commit

Permalink
Merge pull request #7934 from leofang/jitify_cache
Browse files Browse the repository at this point in the history
Allow Jitify to only cache CuPy-owned headers
  • Loading branch information
kmaehashi committed Dec 1, 2023
2 parents d9fd6de + 50f0c7f commit 27cafa0
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 62 deletions.
38 changes: 22 additions & 16 deletions cupy/_core/core.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2180,22 +2180,7 @@ cdef inline str _translate_cucomplex_to_thrust(str source):
return ''.join(lines)


cpdef function.Module compile_with_cache(
str source, tuple options=(), arch=None, cachd_dir=None,
prepend_cupy_headers=True, backend='nvrtc', translate_cucomplex=False,
enable_cooperative_groups=False, name_expressions=None,
log_stream=None, bint jitify=False):
if translate_cucomplex:
source = _translate_cucomplex_to_thrust(source)
cupy_header_list.append('cupy/cuComplex_bridge.h')
prepend_cupy_headers = True

if prepend_cupy_headers:
source = _cupy_header + source
if jitify:
source = '#include <cupy/cuda_workaround.h>\n' + source
extra_source = _get_header_source()

cpdef tuple assemble_cupy_compiler_options(tuple options):
for op in options:
if '-std=c++' in op:
if op.endswith('03'):
Expand Down Expand Up @@ -2245,6 +2230,27 @@ cpdef function.Module compile_with_cache(
if _cuda_path is not None:
options += ('-I' + os.path.join(_cuda_path, 'include'),)

return options


cpdef function.Module compile_with_cache(
str source, tuple options=(), arch=None, cachd_dir=None,
prepend_cupy_headers=True, backend='nvrtc', translate_cucomplex=False,
enable_cooperative_groups=False, name_expressions=None,
log_stream=None, bint jitify=False):
if translate_cucomplex:
source = _translate_cucomplex_to_thrust(source)
cupy_header_list.append('cupy/cuComplex_bridge.h')
prepend_cupy_headers = True

if prepend_cupy_headers:
source = _cupy_header + source
if jitify:
source = '#include <cupy/cuda_workaround.h>\n' + source
extra_source = _get_header_source()

options = assemble_cupy_compiler_options(options)

return cuda.compiler._compile_module_with_cache(
source, options, arch, cachd_dir, extra_source, backend,
enable_cooperative_groups=enable_cooperative_groups,
Expand Down
1 change: 1 addition & 0 deletions cupy/cuda/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def _jitify_prep(source, options, cu_path):
global _jitify_header_source_map_populated
if not _jitify_header_source_map_populated:
from cupy._core import core
jitify._init_module()
jitify._add_sources(core._get_header_source_map())
_jitify_header_source_map_populated = True

Expand Down
2 changes: 2 additions & 0 deletions cupy/cuda/cupy_jitify.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
namespace jitify {
namespace detail {
const char* jitify_ver = _xstr_(CUPY_JITIFY_VERSION_CODE);
const char* cupy_cache_key = _xstr_(CUPY_CACHE_KEY);
} // namespace detail
} // namespace jitify

Expand All @@ -19,6 +20,7 @@ namespace jitify {
namespace detail {

const char* jitify_ver = _xstr_(CUPY_JITIFY_VERSION_CODE);
const char* cupy_cache_key = _xstr_(CUPY_CACHE_KEY);
std::map<std::string, std::string>& get_jitsafe_headers_map();
const int preinclude_jitsafe_headers_count = 0;
const char* preinclude_jitsafe_header_names[] = {};
Expand Down
176 changes: 131 additions & 45 deletions cupy/cuda/jitify.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ from libcpp.map cimport map as cpp_map
from libcpp.string cimport string as cpp_str
from libcpp.vector cimport vector

from cupy.cuda import cub

import atexit
import json
import os
import pickle
import re
import tempfile

from cupy._environment import get_cuda_path
from cupy.cuda import cub


###############################################################################
# Extern
Expand All @@ -32,15 +33,16 @@ cdef extern from 'cupy_jitify.h' namespace "jitify::detail" nogil:
cpp_str*) except +

const char* jitify_ver # set at build time
const char* cupy_cache_key # set at build time


# We need an internal way to invalidate the cache (say, when cuda_workaround.h
# or the CCCL bundle is updated) without having to set the environment variable
# We need an internal way to invalidate the cache (when the warmup_kernel below
# is updated) without having to set the environment variable
# CUPY_DISABLE_JITIFY_CACHE in the CI. This should never be touched by end
# users.
cdef extern from *:
"""
const int build_num = 1;
const int build_num = 2;
"""
const int build_num

Expand All @@ -55,64 +57,114 @@ def get_build_version():
return jitify_ver.decode()


cpdef str get_cuda_version():
# Read CUDART version from header if it exists, otherwise use NVRTC version
# as a proxy.
cdef str cuda_path = get_cuda_path()
cdef str cuda_ver = None

if cuda_path is not None:
try:
with open(
os.path.join(cuda_path,
'include/cuda_runtime_api.h')) as f:
hdr = f.read()
m = re.search(r'#define CUDART_VERSION\s+([0-9]*)', hdr)
if m:
cuda_ver = m.group(1)
except: # noqa:E722
pass

if cuda_ver is None:
# avoid circular dependency
from cupy.cuda.compiler import _get_nvrtc_version
major, minor = _get_nvrtc_version()
cuda_ver = f"{int(major) * 1000 + int(minor) * 10}"

return cuda_ver


# We cache headers found by Jitify. This is initialized with a few built-in
# JIT-safe headers, and expands as needed to help reduce compile time.
cdef cpp_map[cpp_str, cpp_str] cupy_headers
_jitify_cache_dir = None
_jitify_cache_versions = None

# Module-level constants
cdef bint _jitify_init = False
cdef str _jitify_cache_dir = None
cdef str _jitify_cache_versions = None

cpdef _add_sources(dict sources):

cpdef _add_sources(dict sources, bint is_str=False):
cdef str k, v
for hdr_name, hdr_source in sources.items():
cupy_headers[hdr_name] = hdr_source
if is_str:
k = hdr_name
v = hdr_source
cupy_headers[k.encode()] = v.encode()
else: # name/source are raw bytes
cupy_headers[hdr_name] = hdr_source


@atexit.register
def dump_cache():
# Set up a version guard for invalidating the cache. Right now,
# we use the build-time versions of CUB/Jitify.
# TODO(leofang): Parse CUB/Thrust/libcu++ versions at process-
# start time, for enabling CCCL + CuPy developers?
assert _jitify_cache_versions is not None
data = (_jitify_cache_versions, dict(cupy_headers))

cdef inline void dump_cache(cpp_map[cpp_str, cpp_str]& cupy_headers) except*:
# Ensure the directory exists
os.makedirs(_jitify_cache_dir, exist_ok=True)

# Construct a temporary Python dict for serialization
cdef dict data = {}
cdef bytes k, v
for it in cupy_headers:
k = it.first
v = it.second
data[k.decode()] = v.decode()

# Set up a temporary file; it must be under the cache directory so
# that atomic moves within the same filesystem can be guaranteed
with tempfile.NamedTemporaryFile(
dir=_jitify_cache_dir, delete=False) as f:
pickle.dump(data, f)
mode='w', dir=_jitify_cache_dir, delete=False) as f:
json.dump(data, f)
f_name = f.name

# atomic move with the destination guaranteed to be overwritten
os.replace(f_name, f'{_jitify_cache_dir}/jitify.pickle')
os.replace(f_name,
f'{_jitify_cache_dir}/jitify_{_jitify_cache_versions}.json')


# This kernel simply includes commonly used headers in CuPy's codebase
# to populate the Jitify cache. Need to bump build_num if updated.
cdef str warmup_kernel = r"""cupy_jitify_exercise
#include <cupy/cuda_workaround.h>
#include <cuda_fp16.h>
#include <type_traits>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cuda/barrier>
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
extern "C" __global__ void jitify_exercise() { }
"""


cdef inline void _init_cupy_headers_from_cache() except*:
global _jitify_cache_dir
_jitify_cache_dir = os.getenv(
'CUPY_CACHE_DIR', os.path.expanduser('~/.cupy/jitify_cache'))
global _jitify_cache_versions
versions = f"{get_build_version()}_{cub.get_build_version()}_{build_num}"
_jitify_cache_versions = versions
# If this function raises an exception, it would mean the cache is
# invalidated.
assert _jitify_cache_versions is not None

with open(f'{_jitify_cache_dir}/jitify.pickle', 'rb') as f:
data = pickle.load(f)
# Attempt to load from the disk/persistent cache
cdef dict data
with open(
f'{_jitify_cache_dir}/jitify_{_jitify_cache_versions}.json',
'r') as f:
data = json.load(f)

# Any failing sanity check here would mean the cache is invalidated.
assert isinstance(data, tuple)
assert len(data) == 2
cached_versions, cached_headers = data
assert isinstance(cached_versions, str)
assert isinstance(cached_headers, dict)
# Check the version guard for invalidating the cache (see the comment
# in the dump_cache() function).
assert cached_versions == versions
# Populate the cache (cupy_headers)
_add_sources(data, is_str=True)

# Populate the in-memory cache with the disk/persistent cache
_add_sources(cached_headers)
global _jitify_init
_jitify_init = True


cdef inline void _init_cupy_headers_from_scratch() except*:
Expand All @@ -131,8 +183,22 @@ cdef inline void _init_cupy_headers_from_scratch() except*:
# Same for tuple
cupy_headers[b"tuple"] = b"#include <cupy/cuda_workaround.h>\n"

# Compile a dummy kernel to further populate the cache (with bundled
# headers)
# need to defer import to avoid circular dependency
from cupy._core.core import assemble_cupy_compiler_options
cdef tuple options = ('-std=c++11', '-DCUB_DISABLE_BF16_SUPPORT',)
options = assemble_cupy_compiler_options(options)
jitify(warmup_kernel, options)

global _jitify_init
_jitify_init = True

cdef inline void init_cupy_headers() except*:
# Frozen the cache (to not mix in user-provided headers)
dump_cache(cupy_headers)


cdef inline void _init_cupy_headers() except*:
if int(os.getenv('CUPY_DISABLE_JITIFY_CACHE', '0')) == 0:
try:
_init_cupy_headers_from_cache()
Expand All @@ -143,15 +209,35 @@ cdef inline void init_cupy_headers() except*:
_init_cupy_headers_from_scratch()


init_cupy_headers()
cpdef void _init_module() except*:
if _jitify_init:
return

global _jitify_cache_dir
if _jitify_cache_dir is None:
_jitify_cache_dir = os.getenv(
'CUPY_CACHE_DIR', os.path.expanduser('~/.cupy/jitify_cache'))

# Set up a version guard for invalidating the cache. Right now, we use the
# build-time versions of Jitify/CUB (CCCL)/CuPy and run-time versions of
# CuPy and CUDA (cannot use cudaRuntimeGetVersion as we link statically to
# it, but we still need a proxy for CTK).
# TODO(leofang): Parse CUB/Thrust/libcu++ versions at process-
# start time, for enabling CCCL + CuPy developers?
global _jitify_cache_versions
if _jitify_cache_versions is None:
_jitify_cache_versions = (
f"{get_build_version()}_{cub.get_build_version()}_"
f"{get_cuda_version()}_{build_num}_{cupy_cache_key.decode()}")

_init_cupy_headers()


# Use Jitify's internal mechanism to search all included headers, and return
# the modified options and the header mapping (as two lists). This roughly
# follows the constructor of jitify::Program(). The found headers are cached
# to accelerate Jitify's search loop.
cpdef jitify(str code, tuple opt):

# input
cdef cpp_str cuda_source
cdef vector[cpp_str] headers
Expand Down
21 changes: 21 additions & 0 deletions install/cupy_builder/_context.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import argparse
import glob
import hashlib
import os
import sys
from typing import Any, List, Mapping, Optional, Tuple
Expand Down Expand Up @@ -53,6 +55,25 @@ def __init__(

self.features = cupy_builder.get_features(self)

# Calculate cache key for this build
print('Generating cache key from header files...')
include_pattern = os.path.join(
source_root, 'cupy', '_core', 'include', '**')
include_files = [
f for f in sorted(glob.glob(include_pattern, recursive=True))
if os.path.isfile(f)
]
hasher = hashlib.sha1(usedforsecurity=False)
for include_file in include_files:
with open(include_file, 'rb') as f:
hasher.update(include_file.encode())
hasher.update(f.read())
hasher.update(b'\x00')
cache_key = hasher.hexdigest()
print(f'Cache key ({len(include_files)} files '
f'matching {include_pattern}): {cache_key}')
self.cupy_cache_key = cache_key


def parse_args(argv: List[str]) -> Tuple[Any, List[str]]:
parser = argparse.ArgumentParser(add_help=False)
Expand Down
1 change: 1 addition & 0 deletions install/cupy_builder/cupy_setup_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ def make_extensions(ctx: Context, compiler, use_cython):
settings['define_macros'].append(('__HIP_PLATFORM_AMD__', '1'))
# deprecated since ROCm 4.2.0
settings['define_macros'].append(('__HIP_PLATFORM_HCC__', '1'))
settings['define_macros'].append(('CUPY_CACHE_KEY', ctx.cupy_cache_key))

available_modules = []
if no_cuda:
Expand Down
2 changes: 1 addition & 1 deletion install/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ disallow_untyped_defs = True
ignore_errors = False
ignore_missing_imports = True
no_implicit_optional = True
python_version = 3.8
python_version = 3.9
show_error_codes = True
strict_equality = True
strict_optional = True
Expand Down

0 comments on commit 27cafa0

Please sign in to comment.