Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROCm support #3462

Closed
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
set(FAISS_LANGUAGES CXX)

if(FAISS_ENABLE_GPU)
list(APPEND FAISS_LANGUAGES CUDA)
# if ROCm install detected, assume ROCm/HIP is GPU device
if (EXISTS /opt/rocm)
set(USE_ROCM TRUE)
list(APPEND FAISS_LANGUAGES HIP)
else()
list(APPEND FAISS_LANGUAGES CUDA)
endif()
endif()

if(FAISS_ENABLE_RAFT)
Expand Down Expand Up @@ -58,8 +64,13 @@ option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
option(FAISS_ENABLE_C_API "Build C API." OFF)

if(FAISS_ENABLE_GPU)
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
if(USE_ROCM)
enable_language(HIP)
add_definitions(-DUSE_ROCM)
else ()
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
endif()
endif()

if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
Expand All @@ -69,7 +80,11 @@ if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
add_subdirectory(faiss)

if(FAISS_ENABLE_GPU)
add_subdirectory(faiss/gpu)
if(USE_ROCM)
add_subdirectory(faiss/gpu-rocm)
else()
add_subdirectory(faiss/gpu)
endif()
endif()

if(FAISS_ENABLE_PYTHON)
Expand All @@ -90,6 +105,10 @@ if(BUILD_TESTING)
add_subdirectory(tests)

if(FAISS_ENABLE_GPU)
add_subdirectory(faiss/gpu/test)
if(USE_ROCM)
add_subdirectory(faiss/gpu-rocm/test)
else()
add_subdirectory(faiss/gpu/test)
endif()
endif()
endif()
6 changes: 5 additions & 1 deletion c_api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,9 @@ add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
target_link_libraries(example_c PRIVATE faiss_c)

if(FAISS_ENABLE_GPU)
add_subdirectory(gpu)
if(USE_ROCM)
add_subdirectory(gpu-rocm)
else ()
add_subdirectory(gpu)
endif()
endif()
6 changes: 6 additions & 0 deletions c_api/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@ target_sources(faiss_c PRIVATE
file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)

if (USE_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
else()
find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
endif()

add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
target_link_libraries(example_gpu_c PRIVATE faiss_c)
25 changes: 23 additions & 2 deletions faiss/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,16 @@ function(generate_ivf_interleaved_code)
"64|2048|8"
)

if(USE_ROCM)
set(CU_OR_HIP "hip")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you name it something like GPU_EXT_PREFIX?

else()
set(CU_OR_HIP "cu")
endif()

if (USE_ROCM)
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
endif()

# Traverse through the Cartesian product of X and Y
foreach(sub_codec ${SUB_CODEC_TYPE})
foreach(metric_type ${SUB_METRIC_TYPE})
Expand All @@ -210,10 +220,10 @@ function(generate_ivf_interleaved_code)
set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}")
# Remove illegal characters from filename
string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename})
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu")
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${CU_OR_HIP}")

# Read the template file
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content)
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.${CU_OR_HIP}" template_content)

# Replace the placeholders
string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}")
Expand Down Expand Up @@ -290,6 +300,10 @@ if(FAISS_ENABLE_RAFT)
target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
endif()

if (USE_ROCM)
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
endif()

# Export FAISS_GPU_HEADERS variable to parent scope.
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)

Expand All @@ -304,6 +318,12 @@ foreach(header ${FAISS_GPU_HEADERS})
)
endforeach()

if (USE_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
target_link_libraries(faiss_gpu PRIVATE hip::host roc::hipblas)
target_compile_options(faiss_gpu PRIVATE)
else()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. indentation
  2. Can we please move find_package statements to the top-level CMakeLists.txt guarded by USE_ROCM?
  3. For target_link_libraries, can we please use the condition syntax like we do for RAFT? $<$<BOOL:${USE_ROCM}>:hip::host>

# Prepares a host linker script and enables host linker to support
# very large device object files.
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
Expand All @@ -322,3 +342,4 @@ target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
endif()
2 changes: 1 addition & 1 deletion faiss/gpu/GpuFaissAssert.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
/// Assertions
///

#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
Expand Down
4 changes: 2 additions & 2 deletions faiss/gpu/StandardGpuResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,8 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {

// Our code is pre-built with and expects warpSize == 32, validate that
FAISS_ASSERT_FMT(
prop.warpSize == 32,
"Device id %d does not have expected warpSize of 32",
prop.warpSize == 32 || prop.warpSize == 64,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this ROCm specific? If so, can we allow 64 only for ROCm?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have both wavefront 32 (E.g. navi) and 64 (E.g. MI250) devices. So this offers support for both.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It sounds like Nvidia is 32 only and ROCm is 32 or 64. Should we lock it accordingly in code?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is desired, I could rework that assert using a ROCm flag to only allow a warpSize of 64 (and 32) on ROCm devices. It shouldn't be an issue at all!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think I would do that.

"Device id %d does not have expected warpSize of 32 or 64",
device);

// Create streams
Expand Down
212 changes: 212 additions & 0 deletions faiss/gpu/hipify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
#!/bin/bash

# go one level up from faiss/gpu
top=$(dirname "${BASH_SOURCE[0]}")/..
echo "top=$top"
cd $top
echo "pwd=`pwd`"

# create all destination directories for hipified files into sibling 'gpu-rocm' directory
for src in $(find ./gpu -type d)
do
dst=$(echo $src | sed 's/gpu/gpu-rocm/')
echo "Creating $dst"
mkdir -p $dst
done

# run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
# run all files in parallel to speed up
for ext in cu cuh h cpp
do
for src in $(find ./gpu -name "*.$ext")
do
dst=$(echo $src | sed 's@./gpu@./gpu-rocm@')
hipify-perl -o=$dst.tmp $src &
done
done
wait

# rename all hipified *.cu files to *.hip
for src in $(find ./gpu-rocm -name "*.cu.tmp")
do
dst=${src%.cu.tmp}.hip.tmp
mv $src $dst
done

# replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
# replace thrust::cuda::par with thrust::hip::par
# adjust header path location for hipblas.h to avoid unnecessary deprecation warnings
# adjust header path location for hiprand_kernel.h to avoid unnecessary deprecation warnings
for ext in hip cuh h cpp
do
for src in $(find ./gpu-rocm -name "*.$ext.tmp")
do
sed -i 's@#include <faiss/gpu/@#include <faiss/gpu-rocm/@' $src
sed -i 's@thrust::cuda::par@thrust::hip::par@' $src
sed -i 's@#include <hipblas.h>@#include <hipblas/hipblas.h>@' $src
sed -i 's@#include <hiprand_kernel.h>@#include <hiprand/hiprand_kernel.h>@' $src
done
done

# hipify was run in parallel above
# don't copy the tmp file if it is unchanged
for ext in hip cuh h cpp
do
for src in $(find ./gpu-rocm -name "*.$ext.tmp")
do
dst=${src%.tmp}
if test -f $dst
then
if diff -q $src $dst >& /dev/null
then
echo "$dst [unchanged]"
rm $src
else
echo "$dst"
mv $src $dst
fi
else
echo "$dst"
mv $src $dst
fi
done
done

# copy over CMakeLists.txt
for src in $(find ./gpu -name "CMakeLists.txt")
do
dst=$(echo $src | sed 's@./gpu@./gpu-rocm@')
if test -f $dst
then
if diff -q $src $dst >& /dev/null
then
echo "$dst [unchanged]"
else
echo "$dst"
cp $src $dst
fi
else
echo "$dst"
cp $src $dst
fi
done

# Copy over other files
for ext in py
do
for src in $(find ./gpu -name "*.$ext")
do
dst=$(echo $src | sed 's@./gpu@./gpu-rocm@')
if test -f $dst
then
if diff -q $src $dst >& /dev/null
then
echo "$dst [unchanged]"
else
echo "$dst"
cp $src $dst
fi
else
echo "$dst"
cp $src $dst
fi
done
done


###################################################################################
# C_API Support
###################################################################################

# Now get the c_api dir
# This points to the faiss/c_api dir
top_c_api=$(dirname "${BASH_SOURCE[0]}")/../../c_api
echo "top=$top_c_api"
cd ../$top_c_api
echo "pwd=`pwd`"


# create all destination directories for hipified files into sibling 'gpu-rocm' directory
for src in $(find ./gpu -type d)
do
dst=$(echo $src | sed 's/gpu/gpu-rocm/')
echo "Creating $dst"
mkdir -p $dst
done

# run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
# run all files in parallel to speed up
for ext in cu cuh h cpp c
do
for src in $(find ./gpu -name "*.$ext")
do
dst=$(echo $src | sed 's@./gpu@./gpu-rocm@')
hipify-perl -o=$dst.tmp $src &
done
done
wait

# rename all hipified *.cu files to *.hip
for src in $(find ./gpu-rocm -name "*.cu.tmp")
do
dst=${src%.cu.tmp}.hip.tmp
mv $src $dst
done

# replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
# replace thrust::cuda::par with thrust::hip::par
# adjust header path location for hipblas.h to avoid unnecessary deprecation warnings
# adjust header path location for hiprand_kernel.h to avoid unnecessary deprecation warnings
for ext in hip cuh h cpp c
do
for src in $(find ./gpu-rocm -name "*.$ext.tmp")
do
sed -i 's@#include <faiss/gpu/@#include <faiss/gpu-rocm/@' $src
sed -i 's@thrust::cuda::par@thrust::hip::par@' $src
sed -i 's@#include <hipblas.h>@#include <hipblas/hipblas.h>@' $src
sed -i 's@#include <hiprand_kernel.h>@#include <hiprand/hiprand_kernel.h>@' $src
done
done

# hipify was run in parallel above
# don't copy the tmp file if it is unchanged
for ext in hip cuh h cpp c
do
for src in $(find ./gpu-rocm -name "*.$ext.tmp")
do
dst=${src%.tmp}
if test -f $dst
then
if diff -q $src $dst >& /dev/null
then
echo "$dst [unchanged]"
rm $src
else
echo "$dst"
mv $src $dst
fi
else
echo "$dst"
mv $src $dst
fi
done
done

# copy over CMakeLists.txt
for src in $(find ./gpu -name "CMakeLists.txt")
do
dst=$(echo $src | sed 's@./gpu@./gpu-rocm@')
if test -f $dst
then
if diff -q $src $dst >& /dev/null
then
echo "$dst [unchanged]"
else
echo "$dst"
cp $src $dst
fi
else
echo "$dst"
cp $src $dst
fi
done
Loading
Loading