Skip to content

Commit

Permalink
enable btile prefetch, got 250Tflops (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
taozha2 committed Apr 19, 2024
1 parent ca68c3c commit a538058
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 39 deletions.
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ export LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/:${sycl_compiler_path}/l
export IGC_ShaderDumpEnable=1
export IGC_DumpToCustomDir=./mm_dumps_prefetch_coop
#export IGC_VATemp=1
cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CXX_FLAGS=" -DITEM_SIZE_X=4 -DITEM_SIZE_Y=32 -DSG_SIZE_X=64 -DSG_SIZE_Y=ITEM_SIZE_Y -DWG_SIZE_X=256 -DWG_SIZE_Y=256 -DKK=1 -DPREFETCH_DEFAULT -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target
cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CXX_FLAGS=" -DITEM_SIZE_X=4 -DITEM_SIZE_Y=32 -DSG_SIZE_X=64 -DSG_SIZE_Y=ITEM_SIZE_Y -DWG_SIZE_X=256 -DWG_SIZE_Y=256 -DKK=2 -DPREFETCH_DEFAULT -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target

4 changes: 0 additions & 4 deletions examples/cute/tutorial/pvc_sycl/pvc_prefetch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@
#define HELPER_NAMEX(PREFIX, MM, NN) PREFIX##_m##MM##_n##NN
#define HELPER_NAME(PREFIX, MM, NN) HELPER_NAMEX(PREFIX, MM, NN)

#if !defined(PREFETCH_DISTANCE)
#define PREFETCH_DISTANCE 1
#endif

void HELPER_NAME(atile_prefetch_rowmajor, MM,
NN)(global ushort *A, int tM, int K, int m, int prefetch_k) {
for (int kk = 0; kk < KK; kk += 2) {
Expand Down
71 changes: 37 additions & 34 deletions examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ size_t matrixSize = 4096;

#define WARMUP_ITERATIONS 10

#if !defined(PREFETCH_DISTANCE)
#define PREFETCH_DISTANCE 1
#endif

std::string makeTestName(const std::string &func, int tM, int tN, int tK,
int MM, int NN, size_t M, size_t N, size_t K) {
std::ostringstream ret;
Expand All @@ -48,7 +52,7 @@ std::string makeTestName(const std::string &func, int tM, int tN, int tK,
}

template <typename T>
static void fill_matrix(T* M, size_t numRows, size_t numCols) {
static void fill_matrix(T *M, size_t numRows, size_t numCols) {
if (identityData) {
for (size_t r = 0; r < numRows; r++) {
for (size_t c = 0; c < numCols; c++) {
Expand All @@ -73,10 +77,9 @@ static void fill_matrix(T* M, size_t numRows, size_t numCols) {
}
}


template <typename T>
static void vnni_matrix(T* dst, const T* src,
size_t numRows, size_t numCols, size_t factor) {
static void vnni_matrix(T *dst, const T *src, size_t numRows, size_t numCols,
size_t factor) {
for (size_t r = 0; r < numRows / factor; r++) {
for (size_t c = 0; c < numCols; c++) {
for (size_t k = 0; k < factor; k++) {
Expand All @@ -88,7 +91,7 @@ static void vnni_matrix(T* dst, const T* src,
}

template <typename DstT, typename SrcT>
static void compute_reference(DstT* C, SrcT* A, SrcT* B, size_t M, size_t N,
static void compute_reference(DstT *C, SrcT *A, SrcT *B, size_t M, size_t N,
size_t K) {
for (size_t m = 0; m < M; m++) {
for (size_t n = 0; n < N; n++) {
Expand All @@ -103,8 +106,7 @@ static void compute_reference(DstT* C, SrcT* A, SrcT* B, size_t M, size_t N,
}

template <typename T>
void check_results(size_t M, size_t N, const T* C,
const T* C_ref) {
void check_results(size_t M, size_t N, const T *C, const T *C_ref) {
float err = 0.f;
size_t error_cnt = 0;
for (size_t m = 0; m < M; m++) {
Expand Down Expand Up @@ -142,11 +144,9 @@ inline size_t time_event(sycl::event &e) {
}

template <int tM, int tN, int tK, int MM, int NN>
static void
go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc* c_vec,
dtype_a* a, dtype_b* b,
size_t M, size_t N, size_t K,
dtype_acc* C_ref) {
static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
dtype_a *a, dtype_b *b, size_t M,
size_t N, size_t K, dtype_acc *C_ref) {
printf("%80s: ",
makeTestName(__FUNCTION__, tM, tN, tK, MM, NN, M, N, K).c_str());
fflush(stdout);
Expand Down Expand Up @@ -221,8 +221,8 @@ go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc* c_vec,
#ifdef PREFETCH_DEFAULT
for (int p = 0; p < PREFETCH_DISTANCE; p++) {
#ifdef B_VNNI
// HELPER_NAME(btile_block_prefetch_vnni, 4, 4)
//((ushort *)B, tN, K, N, prefetch_k, n);
HELPER_NAME(btile_block_prefetch_vnni, 4, 4)
((ushort *)B, tN, K, N, prefetch_k, n);
#else
HELPER_NAME(btile_block_prefetch_rowmajor, 4, 4)
((ushort *)B, tN, K, N, prefetch_k, n);
Expand All @@ -236,25 +236,24 @@ go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc* c_vec,
for (int k = 0; k < K; k += tK * KK) {
for (int kk = 0; kk < KK; kk++) {
copy(A_copy, tAi(_, _, k + kk * tK), tAr(_, _, kk));
copy(B_copy, tBi(_, (k + kk * tK)/ 2, _), tBr(_, _, kk));
copy(B_copy, tBi(_, (k + kk * tK) / 2, _), tBr(_, _, kk));
}


#ifdef PREFETCH_DEFAULT
if (k % ((PREFETCH_DISTANCE)*tK) == 0) {
for (int p = 0; p < PREFETCH_DISTANCE; p++) {
// if (k % ((PREFETCH_DISTANCE)*tK) == 0) {
for (int p = 0; p < PREFETCH_DISTANCE; p++) {
#ifdef B_VNNI
// HELPER_NAME(btile_block_prefetch_vnni, 4, 4)
// ((ushort *)B, tN, K, N, prefetch_k, n);
HELPER_NAME(btile_block_prefetch_vnni, 4, 4)
((ushort *)B, tN, K, N, prefetch_k, n);
#else
HELPER_NAME(btile_block_prefetch_rowmajor, 4, 4)
((ushort *)B, tN, K, N, prefetch_k, n);
HELPER_NAME(btile_block_prefetch_rowmajor, 4, 4)
((ushort *)B, tN, K, N, prefetch_k, n);
#endif
HELPER_NAME(atile_block_prefetch_rowmajor, 4, 4)
((ushort *)A, tM, M, K, m, prefetch_k);
prefetch_k += tK * KK;
}
HELPER_NAME(atile_block_prefetch_rowmajor, 4, 4)
((ushort *)A, tM, M, K, m, prefetch_k);
prefetch_k += tK * KK;
}
// }
#endif
gemm(tiled_mma, tAr, tBr, tCr);
}
Expand Down Expand Up @@ -299,11 +298,16 @@ int main(int argc, char **argv) {
const auto N = matrixSize;
const auto K = matrixSize;

dtype_a* A_vec = (dtype_a*)syclcompat::malloc_shared(sizeof(dtype_a) * M * K);
dtype_b* B_vec = (dtype_b*)syclcompat::malloc_shared(sizeof(dtype_b) * N * K);
dtype_b* Bvnni_vec = (dtype_b*)syclcompat::malloc_shared(sizeof(dtype_b) * N * K);
dtype_acc* C_vec = (dtype_acc*)syclcompat::malloc_shared(sizeof(dtype_acc) * M * N);
dtype_acc* C_ref = (dtype_acc*)syclcompat::malloc_shared(sizeof(dtype_acc) * M * N);
dtype_a *A_vec =
(dtype_a *)syclcompat::malloc_shared(sizeof(dtype_a) * M * K);
dtype_b *B_vec =
(dtype_b *)syclcompat::malloc_shared(sizeof(dtype_b) * N * K);
dtype_b *Bvnni_vec =
(dtype_b *)syclcompat::malloc_shared(sizeof(dtype_b) * N * K);
dtype_acc *C_vec =
(dtype_acc *)syclcompat::malloc_shared(sizeof(dtype_acc) * M * N);
dtype_acc *C_ref =
(dtype_acc *)syclcompat::malloc_shared(sizeof(dtype_acc) * M * N);

printf("Initializing source matrices...\n");
fill_matrix(A_vec, M, K);
Expand Down Expand Up @@ -332,9 +336,8 @@ int main(int argc, char **argv) {
if (validate) {
printf("Computing reference...\n");
get_gemm_gold<dtype_a, dtype_b, dtype_acc>(
M, N, K, mem_layout::row_major, mem_layout::row_major,
(dtype_a *)A_vec, (dtype_b *)B_vec,
(dtype_acc *)C_ref);
M, N, K, mem_layout::row_major, mem_layout::row_major, (dtype_a *)A_vec,
(dtype_b *)B_vec, (dtype_acc *)C_ref);
// compute_reference(C_ref, A_vec, B_vec, M, N, K);
}

Expand Down

0 comments on commit a538058

Please sign in to comment.