From edf0746209c6985ba851a6a3a894a9e71c225ab9 Mon Sep 17 00:00:00 2001 From: Timmy Date: Thu, 26 May 2016 12:01:37 -0500 Subject: [PATCH 1/2] typo fix --- src/library/action.transpose.cpp | 2 +- src/library/generator.transpose.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/library/action.transpose.cpp b/src/library/action.transpose.cpp index aa7dc92e..874cf426 100644 --- a/src/library/action.transpose.cpp +++ b/src/library/action.transpose.cpp @@ -282,7 +282,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep //general swap kernel takes care of all ratio OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!")); } - + //std::cout << programCode << std::endl; cl_int status = CL_SUCCESS; cl_device_id Device = NULL; status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL); diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp index 5c3df10b..a04a43ae 100644 --- a/src/library/generator.transpose.cpp +++ b/src/library/generator.transpose.cpp @@ -1734,16 +1734,16 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig if (i + 256 < LDS_per_WG) { clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim - << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i - << ");" << std::endl; + << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i + << "]);" << std::endl; } else { // need to handle boundary clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl; clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim - << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i - << ");" << std::endl; + << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i + << "]);" << std::endl; clKernWrite(transKernel, 6) << "}" << std::endl; } } @@ -1848,17 +1848,17 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig { //clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl; //clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl; - clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim + clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim << ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim - << "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl; + << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl; } else { // need to handle boundary clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl; - clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim + clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim << ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim - << "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl; + << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl; clKernWrite(transKernel, 6) << "}" << std::endl; } } From dd116f5de98cb3d9760f9562a59ee0f8de0fca3d Mon Sep 17 00:00:00 2001 From: Timmy Date: Thu, 26 May 2016 13:22:23 -0500 Subject: [PATCH 2/2] do not split 1d if size > &splitNums, clfftPrecision precision) +static bool split1D_for_inplace(size_t num, vector > &splitNums, clfftPrecision precision, size_t threshold) { /* a helper function to split big 1D to friendly 2D sizes for inplace transpose kernels currently only radix 2, 3 and 5 are supported @@ -74,9 +74,6 @@ static bool split1D_for_inplace(size_t num, vector > &splitNums, And this mupliple is radix2, 3 or 5. each splited dimentsion should be further splited until that it is smaller than 4096 */ - size_t threshold = 4096; - if (precision == CLFFT_DOUBLE) - threshold = 2048; if (num <= threshold) return true; if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0) @@ -174,8 +171,8 @@ static bool split1D_for_inplace(size_t num, vector > &splitNums, splitVec.push_back(temp); splitNums.push_back(splitVec); - status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision); - status = status && split1D_for_inplace(temp, splitNums, precision); + status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision, threshold); + status = status && split1D_for_inplace(temp, splitNums, precision, threshold); return status; } @@ -794,13 +791,17 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma if (fftPlan->length[0] == 354294) clLengths[1] = 243; */ + size_t threshold = 4096; + if (fftPlan->precision == CLFFT_DOUBLE) + threshold = 2048; if (clfftGetRequestLibNoMemAlloc() && fftPlan->placeness == CLFFT_INPLACE && - (fftPlan->inputLayout == fftPlan->outputLayout) ) + (fftPlan->inputLayout == fftPlan->outputLayout) + && fftPlan->length[0] > threshold) { //for inplace fft with inplace transpose, the split logic is different vector > splitNums; - bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision); + bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision, threshold); if (implemented) clLengths[1] = splitNums[0][0]; }