Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/library/action.transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
//general swap kernel takes care of all ratio
OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
}

//std::cout << programCode << std::endl;
cl_int status = CL_SUCCESS;
cl_device_id Device = NULL;
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
Expand Down
16 changes: 8 additions & 8 deletions src/library/generator.transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1734,16 +1734,16 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
if (i + 256 < LDS_per_WG)
{
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i
<< ");" << std::endl;
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
<< "]);" << std::endl;
}
else
{
// need to handle boundary
clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i
<< ");" << std::endl;
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
<< "]);" << std::endl;
clKernWrite(transKernel, 6) << "}" << std::endl;
}
}
Expand Down Expand Up @@ -1848,17 +1848,17 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
{
//clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
//clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
<< ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
<< "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
<< "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
}
else
{
// need to handle boundary
clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
<< ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
<< "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
<< "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
clKernWrite(transKernel, 6) << "}" << std::endl;
}
}
Expand Down
17 changes: 9 additions & 8 deletions src/library/plan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,14 @@ static bool pow235(size_t num, size_t &pow2, size_t &pow3, size_t &pow5)
return true;
}

static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums, clfftPrecision precision)
static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums, clfftPrecision precision, size_t threshold)
{
/* a helper function to split a big 1D length into friendly 2D sizes for inplace transpose kernels
currently only radix 2, 3 and 5 are supported
the algorithm looks for ways to split the 1D length into 2D such that one of the dimensions is a multiple of the other dimension,
and this multiple is a radix of 2, 3 or 5.
each split dimension should be split further until it is no larger than the threshold (4096, or 2048 for double precision)
*/
size_t threshold = 4096;
if (precision == CLFFT_DOUBLE)
threshold = 2048;
if (num <= threshold)
return true;
if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0)
Expand Down Expand Up @@ -174,8 +171,8 @@ static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums,
splitVec.push_back(temp);
splitNums.push_back(splitVec);

status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision);
status = status && split1D_for_inplace(temp, splitNums, precision);
status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision, threshold);
status = status && split1D_for_inplace(temp, splitNums, precision, threshold);
return status;

}
Expand Down Expand Up @@ -794,13 +791,17 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if (fftPlan->length[0] == 354294)
clLengths[1] = 243;
*/
size_t threshold = 4096;
if (fftPlan->precision == CLFFT_DOUBLE)
threshold = 2048;
if (clfftGetRequestLibNoMemAlloc() &&
fftPlan->placeness == CLFFT_INPLACE &&
(fftPlan->inputLayout == fftPlan->outputLayout) )
(fftPlan->inputLayout == fftPlan->outputLayout)
&& fftPlan->length[0] > threshold)
{
//for inplace fft with inplace transpose, the split logic is different
vector<vector<size_t> > splitNums;
bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision);
bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision, threshold);
if (implemented)
clLengths[1] = splitNums[0][0];
}
Expand Down