Skip to content

Commit

Permalink
Working on OpenCL kernel.
Browse files Browse the repository at this point in the history
  • Loading branch information
chadmv committed May 13, 2015
1 parent 990b7d8 commit 61a20b4
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 54 deletions.
2 changes: 1 addition & 1 deletion cgcmake
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Expand Up @@ -10,8 +10,9 @@ set(SOURCE_FILES
#"cvWrapRebindCmd.h"
"common.cpp"
"common.h"
"cvwrap.cl"
)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MAYA_CXX_FLAGS} /arch:AVX")
find_package(Maya REQUIRED)

include_directories(${MAYA_INCLUDE_DIR})
Expand Down
4 changes: 2 additions & 2 deletions src/common.cpp
Expand Up @@ -421,13 +421,13 @@ void GetValidUpAndNormal(const MDoubleArray& weights, const MPointArray& points,
// Adjust up if it's parallel to normal or if it's zero length
if (abs((unitUp * normal) - 1.0) < 0.001 || up.length() < 0.0001) {
for (unsigned int j = 0; j < weights.length()-1; ++j) {
up -= (points[sampleIds[j]] - origin) * weights[j];
unitUp = up.normal();
if (abs((unitUp * normal) - 1.0) > 0.001 && up.length() > 0.0001) {
// If the up and normal vectors are no longer parallel and the up vector has a length,
// then we are good to go.
break;
}
up -= (points[sampleIds[j]] - origin) * weights[j];
unitUp = up.normal();
}
up.normalize();
} else {
Expand Down
76 changes: 61 additions & 15 deletions src/cvWrapDeformer.cpp
Expand Up @@ -249,6 +249,7 @@ MStatus CVWrap::deform(MDataBlock& data, MItGeometry& itGeo, const MMatrix& loca
taskData_.resize(geomIndex+1);
}
TaskData& taskData = taskData_[geomIndex];
std::cerr << "deform\n";

// Get driver geo
MDataHandle hDriverGeo = data.inputValue(aDriverGeo, &status);
Expand Down Expand Up @@ -380,13 +381,12 @@ MThreadRetVal CVWrap::EvaluateWrap(void *pParam) {

MPoint origin;
MVector normal, up;
CalculateBasisComponents(sampleWeights[index], baryCoords[index], triangleVerts[i],
CalculateBasisComponents(sampleWeights[index], baryCoords[index], triangleVerts[index],
driverPoints, driverNormals, sampleIds[index], alignedStorage,
origin, up, normal);

CreateMatrix(origin, normal, up, matrix);
matrix = scaleMatrix * matrix;
MPoint dpt = points[i];
MMatrix tempMatrix = bindMatrices[index] * matrix;
MPoint newPt = ((points[i] * drivenMatrix) * (bindMatrices[index] * matrix)) * drivenInverseMatrix;
points[i] = points[i] + ((newPt - points[i]) * paintWeights[i] * env);
Expand Down Expand Up @@ -435,7 +435,6 @@ CVWrapGPU::~CVWrapGPU() {

bool CVWrapGPU::ValidateNode(MDataBlock& block, const MEvaluationNode& evaluationNode,
const MPlug& plug, MStringArray* messages) {
std::cerr << "CVWrapGPU::ValidateNode\n";
return true;
}

Expand All @@ -447,21 +446,30 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block,
const MAutoCLEvent inputEvent,
MAutoCLMem outputBuffer,
MAutoCLEvent& outputEvent) {
MStatus status;
// evaluate has two main pieces of work. I need to transfer any data I care about onto the GPU, and I need to run my OpenCL Kernel.
// First, transfer the data. offset has two pieces of data I need to transfer to the GPU, the weight array and the offset matrix.
// I don't need to transfer down the input position buffer, that is already handled by the deformer evaluator, the points are in inputBuffer.
numElements_ = numElements;
EnqueueBindData(block, evaluationNode, plug);
EnqueueDriverData(block, evaluationNode, plug);
EnqueuePaintMapData(block, evaluationNode, numElements, plug);
status = EnqueueBindData(block, evaluationNode, plug);
CHECK_MSTATUS(status);
status = EnqueueDriverData(block, evaluationNode, plug);
CHECK_MSTATUS(status);
status = EnqueuePaintMapData(block, evaluationNode, numElements, plug);
CHECK_MSTATUS(status);

// Now that all the data we care about is on the GPU, setup and run the OpenCL Kernel
if (!kernel_.get()) {
// Load the OpenCL kernel if we haven't yet.
MString openCLKernelFile(pluginLoadPath);
openCLKernelFile += "/cvwrap.cl";
kernel_ = MOpenCLInfo::getOpenCLKernel(openCLKernelFile, CVWrap::kName);
kernel_ = MOpenCLInfo::getOpenCLKernel(openCLKernelFile, "cvwrap");
if (kernel_.isNull()) {
std::cerr << "Could not compile kernel " << openCLKernelFile << "\n";
return MPxGPUDeformer::kDeformerFailure;
}
}
std::cerr << "Evaluate\n";

cl_int err = CL_SUCCESS;

Expand All @@ -472,11 +480,27 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block,
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)inputBuffer.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)fCLWeights.getReadOnlyRef());
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)driverPoints_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)driverNormals_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)paintWeights_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleCounts_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleOffsets_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleIds_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleWeights_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)triangleVerts_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)fOffsetMatrix.getReadOnlyRef());
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)baryCoords_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_uint), (void*)&fNumElements);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)bindMatrices_.getReadOnlyRef());
MOpenCLInfo::checkCLErrorStatus(err);
err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_uint), (void*)&numElements_);
MOpenCLInfo::checkCLErrorStatus(err);

// Figure out a good work group size for our kernel.
Expand All @@ -492,14 +516,16 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block,
MOpenCLInfo::checkCLErrorStatus(err);

size_t localWorkSize = 256;
if (retSize > 0) localWorkSize = workGroupSize;
size_t globalWorkSize = (localWorkSize - fNumElements % localWorkSize) + fNumElements; // global work size must be a multiple of localWorkSize
if (retSize > 0) {
localWorkSize = workGroupSize;
}
// global work size must be a multiple of localWorkSize
size_t globalWorkSize = (localWorkSize - numElements_ % localWorkSize) + numElements_;

// set up our input events. The input event could be NULL, in that case we need to pass
// slightly different parameters into clEnqueueNDRangeKernel
unsigned int numInputEvents = 0;
if (inputEvent.get())
{
if (inputEvent.get()) {
numInputEvents = 1;
}

Expand Down Expand Up @@ -556,13 +582,17 @@ MStatus CVWrapGPU::EnqueueBindData(MDataBlock& data, const MEvaluationNode& eval
// Store samples per vertex
arraySize = taskData.sampleIds.size();
int* samplesPerVertex = new int[arraySize];
int* sampleOffsets = new int[arraySize];
int totalSamples = 0;
for(size_t i = 0; i < taskData.sampleIds.size(); ++i) {
samplesPerVertex[i] = (int)taskData.sampleIds[i].length();
totalSamples += samplesPerVertex[i];
sampleOffsets[i] = totalSamples;
}
err = EnqueueBuffer(sampleCounts_, arraySize * sizeof(int), (void*)samplesPerVertex);
err = EnqueueBuffer(sampleOffsets_, arraySize * sizeof(int), (void*)sampleOffsets);
delete [] samplesPerVertex;
delete [] sampleOffsets;

// Store sampleIds and sampleWeights
int* sampleIds = new int[totalSamples];
Expand Down Expand Up @@ -596,7 +626,7 @@ MStatus CVWrapGPU::EnqueueBindData(MDataBlock& data, const MEvaluationNode& eval
err = EnqueueBuffer(baryCoords_, arraySize * sizeof(float), (void*)baryCoords);
delete [] triangleVerts;
delete [] baryCoords;

return MS::kSuccess;
}


Expand Down Expand Up @@ -680,5 +710,21 @@ MStatus CVWrapGPU::EnqueuePaintMapData(MDataBlock& data,
delete [] paintWeights;
return MS::kSuccess;
}


void CVWrapGPU::terminate() {
driverPoints_.reset();
driverNormals_.reset();
paintWeights_.reset();
bindMatrices_.reset();
sampleCounts_.reset();
sampleIds_.reset();
sampleWeights_.reset();
triangleVerts_.reset();
baryCoords_.reset();
MOpenCLInfo::releaseOpenCLKernel(kernel_);
kernel_.reset();
}

#endif

1 change: 1 addition & 0 deletions src/cvWrapDeformer.h
Expand Up @@ -121,6 +121,7 @@ class CVWrapGPU : public MPxGPUDeformer {
MAutoCLMem paintWeights_;
MAutoCLMem bindMatrices_;
MAutoCLMem sampleCounts_;
MAutoCLMem sampleOffsets_;
MAutoCLMem sampleIds_;
MAutoCLMem sampleWeights_;
MAutoCLMem triangleVerts_;
Expand Down
176 changes: 141 additions & 35 deletions src/cvwrap.cl
Expand Up @@ -2,43 +2,149 @@
offset kernels
*/

__kernel void offset(
__global float* finalPos, //float3
__global const float* initialPos, //float3
__global const float* weights,
__global const float4* matrices, //first matrix is offset matrix, second matrix is offset matrix inverse
__kernel void cvwrap(
__global float* finalPos, //float3
__global const float* initialPos, //float3
__global const float* driverPoints, //float3
__global const float* driverNormals, //float3
__global const float* paintWeights,
__global const int* sampleCounts,
__global const int* sampleOffsets,
__global const int* sampleIds,
__global const float* sampleWeights,
__global const int* triangleVerts, //int3
__global const float* baryCoords, //float3
__global const float4* bindMatrices,
const uint positionCount)
{
unsigned int positionId = get_global_id(0); // access finalPos and initialPos using this value
if (positionId >= positionCount ) return; // We create an execute unit for more indices then we have data for, just exit early if this guy if one of the extras
if (positionId >= positionCount) {
// We create an execute unit for more indices then we have data for, just exit early if this guy if one of the extras
return;
}
unsigned int positionOffset = positionId * 3; // Base positions are float3 when they come in here!
float4 initialPosition;
initialPosition.x = initialPos[positionOffset];
initialPosition.y = initialPos[positionOffset+1];
initialPosition.z = initialPos[positionOffset+2];
initialPosition.w = 1.0f;

float4 finalPosition;
finalPosition.x = 0.0f;
finalPosition.y = 0.0f;
finalPosition.z = 0.0f;
finalPosition.w = 1.0f;

__global const float4* matrixInverse = &(matrices[4]);
__global const float4* matrix = matrices;

// point *= matrix inverse
finalPosition.x = dot(initialPosition, matrixInverse[0]);
finalPosition.y = dot(initialPosition, matrixInverse[1]);
finalPosition.z = dot(initialPosition, matrixInverse[2]);

// pt.y += weight
finalPosition.y += weights[positionId];

// point *= matrix
// can't write back into finalPosition here, we need to use the same value to calculate xyz
// instead write into global memory
finalPos[positionOffset] = dot(finalPosition, matrix[0]);
finalPos[positionOffset+1] = dot(finalPosition, matrix[1]);
finalPos[positionOffset+2] = dot(finalPosition, matrix[2]);


// Start with the recreated point and normal using the barycentric coordinates of the hit point.
float baryA = baryCoords[positionOffset];
float baryB = baryCoords[positionOffset+1];
float baryC = baryCoords[positionOffset+2];
int triVertA = triangleVerts[positionOffset];
int triVertB = triangleVerts[positionOffset+1];
int triVertC = triangleVerts[positionOffset+2];
float hitPointX = driverPoints[triVertA] * baryA +
driverPoints[triVertB] * baryB +
driverPoints[triVertC] * baryC;
float hitPointY = driverPoints[triVertA+1] * baryA +
driverPoints[triVertB+1] * baryB +
driverPoints[triVertC+1] * baryC;
float hitPointZ = driverPoints[triVertA+2] * baryA +
driverPoints[triVertB+2] * baryB +
driverPoints[triVertC+2] * baryC;
float hitNormalX = driverNormals[triVertA] * baryA +
driverNormals[triVertB] * baryB +
driverNormals[triVertC] * baryC;
float hitNormalY = driverNormals[triVertA+1] * baryA +
driverNormals[triVertB+1] * baryB +
driverNormals[triVertC+1] * baryC;
float hitNormalZ = driverNormals[triVertA+2] * baryA +
driverNormals[triVertB+2] * baryB +
driverNormals[triVertC+2] * baryC;

// Create the barycentric point and normal.
// int hitIndex = sampleOffsets[positionId] + sampleCounts[positionId] - 1;
// float hitWeight = sampleWeights[hitIndex];
// float originX = hitPointX * hitWeight;
// float originY = hitPointY * hitWeight;
// float originZ = hitPointZ * hitWeight;
// float normalX = hitNormalX * hitWeight;
// float normalY = hitNormalY * hitWeight;
// float normalZ = hitNormalZ * hitWeight;

// // Then use the weighted adjacent data.
// for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) {
// float sw = sampleWeights[j];
// originX += driverPoints[sampleIds[j]*3] * sw;
// originY += driverPoints[sampleIds[j]*3+1] * sw;
// originZ += driverPoints[sampleIds[j]*3+2] * sw;

// normalX += driverNormals[sampleIds[j]*3] * sw;
// normalY += driverNormals[sampleIds[j]*3+1] * sw;
// normalZ += driverNormals[sampleIds[j]*3+2] * sw;
// }

// // Calculate the up vector
// float upX = (hitPointX - originX) * hitWeight;
// float upY = (hitPointY - originY) * hitWeight;
// float upZ = (hitPointZ - originZ) * hitWeight;
// for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) {
// float sw = sampleWeights[j];
// upX += (driverPoints[sampleIds[j]*3] - originX) * sw;
// upY += (driverPoints[sampleIds[j]*3+1] - originY) * sw;
// upZ += (driverPoints[sampleIds[j]*3+2] - originZ) * sw;
// }

// // Use float3 so we can use the built-in functions. We are mostly using single floats
// // because the preferred vector width of most gpu's these days is 1.
// float3 up = (float3)(upX, upY, upZ);
// float3 normal = (float3)(normalX, normalY, normalZ);
// float3 unitUp = fast_normalize(up);
// float upLength = fast_length(up);
// if (fabs(dot(unitUp, normal) - 1.0f) > 0.001f && upLength > 0.0001f) {
// for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) {
// up.x -= (driverPoints[sampleIds[j]*3] - originX) * sampleWeights[j];
// up.y -= (driverPoints[sampleIds[j]*3+1] - originY) * sampleWeights[j];
// up.z -= (driverPoints[sampleIds[j]*3+2] - originZ) * sampleWeights[j];
// unitUp = fast_normalize(up);
// upLength = fast_length(up);
// if (fabs(dot(unitUp, normal) - 1.0f) > 0.001f && upLength > 0.0001f) {
// // If the up and normal vectors are no longer parallel and the up vector has a length,
// // then we are good to go.
// break;
// }
// }
// up = fast_normalize(up);
// } else {
// up = unitUp;
// }

// // Create the transform matrix
// // Store by columns so we can use dot to multiply with the bind matrix
// float3 x = cross(normal, up);
// float3 z = cross(normal, x);
// float4 matrix0 = (float4)(x.x, normal.x, z.x, originX);
// float4 matrix1 = (float4)(x.y, normal.y, z.y, originY);
// float4 matrix2 = (float4)(x.z, normal.z, z.z, originZ);
// float4 matrix3 = (float4)(0.0f, 0.0f, 0.0f, 1.0f);

// // TODO: scale matrix mult

// // Multiply bindMatrix with matrix
//__global const float4* bindMatrix = &(bindMatrices[positionId*4]);
// float4 bindMatrix0 = bindMatrix[0];
// float4 bindMatrix1 = bindMatrix[1];
// float4 bindMatrix2 = bindMatrix[2];
// float4 m0 = (float4)(dot(bindMatrix0, matrix0),
// dot(bindMatrix0, matrix1),
// dot(bindMatrix0, matrix2),
// dot(bindMatrix0, matrix3));
// float4 m1 = (float4)(dot(bindMatrix1, matrix0),
// dot(bindMatrix1, matrix1),
// dot(bindMatrix1, matrix2),
// dot(bindMatrix1, matrix3));
// float4 m2 = (float4)(dot(bindMatrix2, matrix0),
// dot(bindMatrix2, matrix1),
// dot(bindMatrix2, matrix2),
// dot(bindMatrix2, matrix3));

float4 initialPosition = (float4)(initialPos[positionOffset],
initialPos[positionOffset+1],
initialPos[positionOffset+2],
1.0f);
/*finalPos[positionOffset] = dot(initialPosition, m0);
finalPos[positionOffset+1] = dot(initialPosition, m1);
finalPos[positionOffset+2] = dot(initialPosition, m2);*/
finalPos[positionOffset] = hitPointX;
finalPos[positionOffset+1] = hitPointY;
finalPos[positionOffset+2] = hitPointZ;
}

0 comments on commit 61a20b4

Please sign in to comment.