diff --git a/cgcmake b/cgcmake index fb1fd4d..b673fc3 160000 --- a/cgcmake +++ b/cgcmake @@ -1 +1 @@ -Subproject commit fb1fd4da8f1612bde6594ae899a36ac685605c1e +Subproject commit b673fc30af39b9bf1d5a96237fd000df069449c2 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9110ea3..2578032 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,8 +10,9 @@ set(SOURCE_FILES #"cvWrapRebindCmd.h" "common.cpp" "common.h" + "cvwrap.cl" ) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MAYA_CXX_FLAGS} /arch:AVX") find_package(Maya REQUIRED) include_directories(${MAYA_INCLUDE_DIR}) diff --git a/src/common.cpp b/src/common.cpp index 040f11e..d20987d 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -421,13 +421,13 @@ void GetValidUpAndNormal(const MDoubleArray& weights, const MPointArray& points, // Adjust up if it's parallel to normal or if it's zero length if (abs((unitUp * normal) - 1.0) < 0.001 || up.length() < 0.0001) { for (unsigned int j = 0; j < weights.length()-1; ++j) { + up -= (points[sampleIds[j]] - origin) * weights[j]; + unitUp = up.normal(); if (abs((unitUp * normal) - 1.0) > 0.001 && up.length() > 0.0001) { // If the up and normal vectors are no longer parallel and the up vector has a length, // then we are good to go. break; } - up -= (points[sampleIds[j]] - origin) * weights[j]; - unitUp = up.normal(); } up.normalize(); } else { diff --git a/src/cvWrapDeformer.cpp b/src/cvWrapDeformer.cpp index e56f7dc..5fd39d2 100644 --- a/src/cvWrapDeformer.cpp +++ b/src/cvWrapDeformer.cpp @@ -249,6 +249,7 @@ MStatus CVWrap::deform(MDataBlock& data, MItGeometry& itGeo, const MMatrix& loca taskData_.resize(geomIndex+1); } TaskData& taskData = taskData_[geomIndex]; + std::cerr << "deform\n"; // Get driver geo MDataHandle hDriverGeo = data.inputValue(aDriverGeo, &status); @@ -380,13 +381,12 @@ MThreadRetVal CVWrap::EvaluateWrap(void *pParam) { MPoint origin; MVector normal, up; - CalculateBasisComponents(sampleWeights[index], baryCoords[index], triangleVerts[i], + CalculateBasisComponents(sampleWeights[index], baryCoords[index], triangleVerts[index], driverPoints, driverNormals, sampleIds[index], alignedStorage, origin, up, normal); CreateMatrix(origin, normal, up, matrix); matrix = scaleMatrix * matrix; - MPoint dpt = points[i]; MMatrix tempMatrix = bindMatrices[index] * matrix; MPoint newPt = ((points[i] * drivenMatrix) * (bindMatrices[index] * matrix)) * drivenInverseMatrix; points[i] = points[i] + ((newPt - points[i]) * paintWeights[i] * env); @@ -435,7 +435,6 @@ CVWrapGPU::~CVWrapGPU() { bool CVWrapGPU::ValidateNode(MDataBlock& block, const MEvaluationNode& evaluationNode, const MPlug& plug, MStringArray* messages) { - std::cerr << "CVWrapGPU::ValidateNode\n"; return true; } @@ -447,21 +446,30 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block, const MAutoCLEvent inputEvent, MAutoCLMem outputBuffer, MAutoCLEvent& outputEvent) { + MStatus status; // evaluate has two main pieces of work. I need to transfer any data I care about onto the GPU, and I need to run my OpenCL Kernel. // First, transfer the data. offset has two pieces of data I need to transfer to the GPU, the weight array and the offset matrix. // I don't need to transfer down the input position buffer, that is already handled by the deformer evaluator, the points are in inputBuffer. numElements_ = numElements; - EnqueueBindData(block, evaluationNode, plug); - EnqueueDriverData(block, evaluationNode, plug); - EnqueuePaintMapData(block, evaluationNode, numElements, plug); + status = EnqueueBindData(block, evaluationNode, plug); + CHECK_MSTATUS(status); + status = EnqueueDriverData(block, evaluationNode, plug); + CHECK_MSTATUS(status); + status = EnqueuePaintMapData(block, evaluationNode, numElements, plug); + CHECK_MSTATUS(status); // Now that all the data we care about is on the GPU, setup and run the OpenCL Kernel if (!kernel_.get()) { // Load the OpenCL kernel if we haven't yet. MString openCLKernelFile(pluginLoadPath); openCLKernelFile += "/cvwrap.cl"; - kernel_ = MOpenCLInfo::getOpenCLKernel(openCLKernelFile, CVWrap::kName); + kernel_ = MOpenCLInfo::getOpenCLKernel(openCLKernelFile, "cvwrap"); + if (kernel_.isNull()) { + std::cerr << "Could not compile kernel " << openCLKernelFile << "\n"; + return MPxGPUDeformer::kDeformerFailure; + } } + std::cerr << "Evaluate\n"; cl_int err = CL_SUCCESS; @@ -472,11 +480,27 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block, MOpenCLInfo::checkCLErrorStatus(err); err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)inputBuffer.getReadOnlyRef()); MOpenCLInfo::checkCLErrorStatus(err); - err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)fCLWeights.getReadOnlyRef()); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)driverPoints_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)driverNormals_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)paintWeights_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleCounts_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleOffsets_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleIds_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleWeights_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)triangleVerts_.getReadOnlyRef()); MOpenCLInfo::checkCLErrorStatus(err); - err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)fOffsetMatrix.getReadOnlyRef()); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)baryCoords_.getReadOnlyRef()); MOpenCLInfo::checkCLErrorStatus(err); - err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_uint), (void*)&fNumElements); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)bindMatrices_.getReadOnlyRef()); + MOpenCLInfo::checkCLErrorStatus(err); + err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_uint), (void*)&numElements_); MOpenCLInfo::checkCLErrorStatus(err); // Figure out a good work group size for our kernel. @@ -492,14 +516,16 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block, MOpenCLInfo::checkCLErrorStatus(err); size_t localWorkSize = 256; - if (retSize > 0) localWorkSize = workGroupSize; - size_t globalWorkSize = (localWorkSize - fNumElements % localWorkSize) + fNumElements; // global work size must be a multiple of localWorkSize + if (retSize > 0) { + localWorkSize = workGroupSize; + } + // global work size must be a multiple of localWorkSize + size_t globalWorkSize = (localWorkSize - numElements_ % localWorkSize) + numElements_; // set up our input events. The input event could be NULL, in that case we need to pass // slightly different parameters into clEnqueueNDRangeKernel unsigned int numInputEvents = 0; - if (inputEvent.get()) - { + if (inputEvent.get()) { numInputEvents = 1; } @@ -556,13 +582,17 @@ MStatus CVWrapGPU::EnqueueBindData(MDataBlock& data, const MEvaluationNode& eval // Store samples per vertex arraySize = taskData.sampleIds.size(); int* samplesPerVertex = new int[arraySize]; + int* sampleOffsets = new int[arraySize]; int totalSamples = 0; for(size_t i = 0; i < taskData.sampleIds.size(); ++i) { samplesPerVertex[i] = (int)taskData.sampleIds[i].length(); totalSamples += samplesPerVertex[i]; + sampleOffsets[i] = totalSamples; } err = EnqueueBuffer(sampleCounts_, arraySize * sizeof(int), (void*)samplesPerVertex); + err = EnqueueBuffer(sampleOffsets_, arraySize * sizeof(int), (void*)sampleOffsets); delete [] samplesPerVertex; + delete [] sampleOffsets; // Store sampleIds and sampleWeights int* sampleIds = new int[totalSamples]; @@ -596,7 +626,7 @@ MStatus CVWrapGPU::EnqueueBindData(MDataBlock& data, const MEvaluationNode& eval err = EnqueueBuffer(baryCoords_, arraySize * sizeof(float), (void*)baryCoords); delete [] triangleVerts; delete [] baryCoords; - + return MS::kSuccess; } @@ -680,5 +710,21 @@ MStatus CVWrapGPU::EnqueuePaintMapData(MDataBlock& data, delete [] paintWeights; return MS::kSuccess; } + + +void CVWrapGPU::terminate() { + driverPoints_.reset(); + driverNormals_.reset(); + paintWeights_.reset(); + bindMatrices_.reset(); + sampleCounts_.reset(); + sampleIds_.reset(); + sampleWeights_.reset(); + triangleVerts_.reset(); + baryCoords_.reset(); + MOpenCLInfo::releaseOpenCLKernel(kernel_); + kernel_.reset(); +} + #endif diff --git a/src/cvWrapDeformer.h b/src/cvWrapDeformer.h index 6c74e7a..0168463 100644 --- a/src/cvWrapDeformer.h +++ b/src/cvWrapDeformer.h @@ -121,6 +121,7 @@ class CVWrapGPU : public MPxGPUDeformer { MAutoCLMem paintWeights_; MAutoCLMem bindMatrices_; MAutoCLMem sampleCounts_; + MAutoCLMem sampleOffsets_; MAutoCLMem sampleIds_; MAutoCLMem sampleWeights_; MAutoCLMem triangleVerts_; diff --git a/src/cvwrap.cl b/src/cvwrap.cl index f095f2c..3c3bd06 100644 --- a/src/cvwrap.cl +++ b/src/cvwrap.cl @@ -2,43 +2,149 @@ offset kernels */ -__kernel void offset( - __global float* finalPos, //float3 - __global const float* initialPos, //float3 - __global const float* weights, - __global const float4* matrices, //first matrix is offset matrix, second matrix is offset matrix inverse +__kernel void cvwrap( + __global float* finalPos, //float3 + __global const float* initialPos, //float3 + __global const float* driverPoints, //float3 + __global const float* driverNormals, //float3 + __global const float* paintWeights, + __global const int* sampleCounts, + __global const int* sampleOffsets, + __global const int* sampleIds, + __global const float* sampleWeights, + __global const int* triangleVerts, //int3 + __global const float* baryCoords, //float3 + __global const float4* bindMatrices, const uint positionCount) { unsigned int positionId = get_global_id(0); // access finalPos and initialPos using this value - if (positionId >= positionCount ) return; // We create an execute unit for more indices then we have data for, just exit early if this guy if one of the extras + if (positionId >= positionCount) { + // We create an execute unit for more indices then we have data for, just exit early if this guy if one of the extras + return; + } unsigned int positionOffset = positionId * 3; // Base positions are float3 when they come in here! - float4 initialPosition; - initialPosition.x = initialPos[positionOffset]; - initialPosition.y = initialPos[positionOffset+1]; - initialPosition.z = initialPos[positionOffset+2]; - initialPosition.w = 1.0f; - - float4 finalPosition; - finalPosition.x = 0.0f; - finalPosition.y = 0.0f; - finalPosition.z = 0.0f; - finalPosition.w = 1.0f; - - __global const float4* matrixInverse = &(matrices[4]); - __global const float4* matrix = matrices; - - // point *= matrix inverse - finalPosition.x = dot(initialPosition, matrixInverse[0]); - finalPosition.y = dot(initialPosition, matrixInverse[1]); - finalPosition.z = dot(initialPosition, matrixInverse[2]); - - // pt.y += weight - finalPosition.y += weights[positionId]; - - // point *= matrix - // can't write back into finalPosition here, we need to use the same value to calculate xyz - // instead write into global memory - finalPos[positionOffset] = dot(finalPosition, matrix[0]); - finalPos[positionOffset+1] = dot(finalPosition, matrix[1]); - finalPos[positionOffset+2] = dot(finalPosition, matrix[2]); + + + // Start with the recreated point and normal using the barycentric coordinates of the hit point. + float baryA = baryCoords[positionOffset]; + float baryB = baryCoords[positionOffset+1]; + float baryC = baryCoords[positionOffset+2]; + int triVertA = triangleVerts[positionOffset]; + int triVertB = triangleVerts[positionOffset+1]; + int triVertC = triangleVerts[positionOffset+2]; + float hitPointX = driverPoints[triVertA] * baryA + + driverPoints[triVertB] * baryB + + driverPoints[triVertC] * baryC; + float hitPointY = driverPoints[triVertA+1] * baryA + + driverPoints[triVertB+1] * baryB + + driverPoints[triVertC+1] * baryC; + float hitPointZ = driverPoints[triVertA+2] * baryA + + driverPoints[triVertB+2] * baryB + + driverPoints[triVertC+2] * baryC; + float hitNormalX = driverNormals[triVertA] * baryA + + driverNormals[triVertB] * baryB + + driverNormals[triVertC] * baryC; + float hitNormalY = driverNormals[triVertA+1] * baryA + + driverNormals[triVertB+1] * baryB + + driverNormals[triVertC+1] * baryC; + float hitNormalZ = driverNormals[triVertA+2] * baryA + + driverNormals[triVertB+2] * baryB + + driverNormals[triVertC+2] * baryC; + + // Create the barycentric point and normal. + // int hitIndex = sampleOffsets[positionId] + sampleCounts[positionId] - 1; + // float hitWeight = sampleWeights[hitIndex]; + // float originX = hitPointX * hitWeight; + // float originY = hitPointY * hitWeight; + // float originZ = hitPointZ * hitWeight; + // float normalX = hitNormalX * hitWeight; + // float normalY = hitNormalY * hitWeight; + // float normalZ = hitNormalZ * hitWeight; + + // // Then use the weighted adjacent data. + // for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) { + // float sw = sampleWeights[j]; + // originX += driverPoints[sampleIds[j]*3] * sw; + // originY += driverPoints[sampleIds[j]*3+1] * sw; + // originZ += driverPoints[sampleIds[j]*3+2] * sw; + + // normalX += driverNormals[sampleIds[j]*3] * sw; + // normalY += driverNormals[sampleIds[j]*3+1] * sw; + // normalZ += driverNormals[sampleIds[j]*3+2] * sw; + // } + + // // Calculate the up vector + // float upX = (hitPointX - originX) * hitWeight; + // float upY = (hitPointY - originY) * hitWeight; + // float upZ = (hitPointZ - originZ) * hitWeight; + // for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) { + // float sw = sampleWeights[j]; + // upX += (driverPoints[sampleIds[j]*3] - originX) * sw; + // upY += (driverPoints[sampleIds[j]*3+1] - originY) * sw; + // upZ += (driverPoints[sampleIds[j]*3+2] - originZ) * sw; + // } + + // // Use float3 so we can use the built-in functions. We are mostly using single floats + // // because the preferred vector width of most gpu's these days is 1. + // float3 up = (float3)(upX, upY, upZ); + // float3 normal = (float3)(normalX, normalY, normalZ); + // float3 unitUp = fast_normalize(up); + // float upLength = fast_length(up); + // if (fabs(dot(unitUp, normal) - 1.0f) > 0.001f && upLength > 0.0001f) { + // for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) { + // up.x -= (driverPoints[sampleIds[j]*3] - originX) * sampleWeights[j]; + // up.y -= (driverPoints[sampleIds[j]*3+1] - originY) * sampleWeights[j]; + // up.z -= (driverPoints[sampleIds[j]*3+2] - originZ) * sampleWeights[j]; + // unitUp = fast_normalize(up); + // upLength = fast_length(up); + // if (fabs(dot(unitUp, normal) - 1.0f) > 0.001f && upLength > 0.0001f) { + // // If the up and normal vectors are no longer parallel and the up vector has a length, + // // then we are good to go. + // break; + // } + // } + // up = fast_normalize(up); + // } else { + // up = unitUp; + // } + + // // Create the transform matrix + // // Store by columns so we can use dot to multiply with the bind matrix + // float3 x = cross(normal, up); + // float3 z = cross(normal, x); + // float4 matrix0 = (float4)(x.x, normal.x, z.x, originX); + // float4 matrix1 = (float4)(x.y, normal.y, z.y, originY); + // float4 matrix2 = (float4)(x.z, normal.z, z.z, originZ); + // float4 matrix3 = (float4)(0.0f, 0.0f, 0.0f, 1.0f); + + // // TODO: scale matrix mult + + // // Multiply bindMatrix with matrix + //__global const float4* bindMatrix = &(bindMatrices[positionId*4]); + // float4 bindMatrix0 = bindMatrix[0]; + // float4 bindMatrix1 = bindMatrix[1]; + // float4 bindMatrix2 = bindMatrix[2]; + // float4 m0 = (float4)(dot(bindMatrix0, matrix0), + // dot(bindMatrix0, matrix1), + // dot(bindMatrix0, matrix2), + // dot(bindMatrix0, matrix3)); + // float4 m1 = (float4)(dot(bindMatrix1, matrix0), + // dot(bindMatrix1, matrix1), + // dot(bindMatrix1, matrix2), + // dot(bindMatrix1, matrix3)); + // float4 m2 = (float4)(dot(bindMatrix2, matrix0), + // dot(bindMatrix2, matrix1), + // dot(bindMatrix2, matrix2), + // dot(bindMatrix2, matrix3)); + + float4 initialPosition = (float4)(initialPos[positionOffset], + initialPos[positionOffset+1], + initialPos[positionOffset+2], + 1.0f); + /*finalPos[positionOffset] = dot(initialPosition, m0); + finalPos[positionOffset+1] = dot(initialPosition, m1); + finalPos[positionOffset+2] = dot(initialPosition, m2);*/ + finalPos[positionOffset] = hitPointX; + finalPos[positionOffset+1] = hitPointY; + finalPos[positionOffset+2] = hitPointZ; } \ No newline at end of file