diff --git a/cgcmake b/cgcmake
index fb1fd4d..b673fc3 160000
--- a/cgcmake
+++ b/cgcmake
@@ -1 +1 @@
-Subproject commit fb1fd4da8f1612bde6594ae899a36ac685605c1e
+Subproject commit b673fc30af39b9bf1d5a96237fd000df069449c2
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9110ea3..2578032 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,8 +10,9 @@ set(SOURCE_FILES
     #"cvWrapRebindCmd.h"
     "common.cpp"
     "common.h"
+    "cvwrap.cl"
 )
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MAYA_CXX_FLAGS} /arch:AVX")
 find_package(Maya REQUIRED)
 
 include_directories(${MAYA_INCLUDE_DIR})
diff --git a/src/common.cpp b/src/common.cpp
index 040f11e..d20987d 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -421,13 +421,13 @@ void GetValidUpAndNormal(const MDoubleArray& weights, const MPointArray& points,
   // Adjust up if it's parallel to normal or if it's zero length
   if (abs((unitUp * normal) - 1.0) < 0.001 || up.length() < 0.0001) {
     for (unsigned int j = 0; j < weights.length()-1; ++j) {
+      up -= (points[sampleIds[j]] - origin) * weights[j];
+      unitUp = up.normal();
       if (abs((unitUp * normal) - 1.0) > 0.001 && up.length() > 0.0001) {
         // If the up and normal vectors are no longer parallel and the up vector has a length,
         // then we are good to go.
         break;
       }
-      up -= (points[sampleIds[j]] - origin) * weights[j];
-      unitUp = up.normal();
     }
     up.normalize();
   } else {
diff --git a/src/cvWrapDeformer.cpp b/src/cvWrapDeformer.cpp
index e56f7dc..5fd39d2 100644
--- a/src/cvWrapDeformer.cpp
+++ b/src/cvWrapDeformer.cpp
@@ -249,6 +249,7 @@ MStatus CVWrap::deform(MDataBlock& data, MItGeometry& itGeo, const MMatrix& loca
     taskData_.resize(geomIndex+1);
   }
   TaskData& taskData = taskData_[geomIndex];
+  std::cerr << "deform\n";
   
   // Get driver geo
   MDataHandle hDriverGeo = data.inputValue(aDriverGeo, &status);
@@ -380,13 +381,12 @@ MThreadRetVal CVWrap::EvaluateWrap(void *pParam) {
 
     MPoint origin;
     MVector normal, up;
-    CalculateBasisComponents(sampleWeights[index], baryCoords[index], triangleVerts[i],
+    CalculateBasisComponents(sampleWeights[index], baryCoords[index], triangleVerts[index],
                              driverPoints, driverNormals, sampleIds[index], alignedStorage,
                              origin, up, normal);
 
     CreateMatrix(origin, normal, up, matrix);
     matrix = scaleMatrix * matrix;
-    MPoint dpt = points[i];
     MMatrix tempMatrix = bindMatrices[index] * matrix;
     MPoint newPt = ((points[i]  * drivenMatrix) * (bindMatrices[index] * matrix)) * drivenInverseMatrix;
     points[i] = points[i] + ((newPt - points[i]) * paintWeights[i] * env);
@@ -435,7 +435,6 @@ CVWrapGPU::~CVWrapGPU() {
 
 bool CVWrapGPU::ValidateNode(MDataBlock& block, const MEvaluationNode& evaluationNode,
                              const MPlug& plug, MStringArray* messages) {
-  std::cerr << "CVWrapGPU::ValidateNode\n";
   return true;
 }
 
@@ -447,21 +446,30 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block,
                                                    const MAutoCLEvent inputEvent,
                                                    MAutoCLMem outputBuffer,
                                                    MAutoCLEvent& outputEvent) {
+  MStatus status;
   // evaluate has two main pieces of work.  I need to transfer any data I care about onto the GPU, and I need to run my OpenCL Kernel.
   // First, transfer the data.  offset has two pieces of data I need to transfer to the GPU, the weight array and the offset matrix.
   // I don't need to transfer down the input position buffer, that is already handled by the deformer evaluator, the points are in inputBuffer.
   numElements_ = numElements;
-  EnqueueBindData(block, evaluationNode, plug);
-  EnqueueDriverData(block, evaluationNode, plug);
-  EnqueuePaintMapData(block, evaluationNode, numElements, plug);
+  status = EnqueueBindData(block, evaluationNode, plug);
+  CHECK_MSTATUS(status);
+  status = EnqueueDriverData(block, evaluationNode, plug);
+  CHECK_MSTATUS(status);
+  status = EnqueuePaintMapData(block, evaluationNode, numElements, plug);
+  CHECK_MSTATUS(status);
 
   // Now that all the data we care about is on the GPU, setup and run the OpenCL Kernel
   if (!kernel_.get())  {
     // Load the OpenCL kernel if we haven't yet.
     MString openCLKernelFile(pluginLoadPath);
     openCLKernelFile += "/cvwrap.cl";
-    kernel_ = MOpenCLInfo::getOpenCLKernel(openCLKernelFile, CVWrap::kName);
+    kernel_ = MOpenCLInfo::getOpenCLKernel(openCLKernelFile, "cvwrap");
+    if (kernel_.isNull())  {
+      std::cerr << "Could not compile kernel " << openCLKernelFile << "\n";
+      return MPxGPUDeformer::kDeformerFailure;
+    }
   }
+  std::cerr << "Evaluate\n";
 
   cl_int err = CL_SUCCESS;
   
@@ -472,11 +480,27 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block,
   MOpenCLInfo::checkCLErrorStatus(err);
   err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)inputBuffer.getReadOnlyRef());
   MOpenCLInfo::checkCLErrorStatus(err);
-  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)fCLWeights.getReadOnlyRef());
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)driverPoints_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)driverNormals_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)paintWeights_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleCounts_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleOffsets_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleIds_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)sampleWeights_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)triangleVerts_.getReadOnlyRef());
   MOpenCLInfo::checkCLErrorStatus(err);
-  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)fOffsetMatrix.getReadOnlyRef());
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)baryCoords_.getReadOnlyRef());
   MOpenCLInfo::checkCLErrorStatus(err);
-  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_uint), (void*)&fNumElements);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_mem), (void*)bindMatrices_.getReadOnlyRef());
+  MOpenCLInfo::checkCLErrorStatus(err);
+  err = clSetKernelArg(kernel_.get(), parameterId++, sizeof(cl_uint), (void*)&numElements_);
   MOpenCLInfo::checkCLErrorStatus(err);
 
   // Figure out a good work group size for our kernel.
@@ -492,14 +516,16 @@ MPxGPUDeformer::DeformerStatus CVWrapGPU::evaluate(MDataBlock& block,
   MOpenCLInfo::checkCLErrorStatus(err);
 
   size_t localWorkSize = 256;
-  if (retSize > 0) localWorkSize = workGroupSize;
-  size_t globalWorkSize = (localWorkSize - fNumElements % localWorkSize) + fNumElements; // global work size must be a multiple of localWorkSize
+  if (retSize > 0) {
+    localWorkSize = workGroupSize;
+  }
+  // global work size must be a multiple of localWorkSize
+  size_t globalWorkSize = (localWorkSize - numElements_ % localWorkSize) + numElements_;
 
   // set up our input events.  The input event could be NULL, in that case we need to pass
   // slightly different parameters into clEnqueueNDRangeKernel
   unsigned int numInputEvents = 0;
-  if (inputEvent.get())
-  {
+  if (inputEvent.get()) {
     numInputEvents = 1;
   }
 
@@ -556,13 +582,17 @@ MStatus CVWrapGPU::EnqueueBindData(MDataBlock& data, const MEvaluationNode& eval
   // Store samples per vertex
   arraySize = taskData.sampleIds.size();
   int* samplesPerVertex = new int[arraySize];
+  int* sampleOffsets = new int[arraySize];
   int totalSamples = 0;
   for(size_t i = 0; i < taskData.sampleIds.size(); ++i) {
     samplesPerVertex[i] = (int)taskData.sampleIds[i].length();
     totalSamples += samplesPerVertex[i];
+    sampleOffsets[i] = totalSamples;
   }
   err = EnqueueBuffer(sampleCounts_, arraySize * sizeof(int), (void*)samplesPerVertex);
+  err = EnqueueBuffer(sampleOffsets_, arraySize * sizeof(int), (void*)sampleOffsets);
   delete [] samplesPerVertex;
+  delete [] sampleOffsets;
 
   // Store sampleIds and sampleWeights
   int* sampleIds = new int[totalSamples];
@@ -596,7 +626,7 @@ MStatus CVWrapGPU::EnqueueBindData(MDataBlock& data, const MEvaluationNode& eval
   err = EnqueueBuffer(baryCoords_, arraySize * sizeof(float), (void*)baryCoords);
   delete [] triangleVerts;
   delete [] baryCoords;
-
+  return MS::kSuccess;
 }
 
 
@@ -680,5 +710,21 @@ MStatus CVWrapGPU::EnqueuePaintMapData(MDataBlock& data,
   delete [] paintWeights;
   return MS::kSuccess;
 }
+
+
+void CVWrapGPU::terminate() {
+  driverPoints_.reset();
+  driverNormals_.reset();
+  paintWeights_.reset();
+  bindMatrices_.reset();
+  sampleCounts_.reset();
+  sampleIds_.reset();
+  sampleWeights_.reset();
+  triangleVerts_.reset();
+  baryCoords_.reset();
+	MOpenCLInfo::releaseOpenCLKernel(kernel_);
+	kernel_.reset();
+}
+
 #endif
 
diff --git a/src/cvWrapDeformer.h b/src/cvWrapDeformer.h
index 6c74e7a..0168463 100644
--- a/src/cvWrapDeformer.h
+++ b/src/cvWrapDeformer.h
@@ -121,6 +121,7 @@ class CVWrapGPU : public MPxGPUDeformer {
 	MAutoCLMem paintWeights_;
 	MAutoCLMem bindMatrices_;
 	MAutoCLMem sampleCounts_;
+	MAutoCLMem sampleOffsets_;
 	MAutoCLMem sampleIds_;
 	MAutoCLMem sampleWeights_;
 	MAutoCLMem triangleVerts_;
diff --git a/src/cvwrap.cl b/src/cvwrap.cl
index f095f2c..3c3bd06 100644
--- a/src/cvwrap.cl
+++ b/src/cvwrap.cl
@@ -2,43 +2,149 @@
 	offset kernels
 */
 
-__kernel void offset(
-	__global float* finalPos,							//float3
-	__global const float* initialPos,					//float3
-	__global const float* weights,
-	__global const float4* matrices,					//first matrix is offset matrix, second matrix is offset matrix inverse
+__kernel void cvwrap(
+	__global float* finalPos,            //float3
+	__global const float* initialPos,    //float3
+	__global const float* driverPoints,  //float3
+	__global const float* driverNormals, //float3
+	__global const float* paintWeights,
+	__global const int* sampleCounts,
+	__global const int* sampleOffsets,
+	__global const int* sampleIds,
+	__global const float* sampleWeights,
+	__global const int* triangleVerts,  //int3
+	__global const float* baryCoords,  //float3
+	__global const float4* bindMatrices,
 	const uint positionCount)
 {
 	unsigned int positionId = get_global_id(0);				// access finalPos and initialPos using this value
-	if (positionId >= positionCount ) return;					// We create an execute unit for more indices then we have data for, just exit early if this guy if one of the extras
+	if (positionId >= positionCount) {
+    // We create an execute unit for more indices then we have data for, just exit early if this guy if one of the extras
+    return;					
+  }
 	unsigned int positionOffset = positionId * 3;				// Base positions are float3 when they come in here!
-	float4 initialPosition;
-	initialPosition.x = initialPos[positionOffset];
-	initialPosition.y = initialPos[positionOffset+1];
-	initialPosition.z = initialPos[positionOffset+2];
-	initialPosition.w = 1.0f;
-
-	float4 finalPosition;
-	finalPosition.x = 0.0f;
-	finalPosition.y = 0.0f;
-	finalPosition.z = 0.0f;
-	finalPosition.w = 1.0f;
-
-	__global const float4* matrixInverse = &(matrices[4]);
-	__global const float4* matrix = matrices;
-
-	// point *= matrix inverse
-	finalPosition.x = dot(initialPosition, matrixInverse[0]);
-	finalPosition.y = dot(initialPosition, matrixInverse[1]);
-	finalPosition.z = dot(initialPosition, matrixInverse[2]);
-
-	// pt.y += weight
-	finalPosition.y += weights[positionId];
-
-	// point *= matrix
-	// can't write back into finalPosition here, we need to use the same value to calculate xyz
-	// instead write into global memory
-	finalPos[positionOffset] = dot(finalPosition, matrix[0]);
-	finalPos[positionOffset+1] = dot(finalPosition, matrix[1]);
-	finalPos[positionOffset+2] = dot(finalPosition, matrix[2]);
+	
+
+  // Start with the recreated point and normal using the barycentric coordinates of the hit point.
+  float baryA = baryCoords[positionOffset];
+  float baryB = baryCoords[positionOffset+1];
+  float baryC = baryCoords[positionOffset+2];
+  int triVertA = triangleVerts[positionOffset];
+  int triVertB = triangleVerts[positionOffset+1];
+  int triVertC = triangleVerts[positionOffset+2];
+  float hitPointX = driverPoints[triVertA] * baryA +
+                    driverPoints[triVertB] * baryB +
+                    driverPoints[triVertC] * baryC;
+  float hitPointY = driverPoints[triVertA+1] * baryA +
+                    driverPoints[triVertB+1] * baryB +
+                    driverPoints[triVertC+1] * baryC;
+  float hitPointZ = driverPoints[triVertA+2] * baryA +
+                    driverPoints[triVertB+2] * baryB +
+                    driverPoints[triVertC+2] * baryC;
+	float hitNormalX = driverNormals[triVertA] * baryA +
+                     driverNormals[triVertB] * baryB +
+                     driverNormals[triVertC] * baryC;
+  float hitNormalY = driverNormals[triVertA+1] * baryA +
+                     driverNormals[triVertB+1] * baryB +
+                     driverNormals[triVertC+1] * baryC;
+  float hitNormalZ = driverNormals[triVertA+2] * baryA +
+                     driverNormals[triVertB+2] * baryB +
+                     driverNormals[triVertC+2] * baryC;
+
+  // Create the barycentric point and normal.
+ // int hitIndex = sampleOffsets[positionId] + sampleCounts[positionId] - 1;
+ // float hitWeight = sampleWeights[hitIndex];
+ // float originX = hitPointX * hitWeight;
+ // float originY = hitPointY * hitWeight;
+ // float originZ = hitPointZ * hitWeight;
+ // float normalX = hitNormalX * hitWeight;
+ // float normalY = hitNormalY * hitWeight;
+ // float normalZ = hitNormalZ * hitWeight;
+
+ // // Then use the weighted adjacent data.
+ // for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) {
+ //   float sw = sampleWeights[j];
+ //   originX += driverPoints[sampleIds[j]*3]   * sw;
+ //   originY += driverPoints[sampleIds[j]*3+1] * sw;
+ //   originZ += driverPoints[sampleIds[j]*3+2] * sw;
+
+ //   normalX += driverNormals[sampleIds[j]*3]   * sw;
+ //   normalY += driverNormals[sampleIds[j]*3+1] * sw;
+ //   normalZ += driverNormals[sampleIds[j]*3+2] * sw;
+ // }
+
+ // // Calculate the up vector
+ // float upX = (hitPointX - originX) * hitWeight;
+ // float upY = (hitPointY - originY) * hitWeight;
+ // float upZ = (hitPointZ - originZ) * hitWeight;
+ // for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) {
+ //   float sw = sampleWeights[j];
+ //   upX += (driverPoints[sampleIds[j]*3] - originX) * sw;
+ //   upY += (driverPoints[sampleIds[j]*3+1] - originY) * sw;
+ //   upZ += (driverPoints[sampleIds[j]*3+2] - originZ) * sw;
+ // }
+
+ // // Use float3 so we can use the built-in functions.  We are mostly using single floats
+ // // because the preferred vector width of most gpu's these days is 1.
+ // float3 up = (float3)(upX, upY, upZ);
+ // float3 normal = (float3)(normalX, normalY, normalZ);
+ // float3 unitUp = fast_normalize(up);
+ // float upLength = fast_length(up);
+ // if (fabs(dot(unitUp, normal) - 1.0f) > 0.001f && upLength > 0.0001f) {
+ //   for (uint j = sampleOffsets[positionId]; j < hitIndex; j++) {
+ //     up.x -= (driverPoints[sampleIds[j]*3] - originX) * sampleWeights[j];
+ //     up.y -= (driverPoints[sampleIds[j]*3+1] - originY) * sampleWeights[j];
+ //     up.z -= (driverPoints[sampleIds[j]*3+2] - originZ) * sampleWeights[j];
+ //     unitUp = fast_normalize(up);
+ //     upLength = fast_length(up);
+ //     if (fabs(dot(unitUp, normal) - 1.0f) > 0.001f && upLength > 0.0001f) {
+ //       // If the up and normal vectors are no longer parallel and the up vector has a length,
+ //       // then we are good to go.
+ //       break;
+ //     }
+ //   }
+ //   up = fast_normalize(up);
+ // } else {
+ //   up = unitUp;
+ // }
+
+ // // Create the transform matrix
+ // // Store by columns so we can use dot to multiply with the bind matrix
+ // float3 x = cross(normal, up);
+ // float3 z = cross(normal, x);
+ // float4 matrix0 = (float4)(x.x, normal.x, z.x, originX);
+ // float4 matrix1 = (float4)(x.y, normal.y, z.y, originY);
+ // float4 matrix2 = (float4)(x.z, normal.z, z.z, originZ);
+ // float4 matrix3 = (float4)(0.0f, 0.0f, 0.0f, 1.0f);
+
+ // // TODO: scale matrix mult
+
+ // // Multiply bindMatrix with matrix
+	//__global const float4* bindMatrix = &(bindMatrices[positionId*4]);
+ // float4 bindMatrix0 = bindMatrix[0];
+ // float4 bindMatrix1 = bindMatrix[1];
+ // float4 bindMatrix2 = bindMatrix[2];
+ // float4 m0 = (float4)(dot(bindMatrix0, matrix0),
+ //                      dot(bindMatrix0, matrix1),
+ //                      dot(bindMatrix0, matrix2),
+ //                      dot(bindMatrix0, matrix3));
+ // float4 m1 = (float4)(dot(bindMatrix1, matrix0),
+ //                      dot(bindMatrix1, matrix1),
+ //                      dot(bindMatrix1, matrix2),
+ //                      dot(bindMatrix1, matrix3));
+ // float4 m2 = (float4)(dot(bindMatrix2, matrix0),
+ //                      dot(bindMatrix2, matrix1),
+ //                      dot(bindMatrix2, matrix2),
+ //                      dot(bindMatrix2, matrix3));
+
+	float4 initialPosition = (float4)(initialPos[positionOffset],
+	                                  initialPos[positionOffset+1],
+	                                  initialPos[positionOffset+2],
+                                    1.0f);
+	/*finalPos[positionOffset] = dot(initialPosition, m0);
+	finalPos[positionOffset+1] = dot(initialPosition, m1);
+	finalPos[positionOffset+2] = dot(initialPosition, m2);*/
+  finalPos[positionOffset] = hitPointX;
+	finalPos[positionOffset+1] = hitPointY;
+	finalPos[positionOffset+2] = hitPointZ;
 }
\ No newline at end of file