Shyrma pad (#6443)
* rewrite pad op

* add more tests for pad op

* testing and fixing bugs in new pad op

* start to write batchnorm_new

* finish implementation of batchnorm_new and testing it
Yurii authored and raver119 committed Sep 14, 2018
1 parent 0a80076 commit 27b7268
Showing 8 changed files with 733 additions and 32 deletions.
20 changes: 10 additions & 10 deletions libnd4j/include/helpers/shape.h
@@ -1362,12 +1362,12 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) {
}


if (shape::isVector(shape, rank)) {
for (int i = 0; i < 2; i++)
stride[i] = 1;
return stride;
// if (shape::isVector(shape, rank)) {
// for (int i = 0; i < 2; i++)
// stride[i] = 1;
// return stride;

}
// }

int st = startNum;
for (int j = rank - 1; j >= 0; j--) {
@@ -1384,12 +1384,12 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) {
return ret;
}

if (shape::isVector(shape, rank)) {
for (int i = 0; i < 2; i++)
ret[i] = 1;
return ret;
// if (shape::isVector(shape, rank)) {
// for (int i = 0; i < 2; i++)
// ret[i] = 1;
// return ret;

}
// }

int st = startNum;
for (int j = rank - 1; j >= 0; j--) {
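For reference, the loop above computes standard c-order (row-major) strides. A minimal standalone sketch of that computation, using plain std::vector and an illustrative function name rather than the library's types:

// Minimal sketch of c-order stride computation (illustrative, not the library's helper).
#include <vector>

std::vector<long long> cOrderStrides(const std::vector<long long>& shape, long long startNum = 1) {
    const int rank = static_cast<int>(shape.size());
    std::vector<long long> stride(rank);
    long long st = startNum;
    for (int j = rank - 1; j >= 0; --j) {   // innermost dimension gets the smallest stride
        stride[j] = st;
        st *= shape[j];
    }
    return stride;
}

For shape {2, 3, 4} this yields strides {12, 4, 1}; for a vector shape {1, n} it yields {n, 1}, unlike the all-ones strides of the branch commented out above.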
94 changes: 90 additions & 4 deletions libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp
@@ -76,12 +76,9 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
else
output->assign(inputMinusMean * sigmaInvGam);

STORE_RESULT(*output);

return Status::OK();
}

//////////////////////////////////////////////////////////////////////////
DECLARE_SHAPE_FN(batchnorm) {

std::vector<const NDArray<T>*> inArrs(block.width());
@@ -96,6 +93,95 @@ DECLARE_SHAPE_FN(batchnorm) {
return SHAPELIST(outShapeInfo);
}

//////////////////////////////////////////////////////////////////////////
CUSTOM_OP_IMPL(batchnorm_new, 3, 1, false, 1, 2) {

NDArray<T>* input = INPUT_VARIABLE(0);
NDArray<T>* mean = INPUT_VARIABLE(1);
NDArray<T>* variance = INPUT_VARIABLE(2);
NDArray<T>* gamma = nullptr;
NDArray<T>* beta = nullptr;

NDArray<T>* output = OUTPUT_VARIABLE(0);

const bool applyScale = (bool)INT_ARG(0);
const bool applyOffset = (bool)INT_ARG(1);
const T epsilon = T_ARG(0);

if(applyScale)
gamma = INPUT_VARIABLE(3);
if(applyOffset)
beta = INPUT_VARIABLE(3 + static_cast<int>(applyScale));

const int numOfIntArgs = block.getIArguments()->size();
const int inRank = input->rankOf();

// get axes args to normalize input array over
std::vector<int> axes;
if(numOfIntArgs > 2)
for(int i = 2; i < numOfIntArgs; ++i)
axes.push_back(INT_ARG(i));
else
axes.push_back(inRank-1); // default dimension to reduce along is last dimension

const int numOfAxes = axes.size();
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_NEW op: too large number of input axes to normalize over, expected number should be less than or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);

// get, for example, something like {1, inDim1, 1, inDim3, 1} if axes = {1, 3}
std::vector<Nd4jLong> expShapeWithUnities(inRank, 1);
for(int i = 0; i < numOfAxes; ++i)
expShapeWithUnities[axes[i]] = input->sizeAt(axes[i]);

// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
// for example if input shape is {2,3,4,5,6} and axes = {1,3}, then expected shape would be {1,3,1,5,1}, and if axes = {3}, then expected shape would be {5}
std::vector<Nd4jLong> expShape = numOfAxes == 1 ? std::vector<Nd4jLong>(1, input->sizeAt(axes[0])) : expShapeWithUnities;
std::string expShapeStr = ShapeUtils<T>::shapeAsString(expShape);

REQUIRE_TRUE(ShapeUtils<T>::shapeAsString(mean) == expShapeStr, 0, "BATCHNORM_NEW op: wrong shape of mean array, expected is %s, but got %s instead !", expShapeStr.c_str(), ShapeUtils<T>::shapeAsString(mean).c_str());
REQUIRE_TRUE(ShapeUtils<T>::shapeAsString(variance) == expShapeStr, 0, "BATCHNORM_NEW op: wrong shape of variance array, expected is %s, but got %s instead !", expShapeStr.c_str(), ShapeUtils<T>::shapeAsString(variance).c_str());
if(gamma)
REQUIRE_TRUE(ShapeUtils<T>::shapeAsString(variance) == expShapeStr, 0, "BATCHNORM_NEW op: wrong shape of gamma array, expected is %s, but got %s instead !", expShapeStr.c_str(), ShapeUtils<T>::shapeAsString(gamma).c_str());
if(beta)
REQUIRE_TRUE(ShapeUtils<T>::shapeAsString(beta) == expShapeStr, 0, "BATCHNORM_NEW op: wrong shape of beta array, expected is %s, but got %s instead !", expShapeStr.c_str(), ShapeUtils<T>::shapeAsString(beta).c_str());

// normalized output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta

if(numOfAxes == 1 && inRank > 1) {
mean = mean->reshape(mean->ordering(), expShapeWithUnities);
variance = variance->reshape(variance->ordering(), expShapeWithUnities);
if(gamma)
gamma = gamma->reshape(gamma->ordering(), expShapeWithUnities);
if(beta)
beta = beta->reshape(beta->ordering(), expShapeWithUnities);
}

NDArray<T> sigmaInvGam = (*variance + epsilon).template transform<simdOps::RSqrt<T>>();
if(applyScale)
sigmaInvGam *= *gamma;

if (applyOffset)
output->assign((*input - *mean) * sigmaInvGam + *beta);
else
output->assign((*input - *mean) * sigmaInvGam);

if(numOfAxes == 1 && inRank > 1) {
delete mean;
delete variance;
delete gamma;
delete beta;
}

return Status::OK();
}

DECLARE_SHAPE_FN(batchnorm_new) {

Nd4jLong* outShapeInfo = nullptr;

COPY_SHAPE(inputShape->at(0), outShapeInfo); // output shape is identical to input shape

return SHAPELIST(outShapeInfo);
}
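As the comment in batchnorm_new notes, the per-element formula is output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta, broadcast over the non-normalized dimensions; for input shape {2,3,4,5,6} and axes = {1,3}, mean/variance/gamma/beta are expected to have shape {1,3,1,5,1} (or {5} when only the single axis 3 is given). A minimal scalar sketch of the formula, independent of the NDArray machinery (the function name is illustrative only):

#include <cmath>
#include <cstdio>

// Per-element batch normalization: gamma * ((x - mean) / sqrt(variance + epsilon)) + beta
double batchnormElement(double x, double mean, double variance,
                        double gamma, double beta, double epsilon) {
    const double sigmaInv = 1.0 / std::sqrt(variance + epsilon);   // the RSqrt step
    return gamma * (x - mean) * sigmaInv + beta;
}

int main() {
    // x = 2, mean = 1, variance = 4, gamma = 0.5, beta = 1, epsilon = 0  ->  0.5 * (2 - 1) / 2 + 1 = 1.25
    std::printf("%f\n", batchnormElement(2.0, 1.0, 4.0, 0.5, 1.0, 0.0));
    return 0;
}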

//////////////////////////////////////////////////////////////////////////
CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
@@ -195,7 +281,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
return Status::OK();
}

//////////////////////////////////////////////////////////////////////////

DECLARE_SHAPE_FN(batchnorm_bp) {

const bool applyScale = (bool)INT_ARG(0);
19 changes: 12 additions & 7 deletions libnd4j/include/ops/declarable/generic/transforms/pad.cpp
@@ -34,7 +34,8 @@ CUSTOM_OP_IMPL(pad, 2, 1, false, 0, 1) {

NDArray<T>* input = INPUT_VARIABLE(0);
NDArray<T>* paddings = INPUT_VARIABLE(1);
NDArray<T>* output = OUTPUT_VARIABLE(0);
NDArray<T>* output = OUTPUT_VARIABLE(0);

std::vector<int>* argI = block.getIArguments();

const int rank = input->rankOf();
@@ -44,26 +45,30 @@ CUSTOM_OP_IMPL(pad, 2, 1, false, 0, 1) {
std::string currentPaddingsShape = ShapeUtils<T>::shapeAsString(paddings);
REQUIRE_TRUE(expectedPaddingsShape == currentPaddingsShape, 0, "PAD op: wrong shape of paddings array, expected is %s, but got %s instead !", expectedPaddingsShape.c_str(), currentPaddingsShape.c_str());
T padValue = T(0.f);

// in case of REFLECT and SYMMETRIC modes paddings must obey additional shape requirements
// REFLECT case
if (argI->at(0) == 0) { // CONSTANT mode
if (INT_ARG(0) == 0) { // CONSTANT mode
if (!block.getTArguments()->empty())
padValue = T_ARG(0);
}
else if(argI->at(0) == 1)
else if(INT_ARG(0) == 1) { // REFLECT mode
for(int dim=0; dim < rank; ++dim)
REQUIRE_TRUE((*paddings)(dim,0) <= (input->shapeOf()[dim]-1) && (*paddings)(dim,1) <= (input->shapeOf()[dim]-1), 0, "PAD op: wrong content of paddings array for REFLECT mode !");
// SYMMETRIC case
if(argI->at(0) == 2)
}
if(INT_ARG(0) == 2) { // SYMMETRIC mode
for(int dim=0; dim < rank; ++dim)
REQUIRE_TRUE((*paddings)(dim,0) <= input->shapeOf()[dim] && (*paddings)(dim,1) <= input->shapeOf()[dim], 0, "PAD op: wrong content of paddings array for SYMMETRIC mode !");
}

// CONSTANT->0, REFLECT->1, SYMMETRIC->2
REQUIRE_TRUE(!(argI->at(0) < 0 || argI->at(0) > 2), 0, "PAD op: unknown padding mode, there are only three possible legal values -> 0,1,2, but got %i instead !", argI->at(0));
REQUIRE_TRUE(INT_ARG(0) >= 0 && INT_ARG(0) <= 2, 0, "PAD op: unknown padding mode, there are only three possible legal values -> 0,1,2, but got %i instead !", INT_ARG(0));

std::vector<int> dimensions(input->rankOf());
std::iota(dimensions.begin(), dimensions.end(), 0); // fill with 0, 1, ... rank-1

helpers::recursiveLoopForPad<T>(argI->at(0), *input, *paddings, *output, dimensions, 0, 0, 0, padValue);
// helpers::recursiveLoopForPad<T>(INT_ARG(0), *input, *paddings, *output, dimensions, 0, 0, 0, padValue);
helpers::pad(INT_ARG(0), *input, *paddings, *output, padValue);

return Status::OK();
}
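To illustrate the three modes validated above (0 = CONSTANT, 1 = REFLECT, 2 = SYMMETRIC) and why REFLECT tolerates paddings of at most dim-1 while SYMMETRIC tolerates dim, here is a minimal 1-D sketch on std::vector; the function is made up for illustration and is not the library's implementation:

#include <vector>

// 1-D padding sketch: mode 0 = CONSTANT, 1 = REFLECT (mirror excluding the edge),
// 2 = SYMMETRIC (mirror including the edge).
std::vector<float> pad1D(const std::vector<float>& in, int left, int right,
                         int mode, float padValue = 0.f) {
    const int n = static_cast<int>(in.size());
    std::vector<float> out;
    for (int k = left - 1; k >= 0; --k) {                       // left side
        if (mode == 0)      out.push_back(padValue);
        else if (mode == 1) out.push_back(in[k + 1]);           // needs left <= n - 1
        else                out.push_back(in[k]);               // needs left <= n
    }
    out.insert(out.end(), in.begin(), in.end());                // middle: the input itself
    for (int k = 0; k < right; ++k) {                           // right side
        if (mode == 0)      out.push_back(padValue);
        else if (mode == 1) out.push_back(in[n - 2 - k]);       // needs right <= n - 1
        else                out.push_back(in[n - 1 - k]);       // needs right <= n
    }
    return out;
}

// For in = {1, 2, 3} and left = right = 2:
//   CONSTANT  -> {0, 0, 1, 2, 3, 0, 0}
//   REFLECT   -> {3, 2, 1, 2, 3, 2, 1}
//   SYMMETRIC -> {2, 1, 1, 2, 3, 3, 2}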
5 changes: 4 additions & 1 deletion libnd4j/include/ops/declarable/headers/nn.h
@@ -107,6 +107,9 @@ namespace nd4j {
#if NOT_EXCLUDED(OP_batchnorm)
DECLARE_CUSTOM_OP(batchnorm, 3, 1, false, 1, 2);
#endif
#if NOT_EXCLUDED(OP_batchnorm)
DECLARE_CUSTOM_OP(batchnorm_new, 3, 1, false, 1, 2);
#endif

/**
* back prop in batch normalization
@@ -163,7 +166,7 @@ namespace nd4j {
* scale: 1D input array of scale factors, shape [iD]
* offset: 1D input array of offsets (shifts), shape [iD]
* mean: 1D input array of population mean used for inference, shape [iD], this array is required only if isTraining = false
* variance: 1D input array of population mean used for inference, shape [iD], this array is required only if isTraining = false
* variance: 1D input array of population variance used for inference, shape [iD], this array is required only if isTraining = false
*
* T input arguments:
* 0: epsilon, it is optional argument, default value is 0.001, this is small number to be added to the variance of x
150 changes: 144 additions & 6 deletions libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
@@ -183,14 +183,148 @@ void randomShuffle(NDArray<T>& input, NDArray<T>& output, nd4j::random::RandomBu

}

//////////////////////////////////////////////////////////////////////////
template<typename T>
void pad(const int mode, const NDArray<T>& input, const NDArray<T>& paddings, NDArray<T>& output, const T padValue ) {

const int rank = output.rankOf();
std::vector<int> dimsToExclude(rank);
std::iota(dimsToExclude.begin(), dimsToExclude.end(), 0); // fill with 0, 1, ... rank-1

Nd4jLong numLeft = paddings(rank-1,0);
Nd4jLong numRight = paddings(rank-1,1);
Nd4jLong inDimSize = input.sizeAt(rank-1);
Nd4jLong outDimSize = output.sizeAt(rank-1);

std::vector<std::vector<Nd4jLong>> outIdx = { std::vector<Nd4jLong>(2*rank), {numLeft, numLeft + inDimSize}, {0, numLeft}, {numLeft + inDimSize, outDimSize} };

for(int i = 0; i < rank-1; ++i) {
outIdx[0][2*i] = paddings(i, 0);
outIdx[0][2*i + 1] = outIdx[0][2*i] + input.sizeAt(i);
}
outIdx[0][2*rank-1] = outIdx[0][2*rank-2] = 0;

// ***** populate innermost sub-arrays firstly ***** //
dimsToExclude.pop_back();

Nd4jLong startL = mode == 1 ? 1 : 0; // REFLECT or SYMMETRIC
Nd4jLong startR = mode == 1 ? inDimSize-2 : inDimSize-1; // REFLECT or SYMMETRIC

Nd4jLong numOfSubArrs = ShapeUtils<T>::getNumOfSubArrs(input.getShapeInfo(), dimsToExclude);

NDArray<T> outSubArr0 = output(outIdx[0], true);

#pragma omp parallel for schedule(guided)
for(Nd4jLong j = 0; j < numOfSubArrs; ++j) {

NDArray<T> outSubArr1 = outSubArr0(j, dimsToExclude);
NDArray<T> inSubArr = input(j, dimsToExclude);
NDArray<T> outSubArrMid = outSubArr1(outIdx[1]);

outSubArrMid.assign(inSubArr); // assign middle

if(mode == 0) { // CONSTANT
if(numLeft != 0) {
NDArray<T> temp = outSubArr1(outIdx[2]);
temp = padValue; // assign left
}
if(numRight != 0) {
NDArray<T> temp = outSubArr1(outIdx[3]);
temp = padValue; // assign right
}
}
else { // REFLECT or SYMMETRIC

for(Nd4jLong k = numLeft-1, e = startL; k >= 0; --k, ++e) // fill left side
outSubArr1(k) = inSubArr(e);

for(Nd4jLong k = numLeft + inDimSize, e = startR; k < outDimSize; ++k, --e) // fill right side
outSubArr1(k) = inSubArr(e);
}
}

// ***** fill rest of outer sub-arrays ***** //
std::vector<Nd4jLong> outIdxInner(2,0);
std::vector<Nd4jLong> outIdxOuter(2,0);

for(int i = rank - 2; i >= 0; --i) {

dimsToExclude.pop_back();

outIdxInner.push_back(0), outIdxInner.push_back(0);
outIdxOuter.push_back(0), outIdxOuter.push_back(0);

Nd4jLong numLeft = paddings(i,0);
Nd4jLong numRight = paddings(i,1);

if(numLeft == 0 && numRight == 0)
continue;

Nd4jLong inDimSize = input.sizeAt(i);
Nd4jLong outDimSize = output.sizeAt(i);

if(mode == 0) {
outIdxOuter[0] = 0; outIdxOuter[1] = numLeft;
outIdxInner[0] = numLeft + inDimSize; outIdxInner[1] = outDimSize;
}

startL = mode == 1 ? numLeft+1 : numLeft; // REFLECT or SYMMETRIC
startR = mode == 1 ? numLeft+inDimSize-2 : numLeft+inDimSize-1; // REFLECT or SYMMETRIC

numOfSubArrs = ShapeUtils<T>::getNumOfSubArrs(output.getShapeInfo(), dimsToExclude);

#pragma omp parallel for schedule(guided) firstprivate(outIdxOuter, outIdxInner)
for(Nd4jLong j = 0; j < numOfSubArrs; ++j) {

NDArray<T> outSubArr = output(j, dimsToExclude);

if(mode == 0) { // CONSTANT

if(numLeft != 0) {
NDArray<T> temp = outSubArr(outIdxOuter);
temp = padValue; // assign left
}

if(numRight != 0) {
NDArray<T> temp = outSubArr(outIdxInner);
temp = padValue; // assign right
}
}
else { // REFLECT or SYMMETRIC

for(Nd4jLong k = numLeft-1, e = startL; k >= 0; --k, ++e) { // fill left side
outIdxOuter[0] = k;
outIdxOuter[1] = k+1;
outIdxInner[0] = e;
outIdxInner[1] = e+1;
NDArray<T> outSubArrInner = outSubArr(outIdxInner);
NDArray<T> outSubArrOuter = outSubArr(outIdxOuter);
outSubArrOuter.assign(outSubArrInner);
}

for(Nd4jLong k = numLeft + inDimSize, e = startR; k < outDimSize; ++k, --e) { // fill right side
outIdxOuter[0] = k;
outIdxOuter[1] = k+1;
outIdxInner[0] = e;
outIdxInner[1] = e+1;
NDArray<T> outSubArrInner = outSubArr(outIdxInner);
NDArray<T> outSubArrOuter = outSubArr(outIdxOuter);
outSubArrOuter.assign(outSubArrInner);
}
}
}
}
}
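The helper above fills the innermost dimension of every sub-array first, and then fills the outer dimensions by copying whole, already-populated slices of the output. A hand-rolled 2-D sketch of that two-phase order for SYMMETRIC mode (plain std::vector, illustrative only, not the library's code):

#include <cstdio>
#include <vector>

int main() {
    const std::vector<std::vector<float>> in = {{1, 2, 3}, {4, 5, 6}};
    const int padTop = 1, padBottom = 1, padLeft = 2, padRight = 2;
    const int inRows = 2, inCols = 3;
    const int outRows = inRows + padTop + padBottom;              // 4
    const int outCols = inCols + padLeft + padRight;              // 7

    std::vector<std::vector<float>> out(outRows, std::vector<float>(outCols, 0.f));

    // Phase 1: fill the "middle" rows, padding the innermost dimension of each one.
    for (int r = 0; r < inRows; ++r) {
        for (int c = 0; c < inCols; ++c)
            out[padTop + r][padLeft + c] = in[r][c];                          // middle
        for (int p = 0; p < padLeft; ++p)
            out[padTop + r][padLeft - 1 - p] = in[r][p];                      // SYMMETRIC left
        for (int p = 0; p < padRight; ++p)
            out[padTop + r][padLeft + inCols + p] = in[r][inCols - 1 - p];    // SYMMETRIC right
    }

    // Phase 2: fill the outer rows by copying rows of the output that are already complete.
    for (int p = 0; p < padTop; ++p)
        out[padTop - 1 - p] = out[padTop + p];                                // fill top rows
    for (int p = 0; p < padBottom; ++p)
        out[padTop + inRows + p] = out[padTop + inRows - 1 - p];              // fill bottom rows

    for (const auto& row : out) {
        for (float v : row) std::printf("%g ", v);
        std::printf("\n");
    }
    // Expected output:
    //   2 1 1 2 3 3 2
    //   2 1 1 2 3 3 2
    //   5 4 4 5 6 6 5
    //   5 4 4 5 6 6 5
    return 0;
}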



////////////////////////////////////////////////////////////////////////
// initial values of inIdx, outIdx, dim must be equal to zero
/*// initial values of inIdx, outIdx, dim must be equal to zero
template<typename T>
void recursiveLoopForPad(const int mode, NDArray<T>& input, const NDArray<T>& paddings, NDArray<T>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, T padValue ) {
int leftOffset;
// dimensions are array of input dimensions, it is sorted by increasing order
// dimensions are array of input dimensions, it is sorted in increasing order
// every time at the beginning we erase first element from it (not good idea to use vector for this purpose, but luckily it is small enough)
// then we use this array for tads building, every time while recursion the number of built tads becomes bigger
dimensions.erase(dimensions.begin());
@@ -322,7 +456,7 @@ void recursiveLoopForPad(const int mode, NDArray<T>& input, const NDArray<T>& pa
break;
}
}

*/

////////////////////////////////////////////////////////////////////////
template<typename T>
@@ -1011,9 +1145,13 @@ template void randomShuffle<float>(NDArray<float>& input, NDArray<float>& output
template void randomShuffle<float16>(NDArray<float16>& input, NDArray<float16>& output, nd4j::random::RandomBuffer& rng, const bool isInplace);
template void randomShuffle<double>(NDArray<double>& input, NDArray<double>& output, nd4j::random::RandomBuffer& rng, const bool isInplace);

template void recursiveLoopForPad<float>(const int mode, NDArray<float>& input, const NDArray<float>& paddings, NDArray<float>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, float padValue);
template void recursiveLoopForPad<float16>(const int mode, NDArray<float16>& input, const NDArray<float16>& paddings, NDArray<float16>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, float16 padValue);
template void recursiveLoopForPad<double>(const int mode, NDArray<double>& input, const NDArray<double>& paddings, NDArray<double>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, double padValue);
// template void recursiveLoopForPad<float>(const int mode, NDArray<float>& input, const NDArray<float>& paddings, NDArray<float>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, float padValue);
// template void recursiveLoopForPad<float16>(const int mode, NDArray<float16>& input, const NDArray<float16>& paddings, NDArray<float16>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, float16 padValue);
// template void recursiveLoopForPad<double>(const int mode, NDArray<double>& input, const NDArray<double>& paddings, NDArray<double>& output, std::vector<int> dimensions, int dim, int inIdx, int outIdx, double padValue);

template void pad<float16>(const int mode, const NDArray<float16>& input, const NDArray<float16>& paddings, NDArray<float16>& output, const float16 padValue);
template void pad<float>(const int mode, const NDArray<float>& input, const NDArray<float>& paddings, NDArray<float>& output, const float padValue);
template void pad<double>(const int mode, const NDArray<double>& input, const NDArray<double>& paddings, NDArray<double>& output, const double padValue);

template void invertPermutation<float>(const NDArray<float>& input, NDArray<float>& output);
template void invertPermutation<float16>(const NDArray<float16>& input, NDArray<float16>& output);
