Add CuDNN dropout support #5501
Merged: 7 commits, Jun 8, 2018
@@ -198,4 +198,31 @@ public boolean checkSupported() {
}
return supported;
}


/**
* From the CuDNN documentation:
* "Tensors are restricted to having at least 4 dimensions... When working with lower dimensional data, it is
* recommended that the user create a 4D tensor, and set the size along unused dimensions to 1."
*
* This method implements that recommendation: it appends 1s to the end of the array to make it length 4,
* or returns the array unmodified if its length is already 4 or more.
* It can be used for both shape and stride arrays.
*
* @param shapeOrStrides shape or stride array to adapt
* @return the input array if its length is already >= 4, otherwise a length-4 copy padded with trailing 1s
*/
protected static int[] adaptForTensorDescr(int[] shapeOrStrides){
if(shapeOrStrides.length >= 4)
return shapeOrStrides;
int[] out = new int[4];
int i=0;
for(; i<shapeOrStrides.length; i++ ){
out[i] = shapeOrStrides[i];
}
for(; i<4; i++ ){
out[i] = 1;
}
return out;
}
}
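
(Editor's illustration, not part of the PR diff.) Called from a BaseCudnnHelper subclass, the expected behaviour of adaptForTensorDescr for a 2D dense-layer activation would be as follows; the concrete values are hypothetical:

int[] shape  = {3, 8};                         //[minibatch, features]
int[] stride = {8, 1};                         //row-major ('c' order) strides
int[] shape4d  = adaptForTensorDescr(shape);   //-> {3, 8, 1, 1}
int[] stride4d = adaptForTensorDescr(stride);  //-> {8, 1, 1, 1}
//A 4D shape such as {minibatch, channels, height, width} is returned unchanged
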
@@ -0,0 +1,179 @@
package org.deeplearning4j.nn.layers.dropout;

import lombok.Data;
import org.bytedeco.javacpp.*;
import org.deeplearning4j.nn.conf.dropout.DropoutHelper;
import org.deeplearning4j.nn.layers.BaseCudnnHelper;
import org.nd4j.jita.allocator.Allocator;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.jita.conf.CudaEnvironment;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.context.CudaContext;
import org.nd4j.linalg.util.ArrayUtil;

import static org.bytedeco.javacpp.cudnn.*;

/**
* CuDNN dropout helper
*
* Note that for repeatability between calls (for example, for gradient checks), we need to do two things:
* (a) set the ND4J RNG seed
* (b) clear the rngStates field
*
* @author Alex Black
*/
@Data
public class CudnnDropoutHelper extends BaseCudnnHelper implements DropoutHelper {

private static class CudnnDropoutContext extends CudnnContext {

private static class Deallocator extends CudnnDropoutContext implements Pointer.Deallocator {
Deallocator(CudnnDropoutContext c) {
super(c);
}

@Override
public void deallocate() {
destroyHandles();
}
}

private cudnn.cudnnTensorStruct xTensorDesc = new cudnn.cudnnTensorStruct(); //Input
private cudnn.cudnnTensorStruct dxTensorDesc = new cudnn.cudnnTensorStruct(); //Grad at input
private cudnn.cudnnTensorStruct yTensorDesc = new cudnn.cudnnTensorStruct(); //Output
private cudnn.cudnnTensorStruct dyTensorDesc = new cudnn.cudnnTensorStruct(); //Grad at output
private cudnn.cudnnDropoutStruct dropoutDesc = new cudnn.cudnnDropoutStruct();

public CudnnDropoutContext() {
createHandles();
deallocator(new Deallocator(this));
}

public CudnnDropoutContext(CudnnDropoutContext c) {
super(c);
xTensorDesc = new cudnn.cudnnTensorStruct(c.xTensorDesc);
dxTensorDesc = new cudnn.cudnnTensorStruct(c.dxTensorDesc);
yTensorDesc = new cudnn.cudnnTensorStruct(c.yTensorDesc);
dyTensorDesc = new cudnn.cudnnTensorStruct(c.dyTensorDesc);
dropoutDesc = new cudnn.cudnnDropoutStruct(c.dropoutDesc);
}

@Override
protected void createHandles() {
super.createHandles();
checkCudnn(cudnnCreateTensorDescriptor(xTensorDesc));
checkCudnn(cudnnCreateTensorDescriptor(dxTensorDesc));
checkCudnn(cudnnCreateTensorDescriptor(yTensorDesc));
checkCudnn(cudnnCreateTensorDescriptor(dyTensorDesc));
checkCudnn(cudnnCreateDropoutDescriptor(dropoutDesc));
}

@Override
protected void destroyHandles() {
checkCudnn(cudnnDestroyTensorDescriptor(xTensorDesc));
checkCudnn(cudnnDestroyTensorDescriptor(dxTensorDesc));
checkCudnn(cudnnDestroyTensorDescriptor(yTensorDesc));
checkCudnn(cudnnDestroyTensorDescriptor(dyTensorDesc));
checkCudnn(cudnnDestroyDropoutDescriptor(dropoutDesc));
super.destroyHandles();
}
}

private CudnnDropoutContext cudnnContext = new CudnnDropoutContext();
private boolean initializedDescriptor = false;
private DataCache rngStates; //"Pointer to user-allocated GPU memory that will hold random number generator states."
private DataCache mask; //Mask: persisted between the forward and backward passes
private SizeTPointer stateSizeBytesPtr;
private SizeTPointer reserveSizeBytesPtr;
private float lastInitializedP;

@Override
public void applyDropout(INDArray input, INDArray resultArray, double dropoutInputRetainProb) {
float p = (float)(1.0 - dropoutInputRetainProb); //CuDNN uses p = probability of setting to 0. We use p = probability of retaining

//TODO int cast
int[] inShape = adaptForTensorDescr(ArrayUtil.toInts(input.shape()));
int[] inStride = adaptForTensorDescr(ArrayUtil.toInts(input.stride()));
checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.xTensorDesc, dataType, inShape.length, inShape, inStride));

int[] outShape = adaptForTensorDescr(ArrayUtil.toInts(resultArray.shape()));
int[] outStride = adaptForTensorDescr(ArrayUtil.toInts(resultArray.stride()));
checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.yTensorDesc, dataType, outShape.length, outShape, outStride));


if(stateSizeBytesPtr == null){
stateSizeBytesPtr = new SizeTPointer(1);
reserveSizeBytesPtr = new SizeTPointer(1);
}
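//Query cuDNN for the RNG state size (in bytes, for the dropout descriptor) and the reserve space size
//(in bytes, dependent on the input shape) - the reserve space holds the dropout mask between passes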
checkCudnn(cudnnDropoutGetStatesSize(cudnnContext, stateSizeBytesPtr));
long rngStateSizeBytes = stateSizeBytesPtr.get();
checkCudnn(cudnnDropoutGetReserveSpaceSize(cudnnContext.xTensorDesc, reserveSizeBytesPtr));
long maskReserveSizeBytes = reserveSizeBytesPtr.get();



//Dropout descriptor:
if(rngStates == null || rngStates.capacity() < rngStateSizeBytes){
if(rngStates != null)
rngStates.deallocate();
//states = "Pointer to user-allocated GPU memory that will hold random number generator states."
rngStates = new DataCache(rngStateSizeBytes);
initializedDescriptor = false;
}
if(mask == null || mask.capacity() < maskReserveSizeBytes){
if(mask != null)
mask.deallocate();
//mask = "Pointer to user-allocated GPU memory used by this function. It is expected
//that contents of reserveSpace doe not change between cudnnDropoutForward and
//cudnnDropoutBackward calls."
mask = new DataCache(maskReserveSizeBytes);
}

if(!initializedDescriptor || p != lastInitializedP) {
//NOTE: cudnnSetDropoutDescriptor has some internal computation/initialization, and hence is expensive to
// call - so we want to call this as infrequently as possible, and cache the result
long seed = Nd4j.getRandom().nextLong();
lastInitializedP = p;
checkCudnn(cudnnSetDropoutDescriptor(cudnnContext.dropoutDesc, cudnnContext, p, rngStates, rngStates.capacity(), seed));
initializedDescriptor = true;
}

Allocator allocator = AtomicAllocator.getInstance();
CudaContext context = allocator.getFlowController().prepareAction(input, resultArray);
Pointer xPtr = allocator.getPointer(input, context);
Pointer yPtr = allocator.getPointer(resultArray, context);

checkCudnn(cudnnSetStream(cudnnContext, new cuda.CUstream_st(context.getOldStream())));
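//Forward pass runs on ND4J's CUDA stream; cuDNN writes the dropout pattern into mask (the reserve space)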
checkCudnn(cudnnDropoutForward(cudnnContext, cudnnContext.dropoutDesc, cudnnContext.xTensorDesc, xPtr,
cudnnContext.yTensorDesc, yPtr, mask, mask.capacity()));

allocator.registerAction(context, input, resultArray);
if (CudaEnvironment.getInstance().getConfiguration().isDebug())
context.syncOldStream();
}

@Override
public void backprop(INDArray gradAtOutput, INDArray gradAtInput) {
int[] gradAtOutShape = adaptForTensorDescr(ArrayUtil.toInts(gradAtOutput.shape()));
int[] gradAtOutStride = adaptForTensorDescr(ArrayUtil.toInts(gradAtOutput.stride()));
checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.dyTensorDesc, dataType, gradAtOutShape.length, gradAtOutShape, gradAtOutStride));

int[] gradAtInShape = adaptForTensorDescr(ArrayUtil.toInts(gradAtInput.shape()));
int[] gradAtInStride = adaptForTensorDescr(ArrayUtil.toInts(gradAtInput.stride()));
checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.dxTensorDesc, dataType, gradAtInShape.length, gradAtInShape, gradAtInStride));

Allocator allocator = AtomicAllocator.getInstance();
CudaContext context = allocator.getFlowController().prepareAction(gradAtOutput, gradAtInput);
Pointer dyPtr = allocator.getPointer(gradAtOutput, context);
Pointer dxPtr = allocator.getPointer(gradAtInput, context);

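//Backward pass applies the mask (reserve space) saved by cudnnDropoutForward to the output gradients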
checkCudnn(cudnnDropoutBackward(cudnnContext, cudnnContext.dropoutDesc, cudnnContext.dyTensorDesc, dyPtr,
cudnnContext.dxTensorDesc, dxPtr, mask, mask.capacity()));

allocator.registerAction(context, gradAtOutput, gradAtInput);
if (CudaEnvironment.getInstance().getConfiguration().isDebug())
context.syncOldStream();
}
}
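
(Editor's note, not DL4J or cuDNN code.) cudnnDropoutForward implements inverted dropout: each element is zeroed with probability p and the survivors are scaled by 1/(1 - p), so no extra rescaling is needed at inference time. A minimal CPU sketch of that semantic, using plain Java arrays and a hypothetical class name:

import java.util.Random;

public class InvertedDropoutSketch {
    /** Zero each element with probability p = 1 - retainProb; scale survivors by 1/retainProb. */
    public static double[] apply(double[] input, double retainProb, long seed) {
        Random rng = new Random(seed);
        double[] out = new double[input.length];
        for (int i = 0; i < input.length; i++) {
            boolean retained = rng.nextDouble() < retainProb;
            out[i] = retained ? input[i] / retainProb : 0.0;
        }
        return out;
    }
}

The backward pass applies the same masking and scaling to the output gradients, which is why the helper keeps the mask (reserve space) between applyDropout and backprop.
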
@@ -1,17 +1,23 @@
package org.deeplearning4j.gradientcheck;

import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.TestUtils;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.distribution.UniformDistribution;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.dropout.IDropout;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.layers.convolution.ConvolutionHelper;
import org.deeplearning4j.nn.layers.convolution.CudnnConvolutionHelper;
import org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingHelper;
import org.deeplearning4j.nn.layers.dropout.CudnnDropoutHelper;
import org.deeplearning4j.nn.layers.normalization.BatchNormalizationHelper;
import org.deeplearning4j.nn.layers.normalization.CudnnBatchNormalizationHelper;
import org.deeplearning4j.nn.layers.normalization.CudnnLocalResponseNormalizationHelper;
@@ -26,17 +32,20 @@
import org.nd4j.linalg.api.buffer.util.DataTypeUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.function.Consumer;
import org.nd4j.linalg.learning.config.NoOp;
import org.nd4j.linalg.lossfunctions.LossFunctions;

import java.lang.reflect.Field;
import java.util.Random;

import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

/**
* Created by Alex on 09/09/2016.
*/
@Slf4j
public class CuDNNGradientChecks extends BaseDL4JTest {

private static final boolean PRINT_RESULTS = true;
@@ -545,4 +554,81 @@ public void testCnnDilated() throws Exception {
}
}
}


@Test
public void testDropout() {
int minibatch = 3;

for (boolean cnn : new boolean[]{false, true}) {
Nd4j.getRandom().setSeed(12345);
IDropout dropout = new Dropout(0.6);

NeuralNetConfiguration.ListBuilder builder = new NeuralNetConfiguration.Builder()
.seed(12345)
.weightInit(WeightInit.DISTRIBUTION)
.dist(new NormalDistribution(0, 1))
.convolutionMode(ConvolutionMode.Same)
.dropOut(dropout)
.activation(Activation.TANH)
.updater(new NoOp())
.list();

if (cnn) {
builder.layer(new ConvolutionLayer.Builder().kernelSize(3, 3).stride(1, 1).nOut(3).build());
builder.layer(new ConvolutionLayer.Builder().kernelSize(3, 3).stride(1, 1).nOut(3).build());
builder.setInputType(InputType.convolutional(8, 8, 3));
} else {
builder.layer(new DenseLayer.Builder().nOut(12).build());
builder.layer(new DenseLayer.Builder().nOut(12).build());
builder.setInputType(InputType.feedForward(8));
}
builder.layer(new OutputLayer.Builder().nOut(10).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build());
MultiLayerConfiguration conf = builder.build();

MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();

for (Layer l : mln.getLayers()) {
Dropout d = (Dropout) l.conf().getLayer().getIDropout();
assertNotNull(d);
CudnnDropoutHelper h = (CudnnDropoutHelper) d.getHelper();
assertNotNull(h);
}

String msg = (cnn ? "CNN" : "Dense") + ": " + dropout.getClass().getSimpleName();

INDArray f;
if (cnn) {
f = Nd4j.rand(new int[]{minibatch, 3, 8, 8}).muli(10).subi(5);
} else {
f = Nd4j.rand(minibatch, 8).muli(10).subi(5);
}
INDArray l = TestUtils.randomOneHot(minibatch, 10);

//Consumer function to enforce CuDNN RNG repeatability - otherwise the gradient check will fail due to
// randomness (inconsistent dropout masks between forward passes)
Consumer<MultiLayerNetwork> c = new Consumer<MultiLayerNetwork>() {
@Override
public void accept(MultiLayerNetwork net) {
Nd4j.getRandom().setSeed(12345);
for(Layer l : net.getLayers()){
Dropout d = (Dropout) l.conf().getLayer().getIDropout();
if(d != null){
((CudnnDropoutHelper)d.getHelper()).setMask(null);
((CudnnDropoutHelper)d.getHelper()).setRngStates(null);
}
}
}
};

log.info("*** Starting test: " + msg + " ***");
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, f, l, null, null,
false, -1, null, c);

assertTrue(msg, gradOK);
TestUtils.testModelSerialization(mln);
}
}
}
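
(Editor's note, not GradientCheckUtil itself.) The gradient check compares each backprop gradient against a central-difference estimate, which evaluates the network score twice per parameter; that is why the Consumer above resets the RNG seed and clears the cached rngStates/mask, so every score evaluation sees the same dropout mask. A minimal sketch of the comparison, with hypothetical names:

public class CentralDifferenceSketch {
    //Network score (e.g. loss) as a function of a single parameter value
    interface Score { double at(double paramValue); }

    static boolean check(Score score, double param, double backpropGrad, double eps, double maxRelError) {
        double numericGrad = (score.at(param + eps) - score.at(param - eps)) / (2.0 * eps);
        double denom = Math.abs(numericGrad) + Math.abs(backpropGrad);
        double relError = denom == 0.0 ? 0.0 : Math.abs(numericGrad - backpropGrad) / denom;
        return relError < maxRelError;   //compare against a threshold such as DEFAULT_MAX_REL_ERROR
    }
}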