RNN: fix bug for parameter gradient in a case when SumOp is the first op of the cell
Summary:
The issue is that AliasOp does not interact well with the swaps we do between
param.grad and param.accGrad: the two tensors end up being the same object if
the gradient tensor inside the backward cell net's local workspace is not
reallocated.

Bug explanation from akyrola:

```
gpu_0/decoder/decoder_hidden_encoder_outputs_sum_grad: tensor A

on each timestep, going back to 0, we Alias
gpu_0/decoder/weighted_encoder_outputs_grad,
so then also

gpu_0/decoder/weighted_encoder_outputs_grad: tensor A

Its acc blob is:
gpu_0/decoder/weighted_encoder_outputs_grad_acc: tensor B

Now, after all timesteps, we swap (line 626) the grad with its _acc to get

gpu_0/decoder/weighted_encoder_outputs_grad: tensor B

gpu_0/decoder/weighted_encoder_outputs_grad_acc: tensor A

OPTION A -- batch size is the same as before or smaller:
Then on the next iteration, we again Alias to
gpu_0/decoder/decoder_hidden_encoder_outputs_sum_grad, so now

gpu_0/decoder/weighted_encoder_outputs_grad: tensor A

and also

gpu_0/decoder/weighted_encoder_outputs_grad_acc: tensor A

swapping them does nothing, and they stay the same tensor

OPTION B -- batch size increases
gpu_0/decoder/decoder_hidden_encoder_outputs_sum_grad is reallocated,
becomes tensor C

gpu_0/decoder/weighted_encoder_outputs_grad becomes tensor C with
Alias

gpu_0/decoder/weighted_encoder_outputs_grad_acc: tensor A

```
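
The failure mode is easier to see with plain references. Below is a minimal, hypothetical Python sketch: NumPy arrays stand in for Caffe2 tensors and Python names stand in for workspace blobs. It is not Caffe2 code, just an illustration of why the swap becomes a no-op when the aliased tensor is not reallocated.

```python
import numpy as np

# Hypothetical stand-ins: arrays play the role of tensors, Python names the
# role of workspace blobs. "Alias" is modeled as binding a name to the same
# underlying object instead of copying it.
sum_grad = np.zeros(4)       # tensor A: ..._hidden_encoder_outputs_sum_grad
grad = sum_grad              # Alias: weighted_encoder_outputs_grad -> tensor A
acc_grad = np.zeros(4)       # tensor B: weighted_encoder_outputs_grad_acc

# At the end of the RNN op the gradient and its accumulator are swapped.
grad, acc_grad = acc_grad, grad   # grad -> B, acc_grad -> A

# Next iteration with the same (or smaller) batch size: sum_grad is not
# reallocated, so the Alias binds grad back to tensor A...
grad = sum_grad

# ...which is exactly the object acc_grad already refers to. From here on,
# swapping grad and acc_grad changes nothing: they are the same tensor.
assert grad is acc_grad
```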

Reviewed By: urikz

Differential Revision: D4946730

Tags: rnn, caffe2

fbshipit-source-id: b52d63cb238b81d2ad40e05e70deb32a81336f47
salexspb authored and facebook-github-bot committed Apr 26, 2017
1 parent 7c10fe2 commit 8e01e4c
Showing 2 changed files with 23 additions and 19 deletions.
18 changes: 17 additions & 1 deletion caffe2/operators/recurrent_network_op.h
@@ -14,6 +14,7 @@ struct Param {
   std::string param;
   std::string grad;
   std::string accGrad;
+  std::string cellGradient;
 };
 
 struct RecurrentInput {
@@ -370,12 +371,20 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
   std::vector<detail::Param> constructParams() {
     std::vector<detail::Param> params;
     const auto& param = OperatorBase::GetRepeatedArgument<int32_t>("param");
+    const auto& param_grads =
+        OperatorBase::GetRepeatedArgument<string>("param_grads");
+    CAFFE_ENFORCE(
+        param_grads.empty() || param_grads.size() == param.size(),
+        param.size(),
+        " != ",
+        param_grads.size());
     for (int i = 0; i < param.size(); ++i) {
       detail::Param p;
       // Forward inputs come after [outputs_with_grads] gradient inputs
       p.param = def().input(param[i] + gradInputs_.size());
       // See GetRecurrentNetworkGradient to understand the offsetting here
       p.grad = def().output(i + numSequences_);
+      p.cellGradient = param_grads.empty() ? "" : param_grads[i];
       p.accGrad = p.grad + "_acc";
       params.push_back(p);
     }
@@ -509,7 +518,14 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
 
     auto accumulateParameterGradients = [&]() {
       for (const auto& param : params_) {
-        auto gBlob = sharedWs_->GetBlob(param.grad);
+        // If the user passes in a param_grads mapping, we can copy directly
+        // from the blob that the backward cell net wrote its data to.
+        // This comes in handy when the gradient produced by the cell net
+        // is an internal blob of the backward cell. This happens, for example,
+        // when SumOp is the first op of the cell.
+        auto gBlob = param.cellGradient.empty()
+            ? sharedWs_->GetBlob(param.grad)
+            : localWs_.GetBlob(param.cellGradient);
         CAFFE_ENFORCE(gBlob);
         const auto& g = gBlob->template Get<Tensor<Context>>();
 
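
To make the intent of the ternary above concrete, here is a hedged, dict-based Python sketch of the accumulation step; `shared_ws` and `local_ws` are plain dictionaries standing in for Caffe2 workspaces, not the real Workspace API.

```python
import numpy as np

def accumulate_param_gradients(params, shared_ws, local_ws):
    """Hypothetical sketch of accumulateParameterGradients; dicts of NumPy
    arrays stand in for Caffe2 workspaces and tensors."""
    for p in params:
        if p["cellGradient"]:
            # With the fix: a param_grads mapping was supplied, so read the
            # per-timestep gradient straight from the backward cell's local
            # workspace (e.g. the internal output of a leading SumOp).
            g = local_ws[p["cellGradient"]]
        else:
            # Old behaviour: rely on the blob in the shared workspace, which
            # may be a fragile Alias of a cell-internal tensor.
            g = shared_ws[p["grad"]]
        # Accumulate across timesteps into the _acc blob.
        shared_ws[p["accGrad"]] += g
```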
24 changes: 6 additions & 18 deletions caffe2/python/recurrent.py
@@ -184,23 +184,6 @@ def s(name):
         forward_links.append((str(input_t), str(input_blob), 0))
 
     if backward_cell_net is not None:
-        for reference in references:
-            # Similar to above, in the case of a SumOp we need to write our
-            # parameter gradient to an external blob. In this case we can be
-            # sure that reference + "_grad" is a correct parameter name, as we
-            # know what the RecurrentNetworkOp gradient schema looks like.
-            reference_grad = reference + "_grad"
-            if (reference in backward_mapping and
-                    reference_grad != str(backward_mapping[reference])):
-                # We can use an Alias because after each timestep the
-                # RNN op adds the value from reference_grad into an _acc blob,
-                # which accumulates gradients for the corresponding parameter
-                # across timesteps. Then at the end of the RNN op these two are
-                # swapped and the reference_grad blob becomes a real blob
-                # instead of being an alias.
-                backward_cell_net.Alias(
-                    backward_mapping[reference], reference_grad)
-
         for input_t, input_blob in inputs:
             backward_links.append((
                 backward_mapping[str(input_t)], str(input_blob) + "_grad", 0
@@ -228,14 +211,19 @@ def unpack_triple(x):
     backward_link_internal, backward_link_external, backward_link_offset = \
         unpack_triple(backward_links)
     params = [x for x in references if x in backward_mapping.keys()]
+    param_grads = [str(backward_mapping[x])
+                   for x in references
+                   if x in backward_mapping.keys()]
     backward_args = {
         'param': map(all_inputs.index, params),
         'backward_link_internal': map(str, backward_link_internal),
         'backward_link_external': map(str, backward_link_external),
         'backward_link_offset': backward_link_offset,
         'backward_step_net': str(backward_cell_net.Proto()),
         'outputs_with_grads': outputs_with_grads,
-        'recompute_blobs_on_backward': map(str, recompute_blobs_on_backward)
+        'recompute_blobs_on_backward': map(
+            str, recompute_blobs_on_backward),
+        'param_grads': param_grads,
     }
 
     results = net.RecurrentNetwork(
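
The `param_grads` list added above is aligned index-by-index with `params`, since both are filtered from `references` with the same predicate; the C++ side's CAFFE_ENFORCE checks that the sizes match. A small illustrative example with made-up blob names:

```python
# Illustrative only: made-up blob names showing how params and param_grads
# stay aligned because both comprehensions filter `references` identically.
references = ["encoder/w", "decoder/b", "decoder/encoder_outputs"]
backward_mapping = {
    "encoder/w": "encoder/w_grad",
    "decoder/b": "decoder/b_grad_internal",  # cell-internal gradient blob
}

params = [x for x in references if x in backward_mapping.keys()]
param_grads = [str(backward_mapping[x])
               for x in references
               if x in backward_mapping.keys()]

assert len(params) == len(param_grads)
# params      == ['encoder/w', 'decoder/b']
# param_grads == ['encoder/w_grad', 'decoder/b_grad_internal']
```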
