chainer · okuta · Oct 14, 2017 · Aug 24, 2017 · Aug 27, 2017 · Sep 4, 2017
diff --git a/chainer/cuda.py b/chainer/cuda.py
@@ -490,3 +490,19 @@ def set_max_workspace_size(size):
     """
     global _max_workspace_size
     _max_workspace_size = size
+
+
+def fuse(*args, **kwargs):
+    """Function fusing decorator.
+
+    It calls :func:`cupy.fuse` when CuPy is available to make fused function
+    and does nothing otherwise.
+
+    .. seealso::
+       :func:`cupy.fuse`
+
+    """
+    if available:
+        return cupy.fuse(*args, **kwargs)
+    else:
+        return lambda f: f
diff --git a/chainer/functions/activation/lstm.py b/chainer/functions/activation/lstm.py
@@ -180,13 +180,18 @@ def backward(self, inputs, grads):
         c_prev, x, c, gc, gh = inputs
         ggc_prev, ggx = grads
 
-        gc_prev = xp.zeros_like(c_prev)
-        gx = xp.zeros_like(x)
-        gc_next = xp.zeros_like(c)
-        ggc = ggc_prev.copy()
-        ggh = xp.zeros_like(gh)
+        gc_prev = xp.empty_like(c_prev)
+        gx = xp.empty_like(x)
+        gc_next = xp.empty_like(c)
+        ggc = xp.empty_like(ggc_prev)
+        ggh = xp.empty_like(gh)
 
         batch = len(x)
+        gc_prev[batch:] = 0
+        gc_next[batch:] = 0
+        ggc[batch:] = ggc_prev[batch:]
+        ggh[batch:] = 0
+
         c_prev = c_prev[:batch]
         c = c[:batch]
         gc = gc[:batch]
@@ -195,51 +200,63 @@ def backward(self, inputs, grads):
 
         a, i, f, o = _extract_gates(x)
         gga, ggi, ggf, ggo = _extract_gates(ggx)
-
         ga, gi, gf, go = _extract_gates(gx)
 
-        sig_o = _sigmoid(o, xp)
-        gsig_o = _grad_sigmoid(sig_o)
-        ggsig_o = _grad_grad_sigmoid(sig_o)
-        sig_i = _sigmoid(i, xp)
-        gsig_i = _grad_sigmoid(sig_i)
-        ggsig_i = _grad_grad_sigmoid(sig_i)
-        sig_f = _sigmoid(f, xp)
-        gsig_f = _grad_sigmoid(sig_f)
-        ggsig_f = _grad_grad_sigmoid(sig_f)
-        tanh_a = xp.tanh(a)
-        gtanh_a = _grad_tanh(tanh_a)
-        ggtanh_a = _grad_grad_tanh(tanh_a, gtanh_a)
-        tanh_c = xp.tanh(c)
-        gtanh_c = _grad_tanh(tanh_c)
-        ggtanh_c = _grad_grad_tanh(tanh_c, gtanh_c)
-
-        gc_bar = gh * sig_o * gtanh_c + gc
-
-        gc_prev[:batch] = ggf * gc_bar * gsig_f
-        ga[:] = (gga * sig_i * ggtanh_a +
-                 ggi * gtanh_a * gsig_i) * gc_bar
-        gi[:] = (gga * gtanh_a * gsig_i +
-                 ggi * tanh_a * ggsig_i) * gc_bar
-        gf[:] = (ggc_prev * (gh * sig_o * gtanh_c + gc) * gsig_f +
-                 ggf * gc_bar * c_prev[:batch] * ggsig_f)
-
-        ggc_this = (
-            ggc_prev * sig_f +
-            gga * sig_i * gtanh_a +
-            ggi * tanh_a * gsig_i +
-            ggf * c_prev[:batch] * gsig_f)
-        ggc[:batch] = ggc_this
-
-        dgc_do = gh * gsig_o * gtanh_c
-        go[:] = ggc_this * dgc_do + ggo * gh * tanh_c * ggsig_o
-        dgc_dc = gh * sig_o * ggtanh_c
-        gc_next[:batch] = ggc_this * dgc_dc + ggo * gh * gtanh_c * gsig_o
-        ggh[:batch] = ggc_this * sig_o * gtanh_c + ggo * tanh_c * gsig_o
-
+        gc_prev[:batch], ga[:], gi[:], gf[:], go[:], gc_next[:batch], \
+            ggc[:batch], ggh[:batch] \
+            = lstm_grad_grad(
+                c_prev, a, i, f, o, c, gc, gh, ggc_prev, gga, ggi, ggf, ggo)
         return gc_prev, gx, gc_next, ggc, ggh
 
 
+def _cupy_sigmoid(x):
+    half = x.dtype.type(0.5)
+    return cuda.fusion.tanh(x * half) * half + half
+
+
+@cuda.fuse()
+def lstm_grad_grad(
+        c_prev, a, i, f, o, c, gc, gh, ggc_prev, gga, ggi, ggf, ggo):
+    sig_o = _cupy_sigmoid(o)
+    gsig_o = _grad_sigmoid(sig_o)
+    ggsig_o = _grad_grad_sigmoid(sig_o)
+    sig_i = _cupy_sigmoid(i)
+    gsig_i = _grad_sigmoid(sig_i)
+    ggsig_i = _grad_grad_sigmoid(sig_i)
+    sig_f = _cupy_sigmoid(f)
+    gsig_f = _grad_sigmoid(sig_f)
+    ggsig_f = _grad_grad_sigmoid(sig_f)
+    tanh_a = cuda.fusion.tanh(a)
+    gtanh_a = _grad_tanh(tanh_a)
+    ggtanh_a = _grad_grad_tanh(tanh_a, gtanh_a)
+    tanh_c = cuda.fusion.tanh(c)
+    gtanh_c = _grad_tanh(tanh_c)
+    ggtanh_c = _grad_grad_tanh(tanh_c, gtanh_c)
+
+    gc_bar = gh * sig_o * gtanh_c + gc
+
+    gc_prev = ggf * gc_bar * gsig_f
+    ga = (gga * sig_i * ggtanh_a +
+          ggi * gtanh_a * gsig_i) * gc_bar
+    gi = (gga * gtanh_a * gsig_i +
+          ggi * tanh_a * ggsig_i) * gc_bar
+    gf = (ggc_prev * (gh * sig_o * gtanh_c + gc) * gsig_f +
+          ggf * gc_bar * c_prev * ggsig_f)
+
+    ggc = (
+        ggc_prev * sig_f +
+        gga * sig_i * gtanh_a +
+        ggi * tanh_a * gsig_i +
+        ggf * c_prev * gsig_f)
+
+    dgc_do = gh * gsig_o * gtanh_c
+    go = ggc * dgc_do + ggo * gh * tanh_c * ggsig_o
+    dgc_dc = gh * sig_o * ggtanh_c
+    gc_next = ggc * dgc_dc + ggo * gh * gtanh_c * gsig_o
+    ggh = ggc * sig_o * gtanh_c + ggo * tanh_c * gsig_o
+    return gc_prev, ga, gi, gf, go, gc_next, ggc, ggh
+
+
 def lstm(c_prev, x):
     """Long Short-Term Memory units as an activation function.