diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 6d035a8637bf..000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,164 +0,0 @@ -version: 2 -jobs: - build: - docker: - - image: ubuntu:xenial - working_directory: ~/work - steps: - - - run: - name: Preparation - command: | - echo 'export WORK_DIR=~/work' >> $BASH_ENV - echo 'export REPO_DIR=~/repo' >> $BASH_ENV - echo 'export RUN_STEP="bash $REPO_DIR/chainerx_cc/scripts/ci/run-step.sh"' >> $BASH_ENV - echo 'export CHAINERX_DIR="$REPO_DIR"/chainerx_cc' >> $BASH_ENV - - # To avoid warning on checkout - apt-get update - apt-get install -y git openssh-client - - # Checkout - - checkout: - path: ~/repo - - # Setup - - run: - name: Setup - command: | - $RUN_STEP setup - - # Checkout postprocess - - run: - name: Checkout postprocess - command: | - pushd "$REPO_DIR" - - if [[ -n "${CIRCLE_PULL_REQUEST}" ]] - then - # CIRCLE_PR_NUMBER is available only if PR is created from a fork (unavailable if created from a branch). - # So, manually construct it from CIRCLE_PULL_REQUEST environment variable. - CIRCLE_PR_NUMBER=$(basename "${CIRCLE_PULL_REQUEST}") - - # Update PR refs for testing. - FETCH_REFS="+master:master" - FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" - FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" - - # Retrieve the refs - echo "git fetch -u origin ${FETCH_REFS}" - git fetch -u origin ${FETCH_REFS} - - # Checkout master and merge PR head ref. If conflicts occur, exit with non-zero. - echo "git checkout master" - git checkout master - git config user.name 'chainerx' # need to configure something to git merge - git config user.email 'chainerx@chainerx.org' - echo "git merge --no-commit \"pr/${CIRCLE_PR_NUMBER}/head\"" - git merge --no-commit "pr/${CIRCLE_PR_NUMBER}/head" - fi - - popd - - # Setup conda - - run: - name: Setup conda - command: | - $RUN_STEP setup_conda - - # Environment info - - run: - name: Environment info - command: | - $RUN_STEP show_environment_info - - # Setup conda environment - - run: - name: Setup conda environment - command: | - $RUN_STEP setup_conda_environment - - # Python PEP-8 check - - run: - name: Python PEP-8 check - command: | - $RUN_STEP python_style_check - - # ClangFormat - - run: - name: ClangFormat - command: | - $RUN_STEP clang_format - - # cpplint - - run: - name: cpplint - command: | - $RUN_STEP cpplint - - # Setup openblas - - run: - name: Setup openblas - command: | - $RUN_STEP setup_openblas - - # cmake - - run: - name: cmake - command: | - $RUN_STEP cmake - - # clang-tidy (normal source files) - # NOTE: We runs this only on master branch because it takes much time. - - run: - name: clang-tidy (normal source files) - command: | - if [[ "$CIRCLE_BRANCH" = "master" ]]; then - $RUN_STEP clang_tidy normal - fi - - # clang-tidy (test source files) - # NOTE: We runs this only on master branch because it takes much time. 
- - run: - name: clang-tidy (test source files) - command: | - if [[ "$CIRCLE_BRANCH" = "master" ]]; then - $RUN_STEP clang_tidy test - fi - - # make - - run: - name: make - command: | - MAKEFLAGS=-j2 $RUN_STEP make - - # make install - - run: - name: make install - command: | - $RUN_STEP make_install - - # C++ test - - run: - name: C++ test - command: | - $RUN_STEP ctest - - # Python build - - run: - name: Python build - command: | - MAKEFLAGS=-j2 $RUN_STEP python_build - - # Python test - - run: - name: Python test - command: | - $RUN_STEP python_test_chainerx_nocuda - -experimental: - notify: - branches: - only: - - nightly - - master diff --git a/.gitignore b/.gitignore index d135c878c091..1e1c96be4b6b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,6 @@ build .eggs/ _readthedocs_build /TAGS -/docs/source/reference/**/generated /tags chainer.egg-info/ dist/ @@ -25,6 +24,4 @@ docs/my.state docs/my_mnist.model docs/mnist_result docs/*.png -docs/source/reference/core -docs/source/reference/generated -docs/source/reference/util +/docs/source/**/reference/**/generated diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000000..293d17ce0655 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,7 @@ +# Chainer Code of Conduct + +Chainer follows the [NumFOCUS Code of Conduct][homepage] available at https://numfocus.org/code-of-conduct. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at chainer@preferred.jp. + +[homepage]: https://numfocus.org/ diff --git a/chainer/backends/cuda.py b/chainer/backends/cuda.py index 4eb918d6f5ba..93244bb6f0f3 100644 --- a/chainer/backends/cuda.py +++ b/chainer/backends/cuda.py @@ -312,7 +312,7 @@ def get_device(*args): .. note:: - This API is deprecated. Please use + This API is deprecated since v3.0.0. Please use :func:`~chainer.backends.cuda.get_device_from_id` or :func:`~chainer.backends.cuda.get_device_from_array` instead. 
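The `chainer/backends/cuda.py` hunk above clarifies that `get_device` has been deprecated since v3.0.0 in favour of `get_device_from_id` and `get_device_from_array`. A minimal migration sketch, assuming a CuPy-enabled environment with at least one GPU (device id 0 and the array shape are arbitrary):

```python
import numpy as np
from chainer.backends import cuda

# Deprecated form:
#     dev = cuda.get_device(0)
# Preferred replacements:
dev = cuda.get_device_from_id(0)             # look a device up by integer id
x_gpu = cuda.to_gpu(np.zeros((2, 3), dtype=np.float32), device=0)
arr_dev = cuda.get_device_from_array(x_gpu)  # infer the device an array lives on

with arr_dev:
    y_gpu = x_gpu * 2  # computation runs on the device the array belongs to
```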
@@ -716,6 +716,8 @@ def fuse(*args, **kwargs): """ if available: return cupy.fuse(*args, **kwargs) + elif len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + return args[0] else: return lambda f: f diff --git a/chainer/dataset/convert.py b/chainer/dataset/convert.py index 50358ac057ae..1b0404a9237d 100644 --- a/chainer/dataset/convert.py +++ b/chainer/dataset/convert.py @@ -6,7 +6,6 @@ import chainer from chainer import backend from chainer.backends import cuda -from chainer import utils def to_device(device, x): @@ -192,7 +191,7 @@ def _concat_arrays_with_padding(arrays, padding): for i in six.moves.range(len(arrays)): src = arrays[i] slices = tuple(slice(dim) for dim in src.shape) - utils._setitem(result, (i,) + slices, src) + result[(i,) + slices] = src return result diff --git a/chainer/distributions/__init__.py b/chainer/distributions/__init__.py index b25263099aa9..db73f2bd2953 100644 --- a/chainer/distributions/__init__.py +++ b/chainer/distributions/__init__.py @@ -3,10 +3,12 @@ from chainer.distributions.bernoulli import Bernoulli # NOQA from chainer.distributions.beta import Beta # NOQA from chainer.distributions.categorical import Categorical # NOQA +from chainer.distributions.cauchy import Cauchy # NOQA from chainer.distributions.chisquare import Chisquare # NOQA from chainer.distributions.dirichlet import Dirichlet # NOQA from chainer.distributions.exponential import Exponential # NOQA from chainer.distributions.gamma import Gamma # NOQA +from chainer.distributions.geometric import Geometric # NOQA from chainer.distributions.gumbel import Gumbel # NOQA from chainer.distributions.laplace import Laplace # NOQA from chainer.distributions.log_normal import LogNormal # NOQA diff --git a/chainer/distributions/cauchy.py b/chainer/distributions/cauchy.py new file mode 100644 index 000000000000..c9493025ae30 --- /dev/null +++ b/chainer/distributions/cauchy.py @@ -0,0 +1,98 @@ +import warnings + +import numpy + +import chainer +from chainer.backends import cuda +from chainer import distribution +from chainer.functions.math import exponential +from chainer.functions.math import trigonometric + + +def _cauchy_icdf(x): + x = chainer.as_variable(x) + h = (x - 0.5) * numpy.pi + y = chainer.functions.tan(h) + return y + + +class Cauchy(distribution.Distribution): + + """Cauchy Distribution. + + The probability density function of the distribution is expressed as + + .. math:: + p(x;x_0,\\gamma) = \\frac{1}{\\pi}\\frac{\\gamma}{(x-x_0)^2+\\gamma^2} + + Args: + loc(:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ + :class:`cupy.ndarray`): Parameter of distribution representing the \ + location :math:`\\x_0`. + scale(:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ + :class:`cupy.ndarray`): Parameter of distribution representing the \ + scale :math:`\\gamma`. 
+ """ + + def __init__(self, loc, scale): + super(Cauchy, self).__init__() + self.loc = chainer.as_variable(loc) + self.scale = chainer.as_variable(scale) + + @property + def batch_shape(self): + return self.loc.shape + + def cdf(self, x): + return 1 / numpy.pi * trigonometric.arctan( + (x - self.loc) / self.scale) + 0.5 + + @property + def entropy(self): + return exponential.log(4 * numpy.pi * self.scale) + + @property + def event_shape(self): + return () + + def icdf(self, x): + return self.loc + self.scale * _cauchy_icdf(x) + + @property + def _is_gpu(self): + return isinstance(self.loc.data, cuda.ndarray) + + def log_prob(self, x): + return - numpy.log(numpy.pi) + exponential.log(self.scale) \ + - exponential.log((x - self.loc)**2 + self.scale**2) + + @property + def mean(self): + warnings.warn("Mean of the cauchy distribution is undefined.", + RuntimeWarning) + xp = cuda.get_array_module(self.loc) + return chainer.as_variable(xp.full_like(self.loc.data, xp.nan)) + + def sample_n(self, n): + xp = cuda.get_array_module(self.loc) + if xp is cuda.cupy: + eps = xp.random.standard_cauchy( + (n,)+self.loc.shape, dtype=self.loc.dtype) + else: + eps = xp.random.standard_cauchy( + (n,)+self.loc.shape).astype(self.loc.dtype) + + noise = self.scale * eps + self.loc + + return noise + + @property + def support(self): + return 'real' + + @property + def variance(self): + warnings.warn("Variance of the cauchy distribution is undefined.", + RuntimeWarning) + xp = cuda.get_array_module(self.loc) + return chainer.as_variable(xp.full_like(self.loc.data, xp.nan)) diff --git a/chainer/distributions/geometric.py b/chainer/distributions/geometric.py new file mode 100644 index 000000000000..481006070498 --- /dev/null +++ b/chainer/distributions/geometric.py @@ -0,0 +1,74 @@ +import chainer +from chainer.backends import cuda +from chainer import distribution +from chainer.functions.math import exponential + + +class Geometric(distribution.Distribution): + + """Geometric Distribution. + + The probability mass function of the distribution is expressed as + + .. math:: + Pr(x = k) = p(1-p)^{k-1}, + for k = 1, 2, 3, ..., + + Args: + p(:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ + :class:`cupy.ndarray`): Parameter of distribution. 
+ """ + + def __init__(self, p): + super(Geometric, self).__init__() + self.__p = chainer.as_variable(p) + + @property + def p(self): + return self.__p + + @property + def batch_shape(self): + return self.p.shape + + @property + def event_shape(self): + return () + + @property + def _is_gpu(self): + return isinstance(self.p.data, cuda.ndarray) + + def log_prob(self, x): + return (x - 1) * exponential.log(1 - self.p) + exponential.log(self.p) + + @property + def mean(self): + return 1 / self.p + + def sample_n(self, n): + xp = cuda.get_array_module(self.p) + if xp is cuda.cupy: + eps = xp.random.geometric( + self.p.data, + size=(n,)+self.batch_shape, dtype=self.p.dtype) + else: + eps = xp.random.geometric( + self.p.data, + size=(n,)+self.batch_shape).astype(self.p.dtype) + return chainer.Variable(eps) + + @property + def support(self): + return 'positive integer' + + @property + def variance(self): + return (1 - self.p) / self.p ** 2 + + +@distribution.register_kl(Geometric, Geometric) +def _kl_geometric_geometric(dist1, dist2): + return (1 / dist1.p - 1) \ + * (exponential.log(1 - dist1.p) - exponential.log(1 - dist2.p)) \ + + exponential.log(dist1.p) - exponential.log(dist2.p) diff --git a/chainer/function.py b/chainer/function.py index 9f38812d3170..4811bc627d22 100644 --- a/chainer/function.py +++ b/chainer/function.py @@ -161,10 +161,8 @@ def backward(self, target_input_indexes, grad_outputs): with cuda.get_device_from_array(*(in_data + grad_out_data)): gxs = self._function.backward(in_data, grad_out_data) - for x, gx in six.moves.zip(inputs, gxs): - if x is None: - continue - variable._check_grad_type(self, x, gx) + for x, gx in six.moves.zip(self.inputs, gxs): + variable._check_grad_type(self, x, True, gx, False) # Convert input gradients back to ChainerX if xp is chainerx: diff --git a/chainer/functions/activation/crelu.py b/chainer/functions/activation/crelu.py index faa22407b490..ace38ee9d5cc 100644 --- a/chainer/functions/activation/crelu.py +++ b/chainer/functions/activation/crelu.py @@ -57,8 +57,7 @@ def crelu(x, axis=1): See: https://arxiv.org/abs/1603.05201 Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. axis (int): Axis that the output values are concatenated along. Default is 1. diff --git a/chainer/functions/activation/elu.py b/chainer/functions/activation/elu.py index 1eb329769a40..17242ca1ee84 100644 --- a/chainer/functions/activation/elu.py +++ b/chainer/functions/activation/elu.py @@ -92,8 +92,7 @@ def elu(x, alpha=1.0): See: https://arxiv.org/abs/1511.07289 Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. alpha (float): Parameter :math:`\\alpha`. Default is 1.0. diff --git a/chainer/functions/activation/hard_sigmoid.py b/chainer/functions/activation/hard_sigmoid.py index 1eedd87a0921..8c139f064407 100644 --- a/chainer/functions/activation/hard_sigmoid.py +++ b/chainer/functions/activation/hard_sigmoid.py @@ -82,8 +82,7 @@ def hard_sigmoid(x): \\end{array} \\right. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. 
Returns: diff --git a/chainer/functions/activation/leaky_relu.py b/chainer/functions/activation/leaky_relu.py index e69aceb34307..f71dc67f643b 100644 --- a/chainer/functions/activation/leaky_relu.py +++ b/chainer/functions/activation/leaky_relu.py @@ -132,8 +132,7 @@ def leaky_relu(x, slope=0.2): where :math:`a` is a configurable slope value. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. slope (float): Slope value :math:`a`. diff --git a/chainer/functions/activation/log_softmax.py b/chainer/functions/activation/log_softmax.py index c52a58a257be..a77d0d25c18c 100644 --- a/chainer/functions/activation/log_softmax.py +++ b/chainer/functions/activation/log_softmax.py @@ -124,8 +124,7 @@ def log_softmax(x, axis=1): ``log_softmax`` method is more stable. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`n`-dimensional (:math:`n \\geq 2`) float array. axis (int): The axis along which the softmax is to be computed. diff --git a/chainer/functions/activation/lstm.py b/chainer/functions/activation/lstm.py index e239173ecea4..9611d53a682b 100644 --- a/chainer/functions/activation/lstm.py +++ b/chainer/functions/activation/lstm.py @@ -305,12 +305,10 @@ def lstm(c_prev, x): applying the function. Args: - c_prev (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + c_prev (:class:`~chainer.Variable` or :ref:`ndarray`): Variable that holds the previous cell state. The cell state should be a zero array or the output of the previous call of LSTM. - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Variable that holds the sources of cell input, input gate, forget gate and output gate. It must have the second dimension whose size is four times of that of the cell state. diff --git a/chainer/functions/activation/maxout.py b/chainer/functions/activation/maxout.py index 7c2199b35b95..bb34b531794d 100644 --- a/chainer/functions/activation/maxout.py +++ b/chainer/functions/activation/maxout.py @@ -11,8 +11,7 @@ def maxout(x, pool_size, axis=1): ``(M, pool_size)``, and takes maximum along the ``axis`` dimension. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`n`-dimensional (:math:`n \\ge` ``axis``) float array. In general, its first dimension is assumed to be the *minibatch dimension*. The other dimensions are treated as one diff --git a/chainer/functions/activation/prelu.py b/chainer/functions/activation/prelu.py index cebda5cb62d8..3b731f68546b 100644 --- a/chainer/functions/activation/prelu.py +++ b/chainer/functions/activation/prelu.py @@ -162,9 +162,9 @@ def prelu(x, W): :math:`N` is an arbitrary non-negative integer. Args: - x (~chainer.Variable): Input variable. + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. Its first argument is assumed to be the minibatch dimension. - W (~chainer.Variable): Weight variable. + W (:class:`~chainer.Variable` or :ref:`ndarray`): Weight variable. 
Returns: ~chainer.Variable: Output variable diff --git a/chainer/functions/activation/relu.py b/chainer/functions/activation/relu.py index c3f4f9678687..012677f26491 100644 --- a/chainer/functions/activation/relu.py +++ b/chainer/functions/activation/relu.py @@ -162,8 +162,7 @@ def relu(x): .. math:: f(x)=\\max(0, x). Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. Returns: diff --git a/chainer/functions/activation/rrelu.py b/chainer/functions/activation/rrelu.py index ca7200749e46..bbc47780b67b 100644 --- a/chainer/functions/activation/rrelu.py +++ b/chainer/functions/activation/rrelu.py @@ -100,8 +100,7 @@ def rrelu(x, l=1. / 8, u=1. / 3, **kwargs): See: https://arxiv.org/pdf/1505.00853.pdf Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. l (float): The lower bound of the uniform distribution. u (float): The upper bound of the uniform distribution. diff --git a/chainer/functions/activation/selu.py b/chainer/functions/activation/selu.py index f4a152b36f31..3e3cd72a53fb 100644 --- a/chainer/functions/activation/selu.py +++ b/chainer/functions/activation/selu.py @@ -17,8 +17,7 @@ def selu(x, See: https://arxiv.org/abs/1706.02515 Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. alpha (float): Parameter :math:`\\alpha`. scale (float): Parameter :math:`\\lambda`. diff --git a/chainer/functions/activation/sigmoid.py b/chainer/functions/activation/sigmoid.py index 0c58f1b5515a..4910e5772005 100644 --- a/chainer/functions/activation/sigmoid.py +++ b/chainer/functions/activation/sigmoid.py @@ -96,8 +96,7 @@ def sigmoid(x): .. math:: f(x)=(1 + \\exp(-x))^{-1}. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. Returns: diff --git a/chainer/functions/activation/slstm.py b/chainer/functions/activation/slstm.py index ea5e0b1303ec..8509233a8113 100644 --- a/chainer/functions/activation/slstm.py +++ b/chainer/functions/activation/slstm.py @@ -359,23 +359,19 @@ def slstm(c_prev1, c_prev2, x1, x2): The function returns ``c`` and ``h`` as a tuple. Args: - c_prev1 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + c_prev1 (:class:`~chainer.Variable` or :ref:`ndarray`): Variable that holds the previous cell state of the first child node. The cell state should be a zero array or the output of the previous call of LSTM. - c_prev2 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + c_prev2 (:class:`~chainer.Variable` or :ref:`ndarray`): Variable that holds the previous cell state of the second child node. - x1 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x1 (:class:`~chainer.Variable` or :ref:`ndarray`): Variable that holds the sources of cell input, input gate, forget gate and output gate from the first child node. It must have the second dimension whose size is four times of that of the cell state. 
- x2 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x2 (:class:`~chainer.Variable` or :ref:`ndarray`): Variable that holds the input sources from the second child node. Returns: diff --git a/chainer/functions/activation/softmax.py b/chainer/functions/activation/softmax.py index 5fc2371cf51f..47784b3aa108 100644 --- a/chainer/functions/activation/softmax.py +++ b/chainer/functions/activation/softmax.py @@ -90,8 +90,7 @@ def softmax(x, axis=1): defined as :math:`f(c)={\\exp(c) \\over \\sum_{d} \\exp(c_d)}`. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`n`-dimensional (:math:`n \\geq 2`) float array. axis (int): The axis along which the softmax is to be computed. diff --git a/chainer/functions/activation/softplus.py b/chainer/functions/activation/softplus.py index ed7cf538680e..d8d31a7f1d41 100644 --- a/chainer/functions/activation/softplus.py +++ b/chainer/functions/activation/softplus.py @@ -97,8 +97,7 @@ def softplus(x, beta=1.0): and akin to ReLU as the :math:`\\beta` is increasing. Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. beta (float): Parameter :math:`\\beta`. diff --git a/chainer/functions/activation/swish.py b/chainer/functions/activation/swish.py index e79f762c9cd8..737ee618d408 100644 --- a/chainer/functions/activation/swish.py +++ b/chainer/functions/activation/swish.py @@ -165,14 +165,15 @@ def swish(x, beta): \\lim_{\\beta \\to \\infty} f(x, \\beta) &= \\max(0, x). Args: - x (~chainer.Variable): Input variable of shape :math:`(s_B, s_1, \ - s_2, ..., s_N)`, where :math:`s_B` is assumed to be the - *minibatch dimension*. - beta (~chainer.Variable): Parameter variable :math:`\\beta` of shape - :math:`(s_1, s_2, ..., s_M)`, where :math:`M` is an arbitrary - integer between :math:`0 \\leq M \\leq N`. The number of - dimensions of ``beta`` will be matched with ``x`` by reshaping it - as :math:`(1, s_1, ..., s_M, 1, ... 1)`, then ``beta`` and ``x`` + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable of + shape :math:`(s_B, s_1, s_2, ..., s_N)`, where :math:`s_B` is + assumed to be the *minibatch dimension*. + beta (:class:`~chainer.Variable` or :ref:`ndarray`): Parameter variable + :math:`\\beta` of shape :math:`(s_1, s_2, ..., s_M)`, where + :math:`M` is an arbitrary integer between + :math:`0 \\leq M \\leq N`. The number of dimensions of ``beta`` + will be matched with ``x`` by reshaping it as + :math:`(1, s_1, ..., s_M, 1, ... 1)`, then ``beta`` and ``x`` are multiplied together in an element-wise manner. Returns: diff --git a/chainer/functions/activation/tanh.py b/chainer/functions/activation/tanh.py index c4ca02eb178c..3828bf630fca 100644 --- a/chainer/functions/activation/tanh.py +++ b/chainer/functions/activation/tanh.py @@ -97,8 +97,7 @@ def tanh(x): .. math:: f(x)=\\tanh(x). Args: - x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ - :class:`cupy.ndarray`): + x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. A :math:`(s_1, s_2, ..., s_N)`-shaped float array. 
Returns: diff --git a/chainer/functions/activation/tree_lstm.py b/chainer/functions/activation/tree_lstm.py index 967202648b25..6a270d96ee29 100644 --- a/chainer/functions/activation/tree_lstm.py +++ b/chainer/functions/activation/tree_lstm.py @@ -233,7 +233,7 @@ def tree_lstm(*inputs): inputs (list of :class:`~chainer.Variable`): Variable arguments which include all cell vectors from child-nodes, and an input vector. Each of the cell vectors and the input vector is - :class:`~chainer.Variable`. + :class:`~chainer.Variable` or :ref:`ndarray`. The input vector must have the second dimension whose size is (N + 3) times of that of each cell, where N denotes the total number of cells. diff --git a/chainer/functions/array/spatial_transformer_sampler.py b/chainer/functions/array/spatial_transformer_sampler.py index 29ad51189042..79271491ed17 100644 --- a/chainer/functions/array/spatial_transformer_sampler.py +++ b/chainer/functions/array/spatial_transformer_sampler.py @@ -23,8 +23,8 @@ def check_type_forward(self, in_types): x_type = in_types[0] grid_type = in_types[1] type_check.expect( - x_type.dtype == numpy.float32, - grid_type.dtype == numpy.float32, + x_type.dtype.kind == 'f', + grid_type.dtype == x_type.dtype, x_type.ndim == 4, grid_type.ndim == 4, grid_type.shape[1] == 2, @@ -53,8 +53,9 @@ def forward_gpu(self, inputs): cuda.cupy.cudnn.create_spatial_transformer_descriptor( _sampler_type, grid.dtype, len(shape), shape.ctypes.data) - one = numpy.array(1, dtype=x.dtype).ctypes - zero = numpy.array(0, dtype=x.dtype).ctypes + dtype = numpy.float64 if x.dtype == numpy.float64 else numpy.float32 + one = numpy.array(1, dtype=dtype).ctypes + zero = numpy.array(0, dtype=dtype).ctypes libcudnn.spatialTfSamplerForward( handle, self.st_desc.value, one.data, x_desc.value, x.data.ptr, grid_t.data.ptr, zero.data, @@ -139,8 +140,9 @@ def backward_gpu(self, inputs, grad_outputs): dx_desc = cudnn.create_tensor_descriptor(gx) dy_desc = cudnn.create_tensor_descriptor(gy) - one = numpy.array(1, dtype=x.dtype).ctypes - zero = numpy.array(0, dtype=x.dtype).ctypes + dtype = numpy.float64 if x.dtype == numpy.float64 else numpy.float32 + one = numpy.array(1, dtype=dtype).ctypes + zero = numpy.array(0, dtype=dtype).ctypes libcudnn.spatialTfSamplerBackward( handle, self.st_desc.value, one.data, diff --git a/chainer/functions/connection/convolution_2d.py b/chainer/functions/connection/convolution_2d.py index 3366cd0381cd..dec147792b02 100644 --- a/chainer/functions/connection/convolution_2d.py +++ b/chainer/functions/connection/convolution_2d.py @@ -503,13 +503,6 @@ def convolution_2d(x, W, b=None, stride=1, pad=0, cover_all=False, **kwargs): When the dilation factor is greater than one, cuDNN is not used unless the version is 6.0 or higher. - .. warning:: - - ``deterministic`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('cudnn_deterministic', value)`` - (value is either ``True`` or ``False``). - See :func:`chainer.using_config`. - Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): diff --git a/chainer/functions/connection/deconvolution_2d.py b/chainer/functions/connection/deconvolution_2d.py index 198cef24465b..35437a8671e4 100644 --- a/chainer/functions/connection/deconvolution_2d.py +++ b/chainer/functions/connection/deconvolution_2d.py @@ -372,13 +372,6 @@ def deconvolution_2d(x, W, b=None, stride=1, pad=0, outsize=None, **kwargs): can provide a significant performance boost for fixed neural nets. 
To enable, set `chainer.using_config('autotune', True)` - .. warning:: - - ``deterministic`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('cudnn_deterministic', value)`` - (value is either ``True`` or ``False``). - See :func:`chainer.using_config`. - Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): diff --git a/chainer/functions/connection/n_step_gru.py b/chainer/functions/connection/n_step_gru.py index a75ea83cd4ac..5e2daffccd96 100644 --- a/chainer/functions/connection/n_step_gru.py +++ b/chainer/functions/connection/n_step_gru.py @@ -60,14 +60,6 @@ def n_step_gru( Note that all input variables except first layer may have different shape from the first layer. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. @@ -161,14 +153,6 @@ def n_step_bigru( Note that all input variables except first layer may have different shape from the first layer. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. @@ -230,14 +214,6 @@ def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, :func:`chainer.functions.n_step_gru`. This function's behavior depends on argument ``use_bi_direction``. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. diff --git a/chainer/functions/connection/n_step_lstm.py b/chainer/functions/connection/n_step_lstm.py index 07f19db0bd60..1522954869f6 100644 --- a/chainer/functions/connection/n_step_lstm.py +++ b/chainer/functions/connection/n_step_lstm.py @@ -69,14 +69,6 @@ def n_step_lstm( Note that all input variables except the first layer may have different shape from the first layer. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): The number of layers. dropout_ratio(float): Dropout ratio. @@ -230,14 +222,6 @@ def n_step_bilstm( Note that all input variables except the first layer may have different shape from the first layer. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): The number of layers. dropout_ratio(float): Dropout ratio. 
diff --git a/chainer/functions/connection/n_step_rnn.py b/chainer/functions/connection/n_step_rnn.py index 9759aa608865..d2de8df41001 100644 --- a/chainer/functions/connection/n_step_rnn.py +++ b/chainer/functions/connection/n_step_rnn.py @@ -25,32 +25,6 @@ _cudnn_version = libcudnn.getVersion() -class PointerArray(object): - - def __init__(self, lst, back_pointer): - self._value = numpy.array(lst, dtype=numpy.intp) - # Store back_pointer to prevent the GC removes the original variable - self._back_pointer = back_pointer - - @property - def data(self): - return self._value.ctypes.data - - -def _make_tensor_descriptor_array(xs): - """Make an array of pointers denoting pointers of tensor descriptors. - - """ - descs = [] - for x in xs: - if x.ndim < 3: - shape = x.shape + (1,) * (3 - x.ndim) - x = x.reshape(shape) - desc = cudnn.create_tensor_nd_descriptor(x) - descs.append(desc) - return PointerArray([d.value for d in descs], descs) - - if cuda.cudnn_enabled and _cudnn_version >= 5000: # Define RNN parameters using dict. _rnn_dirs = { @@ -296,85 +270,20 @@ def forward(self, inputs): if self.use_cell: # LSTM hx, cx, w, xs = inputs - cx = cuda.cupy.ascontiguousarray(cx) - cx_desc = cudnn.create_tensor_nd_descriptor(cx) - - cy = cuda.cupy.empty_like(cx) - cy_desc = cudnn.create_tensor_nd_descriptor(cy) - - cx_data_ptr = cx.data.ptr - cy_data_ptr = cy.data.ptr - - cx_desc_value = cx_desc.value - cy_desc_value = cy_desc.value else: # RNN, GRU hx, w, xs = inputs - cx = cy = None - cx_data_ptr = cy_data_ptr = 0 - cx_desc_value = cy_desc_value = 0 - - w = cuda.cupy.ascontiguousarray(w) - xs = cuda.cupy.ascontiguousarray(xs) - hx = cuda.cupy.ascontiguousarray(hx) - - length = len(self.lengths) - n_units = hx.shape[2] - - ys = cuda.cupy.empty( - (len(xs), n_units * self.rnn_direction), dtype=xs.dtype) - - handle = cudnn.get_handle() - self.handle = handle - - # TODO(unno): Make a wrapper method to avoid access _desc directly - rnn_desc = cudnn.create_rnn_descriptor( - n_units, self.n_layers, self.states._desc, - libcudnn.CUDNN_LINEAR_INPUT, self.rnn_dir, - self.rnn_mode, libcudnn.CUDNN_DATA_FLOAT) - self.rnn_desc = rnn_desc - - x_list = cuda.cupy.split(xs, self.sections[:-1]) - c_x_descs = _make_tensor_descriptor_array(x_list) - hx_desc = cudnn.create_tensor_nd_descriptor(hx) - - w_desc = cudnn.create_filter_descriptor(w) - - self.w_desc = w_desc - - y_list = cuda.cupy.split(ys, self.sections[:-1]) - c_y_descs = _make_tensor_descriptor_array(y_list) - hy = cuda.cupy.empty_like(hx) - hy_desc = cudnn.create_tensor_nd_descriptor(hy) - - work_size = libcudnn.getRNNWorkspaceSize( - handle, rnn_desc.value, length, c_x_descs.data) - workspace = cuda.cupy.empty((work_size,), dtype='b') - self.workspace = workspace + cx = None if not configuration.config.train: - libcudnn.RNNForwardInference( - handle, rnn_desc.value, length, - c_x_descs.data, xs.data.ptr, hx_desc.value, hx.data.ptr, - cx_desc_value, cx_data_ptr, w_desc.value, w.data.ptr, - c_y_descs.data, ys.data.ptr, hy_desc.value, hy.data.ptr, - cy_desc_value, cy_data_ptr, workspace.data.ptr, work_size) + hy, cy, ys = cudnn.rnn_forward_inference( + self.states, self.rnn_dir, self.rnn_mode, + hx, cx, w, xs, self.lengths) else: - reserve_size = libcudnn.getRNNTrainingReserveSize( - handle, rnn_desc.value, length, c_x_descs.data) - self.reserve_space = cuda.cupy.empty((reserve_size,), dtype='b') - libcudnn.RNNForwardTraining( - handle, rnn_desc.value, length, - c_x_descs.data, xs.data.ptr, hx_desc.value, hx.data.ptr, - cx_desc_value, cx_data_ptr, 
w_desc.value, w.data.ptr, - c_y_descs.data, ys.data.ptr, hy_desc.value, hy.data.ptr, - cy_desc_value, cy_data_ptr, - workspace.data.ptr, work_size, - self.reserve_space.data.ptr, reserve_size) - - self.c_y_descs = c_y_descs - self.c_x_descs = c_x_descs + self.reserve_space, hy, cy, ys = cudnn.rnn_forward_training( + self.states, self.rnn_dir, self.rnn_mode, + hx, cx, w, xs, self.lengths) if self.use_cell: # LSTM @@ -393,78 +302,27 @@ def backward(self, inputs, grads): if dcy is None: dcy = cuda.cupy.zeros_like(cx) - cx = cuda.cupy.ascontiguousarray(cx) - dcx = cuda.cupy.empty_like(cx) - - cx_desc = cudnn.create_tensor_nd_descriptor(cx) - dcx_desc = cudnn.create_tensor_nd_descriptor(dcx) - dcy_desc = cudnn.create_tensor_nd_descriptor(dcy) - - cx_data_ptr = cx.data.ptr - dcy_data_ptr = dcy.data.ptr - dcx_data_ptr = dcx.data.ptr - cx_desc_value = cx_desc.value - dcx_desc_value = dcx_desc.value - dcy_desc_value = dcy_desc.value else: # GRU, RNN hx, w, xs = inputs dhy, dys = grads - dcy = cx = dcx = None - cx_data_ptr = dcy_data_ptr = dcx_data_ptr = 0 - cx_desc_value = dcx_desc_value = dcy_desc_value = 0 + dcy = cx = None ys = self.output_data[-1] - xs = cuda.cupy.ascontiguousarray(xs) - hx = cuda.cupy.ascontiguousarray(hx) - if dhy is None: dhy = cuda.cupy.zeros_like(hx) - if dys is None: dys = cuda.cupy.zeros_like(ys) - length = len(self.lengths) + dhx, dcx, dxs = cudnn.rnn_backward_data( + self.states, self.rnn_dir, self.rnn_mode, + hx, cx, w, xs, ys, self.reserve_space, + dhy, dcy, dys, self.lengths) - dhx = cuda.cupy.empty_like(hx) - - hx_desc = cudnn.create_tensor_nd_descriptor(hx) - dhy_desc = cudnn.create_tensor_nd_descriptor(dhy) - - dy_list = cuda.cupy.split(dys, self.sections[:-1], 0) - c_dy_descs = _make_tensor_descriptor_array(dy_list) - - rnn_desc = self.rnn_desc - handle = self.handle - work_size = libcudnn.getRNNWorkspaceSize( - handle, rnn_desc.value, length, self.c_x_descs.data) - workspace = cuda.cupy.empty((work_size,), dtype='b') - - dhx_desc = cudnn.create_tensor_nd_descriptor(dhx) - - dxs = cuda.cupy.empty_like(xs) - dx_list = cuda.cupy.split(dxs, self.sections[:-1], 0) - c_dx_descs = _make_tensor_descriptor_array(dx_list) - - libcudnn.RNNBackwardData( - handle, rnn_desc.value, length, - self.c_y_descs.data, ys.data.ptr, - c_dy_descs.data, dys.data.ptr, dhy_desc.value, dhy.data.ptr, - dcy_desc_value, dcy_data_ptr, self.w_desc.value, w.data.ptr, - hx_desc.value, hx.data.ptr, cx_desc_value, cx_data_ptr, - c_dx_descs.data, dxs.data.ptr, dhx_desc.value, dhx.data.ptr, - dcx_desc_value, dcx_data_ptr, workspace.data.ptr, work_size, - self.reserve_space.data.ptr, self.reserve_space.size) - - dw = cuda.cupy.zeros_like(w) - dw_desc = cudnn.create_filter_descriptor(dw) - libcudnn.RNNBackwardWeights( - handle, rnn_desc.value, length, - self.c_x_descs.data, xs.data.ptr, - hx_desc.value, hx.data.ptr, self.c_y_descs.data, ys.data.ptr, - workspace.data.ptr, work_size, dw_desc.value, dw.data.ptr, - self.reserve_space.data.ptr, self.reserve_space.size) + dw = cudnn.rnn_backward_weights( + self.states, self.rnn_dir, self.rnn_mode, + xs, hx, ys, w, self.reserve_space, self.lengths) if self.use_cell: # LSTM @@ -542,14 +400,6 @@ def n_step_rnn( Note that all input variables except first layer may have different shape from the first layer. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. 
- Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. @@ -645,14 +495,6 @@ def n_step_birnn( Note that all input variables except first layer may have different shape from the first layer. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. @@ -721,14 +563,6 @@ def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. - .. warning:: - - ``train`` and ``use_cudnn`` arguments are not supported anymore since - v2. - Instead, use ``chainer.using_config('train', train)`` and - ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. - See :func:`chainer.using_config`. - Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. diff --git a/chainer/functions/loss/huber_loss.py b/chainer/functions/loss/huber_loss.py index 38801938c00d..21ae8a5caf83 100644 --- a/chainer/functions/loss/huber_loss.py +++ b/chainer/functions/loss/huber_loss.py @@ -48,8 +48,7 @@ def backward(self, indexes, grad_outputs): x0, x1 = self.get_retained_inputs() gy, = grad_outputs diff = x0 - x1 - # `functions.clip` only accepts float value. - delta = float(self.delta) + delta = self.delta gx = chainer.functions.clip(diff, -delta, delta) diff --git a/chainer/functions/math/clip.py b/chainer/functions/math/clip.py index a30382cd8425..2ba478b6eff1 100644 --- a/chainer/functions/math/clip.py +++ b/chainer/functions/math/clip.py @@ -10,12 +10,9 @@ class Clip(function_node.FunctionNode): """Clips (limits) elements of input variable.""" def __init__(self, x_min, x_max): - if not isinstance(x_min, float): - raise TypeError('x_min must be float value') - if not isinstance(x_max, float): - raise TypeError('x_max must be float value') - # x_min must be lesser than x_max. - assert x_min < x_max + # x_min must be less than x_max. + if x_min >= x_max: + raise ValueError('x_min must be less than x_max.') self.x_min = x_min self.x_max = x_max diff --git a/chainer/functions/noise/dropout.py b/chainer/functions/noise/dropout.py index 7fb735c92966..90ca01339cce 100644 --- a/chainer/functions/noise/dropout.py +++ b/chainer/functions/noise/dropout.py @@ -135,12 +135,6 @@ def dropout(x, ratio=.5, **kwargs): mode (i.e., ``chainer.config.train`` is set to ``False``), it does nothing and just returns ``x``. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', boolean)``. - See :func:`chainer.using_config`. - Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): diff --git a/chainer/functions/noise/zoneout.py b/chainer/functions/noise/zoneout.py index e17e9506ede4..35c94e93d5a8 100644 --- a/chainer/functions/noise/zoneout.py +++ b/chainer/functions/noise/zoneout.py @@ -50,12 +50,6 @@ def zoneout(h, x, ratio=.5, **kwargs): instead sets dropping element to their previous variable. In testing mode , it does nothing and just returns ``x``. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. - See :func:`chainer.using_config`. - Args: h (~chainer.Variable): Previous variable. x (~chainer.Variable): Input variable. 
diff --git a/chainer/functions/normalization/batch_normalization.py b/chainer/functions/normalization/batch_normalization.py index 24c8a1894365..3e04470114c0 100644 --- a/chainer/functions/normalization/batch_normalization.py +++ b/chainer/functions/normalization/batch_normalization.py @@ -8,7 +8,6 @@ from chainer.backends import intel64 from chainer import configuration from chainer import function_node -from chainer import utils from chainer.utils import argument from chainer.utils import collections_abc from chainer.utils import type_check @@ -201,82 +200,16 @@ def forward(self, inputs): y = numpy.squeeze(y, axis=(2, 3)) elif self.use_cudnn: - # TODO(niboshi): Refactor cuDNN part into a separate method - x = cuda.cupy.ascontiguousarray(x) - - gamma = cuda.cupy.ascontiguousarray(gamma) - beta = cuda.cupy.ascontiguousarray(beta) - dtype = x.dtype - handle = cudnn.get_handle() - x_desc = cudnn.create_tensor_descriptor( - _as4darray(x, self.mode)) - cudnn_mode = self.mode.get_cudnn_mode() - derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor() - libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value, - x_desc.value, cudnn_mode) - dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) - if dtype_param is not dtype: - gamma = gamma.astype(dtype_param) - beta = beta.astype(dtype_param) - running_mean = self.running_mean.astype(dtype_param) - running_var = self.running_var.astype(dtype_param) - else: - running_mean = self.running_mean - running_var = self.running_var - - oz_dtype = ( - numpy.float64 if x.dtype == numpy.float64 else numpy.float32) - one = numpy.array(1, dtype=oz_dtype).ctypes - zero = numpy.array(0, dtype=oz_dtype).ctypes - y = cuda.cupy.empty_like(x) - # Factor used in the moving average - factor = 1 - self.decay - if self.mean is None: # Output cache to speed up backward pass. self.mean = xp.empty_like(gamma) # Output cache to speed up backward pass. self.inv_std = xp.empty_like(gamma) - # Note: cuDNN computes the mini-batch mean and variance - # internally. We can simply (optionally) pass - # it the running-average mean and variance arrays. - # Note: This API seems to set the inverse of the standard deviation - # (instead of variance) to resultSaveInvVariance argument. The - # current implementation of our BN depends on this behavior so that - # we can reduce the number of reduction kernels. - libcudnn.batchNormalizationForwardTraining( - handle, cudnn_mode, one.data, zero.data, - x_desc.value, x.data.ptr, x_desc.value, - y.data.ptr, derivedBnDesc.value, gamma.data.ptr, - beta.data.ptr, factor, running_mean.data.ptr, - running_var.data.ptr, self.eps, - self.mean.data.ptr, self.inv_std.data.ptr) - - # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used, - # there is a possibility of numerical overflow. You can use - # queryRuntimeError() to make sure whether the overflow actually - # occured or not during the batch normalization. 
- if (cudnn_mode is libcudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT and - configuration.config.debug): - query_mode = libcudnn.CUDNN_ERRQUERY_BLOCKING - rstatus = libcudnn.queryRuntimeError(handle, query_mode) - if rstatus is not libcudnn.CUDNN_STATUS_SUCCESS: - warnings.warn( - 'A numerical overflow might have happend in cuDNN' - 'batch normalization (status:{})'.format(rstatus)) - - if dtype_param is not dtype: - # When data type of prameters is converted, say, from fp16 - # to fp32, the values of fp32 arrays of running_mean and - # running_var updated by batchNormalizationForwardTraining - # must be explicitly written back to their original fp16 - # arrays. - running_mean = running_mean.astype(dtype) - running_var = running_var.astype(dtype) - self.running_mean.data.copy_from(running_mean.data, - running_mean.nbytes) - self.running_var.data.copy_from(running_var.data, - running_var.nbytes) + y = cudnn.batch_normalization_forward_training( + x, gamma, beta, self.running_mean, self.running_var, + self.mean, self.inv_std, self.eps, self.decay, + self.mode.is_for_conv2d, self.mode.get_cudnn_mode(), + configuration.config.debug) else: # Generic CPU and GPU implementation @@ -371,52 +304,10 @@ def forward(self, inputs): gx = numpy.squeeze(gx, axis=(2, 3)) elif self.use_cudnn: - # TODO(niboshi): Refactor cuDNN part into a separate method - x = cuda.cupy.ascontiguousarray(x) - gamma = cuda.cupy.ascontiguousarray(gamma) - gy = cuda.cupy.ascontiguousarray(gy) - dtype = x.dtype - handle = cudnn.get_handle() - x_desc = cudnn.create_tensor_descriptor( - _as4darray(x, self.mode)) - cudnn_mode = self.mode.get_cudnn_mode() - derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor() - libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value, - x_desc.value, cudnn_mode) - dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) - if dtype_param is not dtype: - gamma = gamma.astype(dtype_param) - oz_dtype = ( - numpy.float64 if x.dtype == numpy.float64 else numpy.float32) - one = numpy.array(1, dtype=oz_dtype).ctypes - zero = numpy.array(0, dtype=oz_dtype).ctypes - gx = cuda.cupy.empty_like(x) - ggamma = cuda.cupy.empty_like(gamma) - gbeta = cuda.cupy.empty_like(gamma) - libcudnn.batchNormalizationBackward( - handle, cudnn_mode, one.data, zero.data, - one.data, zero.data, x_desc.value, x.data.ptr, - x_desc.value, gy.data.ptr, x_desc.value, gx.data.ptr, - derivedBnDesc.value, gamma.data.ptr, - ggamma.data.ptr, gbeta.data.ptr, - self.eps, self.mean.data.ptr, self.inv_std.data.ptr) - - # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used, - # there is a possibility of numerical overflow. You can use - # queryRuntimeError() to make sure whether the overflow actually - # occured or not during the batch normalization. 
- if (cudnn_mode is libcudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT and - configuration.config.debug): - query_mode = libcudnn.CUDNN_ERRQUERY_BLOCKING - rstatus = libcudnn.queryRuntimeError(handle, query_mode) - if rstatus is not libcudnn.CUDNN_STATUS_SUCCESS: - warnings.warn( - 'A numerical overflow might have happend in cuDNN' - 'batch normalization (status:{})'.format(rstatus)) - - if dtype_param is not dtype: - ggamma = ggamma.astype(dtype) - gbeta = gbeta.astype(dtype) + gx, ggamma, gbeta = cudnn.batch_normalization_backward( + x, gamma, gy, self.mean, self.inv_std, self.eps, + self.mode.is_for_conv2d, self.mode.get_cudnn_mode(), + configuration.config.debug) else: # CPU and GPU implementation gbeta = gy.sum(axis=self.axis) @@ -593,36 +484,9 @@ def forward(self, inputs): self.inv_std = None elif mode.can_use_cudnn(xp): - # TODO(niboshi): Refactor cuDNN part into a separate method - x = cuda.cupy.ascontiguousarray(x) - - gamma = cuda.cupy.ascontiguousarray(gamma) - beta = cuda.cupy.ascontiguousarray(beta) - dtype = x.dtype - handle = cudnn.get_handle() - x_desc = cudnn.create_tensor_descriptor( - _as4darray(x, mode)) - cudnn_mode = mode.get_cudnn_mode() - derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor() - libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value, - x_desc.value, cudnn_mode) - dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) - if dtype_param is not dtype: - gamma = gamma.astype(dtype_param) - beta = beta.astype(dtype_param) - mean = mean.astype(dtype_param) - var = var.astype(dtype_param) - oz_dtype = ( - numpy.float64 if x.dtype == numpy.float64 else numpy.float32) - one = numpy.array(1, dtype=oz_dtype).ctypes - zero = numpy.array(0, dtype=oz_dtype).ctypes - y = cuda.cupy.empty_like(x) - - libcudnn.batchNormalizationForwardInference( - handle, cudnn_mode, one.data, zero.data, - x_desc.value, x.data.ptr, x_desc.value, y.data.ptr, - derivedBnDesc.value, gamma.data.ptr, beta.data.ptr, - mean.data.ptr, var.data.ptr, self.eps) + y = cudnn.batch_normalization_forward_inference( + x, gamma, beta, mean, var, self.eps, + mode.is_for_conv2d, mode.get_cudnn_mode()) else: # Generic CPU and GPU implementation gamma = gamma[expander] @@ -755,59 +619,12 @@ def can_use_cudnn(self, xp): self.cudnn_dtype_ok) -def _as4darray(arr, mode): - assert mode.cudnn_dim_ok - if mode.is_for_conv2d: - assert arr.ndim == 4 - return arr - else: # is_for_linear - return arr.reshape(utils.size_of_shape(arr.shape[0:-1]), -1, 1, 1) - - def _x_hat(x, mean, inv_std): x_mu = x - mean x_mu *= inv_std return x_mu -def _apply_bn_fwd(xp, x, mean, inv_std, gamma, beta): - # NOTE: all arguments should be broadcasted to x.shape - # (mean, inv_std, gamma, and beta have to already be expanded) - if xp is numpy: - x_hat = _x_hat(x, mean, inv_std) - y = gamma * x_hat - y += beta - else: - y = cuda.elementwise( - 'T x, T mean, T inv_std, T gamma, T beta', 'T y', - 'y = gamma * (x - mean) * inv_std + beta', 'bn_fwd' - )(x, mean, inv_std, gamma, beta) - return y - - -def _zero_if_none(xp, x, shape, dtype): - # TODO(Tokui): Return broadcasted 0 instead of a zeroed array. 
- if x is None: - return xp.zeros(shape, dtype=dtype) - return x - - -def _get_dtype_of_tensor_descriptor(desc): - cudnn_dtype, _, _, _, _, _, _, _, _ = libcudnn.getTensor4dDescriptor( - desc.value) - dtype = None - if cudnn_dtype == libcudnn.CUDNN_DATA_DOUBLE: - dtype = numpy.dtype(numpy.float64) - elif cudnn_dtype == libcudnn.CUDNN_DATA_FLOAT: - dtype = numpy.dtype(numpy.float32) - elif cudnn_dtype == libcudnn.CUDNN_DATA_HALF: - dtype = numpy.dtype(numpy.float16) - else: - msg = 'Unknow cudnn data type {} '.format(cudnn_dtype) - raise RuntimeError(msg) - return dtype - - def _chainerx_compute_axis(x_ndim, gamma_ndim, axis): # Returns processed axis for ChainerX. axis_chx = ( @@ -832,6 +649,28 @@ def _chainerx_is_supported(device, axis_chx): return True +def _apply_bn_fwd(xp, x, mean, inv_std, gamma, beta): + # NOTE: all arguments should be broadcasted to x.shape + # (mean, inv_std, gamma, and beta have to already be expanded) + if xp is numpy: + x_hat = _x_hat(x, mean, inv_std) + y = gamma * x_hat + y += beta + else: + y = cuda.elementwise( + 'T x, T mean, T inv_std, T gamma, T beta', 'T y', + 'y = gamma * (x - mean) * inv_std + beta', 'bn_fwd' + )(x, mean, inv_std, gamma, beta) + return y + + +def _zero_if_none(xp, x, shape, dtype): + # TODO(Tokui): Return broadcasted 0 instead of a zeroed array. + if x is None: + return xp.zeros(shape, dtype=dtype) + return x + + def batch_normalization(x, gamma, beta, **kwargs): """batch_normalization(x, gamma, beta, eps=2e-5, running_mean=None, running_var=None, decay=0.9, axis=None) @@ -858,12 +697,6 @@ def batch_normalization(x, gamma, beta, **kwargs): the total batch size will be considered to be the product of all input dimensions except the second dimension. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. - See :func:`chainer.using_config`. - Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. 
diff --git a/chainer/functions/normalization/batch_renormalization.py b/chainer/functions/normalization/batch_renormalization.py index 2da4065e3147..701ded49a9de 100644 --- a/chainer/functions/normalization/batch_renormalization.py +++ b/chainer/functions/normalization/batch_renormalization.py @@ -10,15 +10,6 @@ from chainer.utils import type_check -def _as4darray(arr): - if arr.ndim == 0: - return arr.reshape(1, 1, 1, 1) - elif arr.ndim == 4: - return arr - else: - return arr.reshape(arr.shape[0], -1, 1, 1) - - def _xhat(x, mean, std, expander): x_mu = x - mean[expander] x_mu /= std[expander] diff --git a/chainer/functions/util/forget.py b/chainer/functions/util/forget.py index 207812bb12de..1f73e16810a3 100644 --- a/chainer/functions/util/forget.py +++ b/chainer/functions/util/forget.py @@ -55,13 +55,11 @@ def backward(self, indexes, grad_outputs): chainer.using_config('in_recomputing', True): outs = _call_func(self.func, dummy_inputs) assert len(outs) == len(grad_outputs) - if len(outs) > 1: - # Avoid doing backward multiple times when `outs` is a tuple - outs = chainer.functions.identity(*outs) for out, grad_output in zip(outs, grad_outputs): out.grad_var = grad_output - outs[0].backward() + # TODO(kataoka): use outer backward's `retain_grad` and `loss_scale` + chainer.variable._backprop_to_all(outs, False, None) return tuple([inp.grad_var for inp in dummy_inputs]) diff --git a/chainer/gradient_check.py b/chainer/gradient_check.py index c1b8fa29f9de..c43917baf79e 100644 --- a/chainer/gradient_check.py +++ b/chainer/gradient_check.py @@ -10,7 +10,6 @@ from chainer import configuration from chainer import FunctionNode from chainer import testing -from chainer import utils from chainer import variable import chainerx @@ -29,6 +28,44 @@ def _copy_arrays(xs): return [xp.array(x, dtype=numpy.float64, copy=True) for x in xs] +def _ones_like(arr): + device = cuda.get_device_from_array(arr) + xp = backend.get_array_module(arr) + with device: + return xp.ones_like(arr) + + +def _make_outputs_props_in_error_message(outputs, grad_outputs): + def format_props(arr): + return '{}:{}'.format(arr.shape, arr.dtype.name) + + return ( + 'Output shapes and dtypes : {}\n' + 'Output gradient shapes and dtypes: {}'.format( + ', '.join(format_props(y) for y in outputs), + ', '.join('None' if gy is None else format_props(gy) + for gy in grad_outputs))) + + +def _check_outputs_and_grad_outputs(outputs, grad_outputs): + if len(outputs) != len(grad_outputs): + raise ValueError( + 'Output gradients must contain equally as many elements as ' + 'the number of output elements.\n' + '{}'.format( + _make_outputs_props_in_error_message(outputs, grad_outputs))) + shapes_match = all([gy is None or y.shape == gy.shape + for y, gy in zip(outputs, grad_outputs)]) + dtypes_match = all([gy is None or y.dtype == gy.dtype + for y, gy in zip(outputs, grad_outputs)]) + if not (shapes_match and dtypes_match): + raise ValueError( + 'Shapes and/or dtypes of outputs and output gradients do not ' + 'match.\n' + '{}'.format( + _make_outputs_props_in_error_message(outputs, grad_outputs))) + + def numerical_grad( f, inputs, grad_outputs, eps=1e-3, detect_nondifferentiable=False, diff_atol=0, diff_rtol=1e-2, @@ -79,6 +116,7 @@ def numerical_grad( """ assert eps > 0 + assert isinstance(inputs, (tuple, list)) for x in inputs: if x.dtype.kind != 'f': raise RuntimeError( @@ -124,13 +162,13 @@ def numerical_grad( # Evaluate func at a single input def eval_func(x, i, delta, orig): - utils._setitem(x, i, orig + delta) + x[i] = orig + delta y = 
_copy_arrays(f()) assert len(y) == len(grad_outputs) assert all([ gy is None or numpy.isscalar(gy) or y_.shape == gy.shape for y_, gy in zip(y, grad_outputs)]) - utils._setitem(x, i, orig) + x[i] = orig return y # An iteration on a single input displacement @@ -262,7 +300,7 @@ def iterate_single_input(i_in, x, orig_x, i): y1, y0, xp.asarray(gy), eps, gx[i]) else: dot = ((y1 - y0) * gy).sum() - utils._setitem(gx, i, gx[i] + dot / (2 * eps)) + gx[i] = gx[i] + dot / (2 * eps) elif len(yss) == 5: # 3rd order y0 = yss[0][i_out] y1 = yss[1][i_out] @@ -274,7 +312,7 @@ def iterate_single_input(i_in, x, orig_x, i): else: num = -y3 + 8 * y2 - 8 * y1 + y0 dot = (num * gy).sum() - utils._setitem(gx, i, gx[i] + dot / (6 * eps)) + gx[i] = gx[i] + dot / (6 * eps) else: assert False @@ -376,21 +414,31 @@ def _run(self): # This must be done before sampling a direction vector, because # otherwise the shapes of uninitialized parameters wouldn't be # determined. - xs_backward, y_backward, y0_data, y_grad = ( + xs_backward, ys = ( self._forward_for_backward_gradients()) - self.y_grad = y_grad + # Keep output arrays to save computation in numerical gradients + y0_data = tuple([y.array for y in ys]) + + # If y_grad is not given, generate the all-1 gradients. + if self.y_grad is None: + if not (len(ys) == 1 and ys[0].shape == ()): + raise ValueError( + 'y_grad argument cannot be omitted if the target function ' + 'is not a loss function, which has a single output with ' + 'shape ().\n' + 'Actual output shapes: {}'.format( + ', '.join([str(y.shape) for y in ys]))) + + self.y_grad = tuple([_ones_like(y.array) for y in ys]) + else: + _check_outputs_and_grad_outputs(ys, self.y_grad) # Sample a direction vector. directions = self._sample_directions() # Compute backward gradients by running a backward pass. gx_backward = self._directional_backward_gradients( - xs_backward, y_backward, directions) - - # If no input has a gradient, we don't need to compare with numeric - # gradient. - if len(self.x_data) + len(self.params) == self.no_grads.count(True): - return + xs_backward, ys, directions) # Compute numeric gradients gx_numeric = self._directional_numeric_gradients(directions, y0_data) @@ -466,36 +514,35 @@ def _clear_grads(self, xs): def _forward_for_backward_gradients(self): func = self.func x_data = self.x_data - y_grad = self.y_grad params = self.params xs = [variable.Variable(x) for x in x_data] y = func(*xs) y = _as_tuple(y) - y0_data = [_.data for _ in y] - - # All creators of `y` need to be the same because we only call - # `y[0].backward` to call `backward` method of the creator. - # To do so we need to insert a dummy function `_GradientSetter` to the - # computational graph. - # Note that `func` may not be a `Function` object. - - y, y_grad = _set_y_grad(y, y_grad) # Clear gradients which may exist if func calls backward inside of # itself. self._clear_grads(xs) self._clear_grads(params) + return xs, y - return xs, y, y0_data, y_grad - - def _directional_backward_gradients(self, xs, y, directions): + def _directional_backward_gradients(self, xs, ys, directions): params = self.params no_grads = self.no_grads - # We only need to call `backward` for one result `Variable`. - # `Variable.backward` method calls `Function.backward` of its creator. - y.backward() + # We need to start backprop from a single variable, + # so a dummy function `_GradientSetter` is inserted at the end of the + # computational graph. 
+ y_backward = _apply_grad_setter_func( + ys, + [None if gy is None + # Copy is needed to avoid being updated during backprop, which + # would affect the numerical gradient. + else chainer.Variable(gy.copy(), requires_grad=False) + for gy in self.y_grad]) + + # Backward + y_backward.backward() for no_grad, x in six.moves.zip(no_grads, xs): if no_grad and x.grad is not None: @@ -621,7 +668,7 @@ def func(xs): gy_data = xp.array(...) check_backward(func, (x1_data, x2_data), gy_data) - This method creates :class:`~chainer.Variable` objects with ``x_data`` + This function creates :class:`~chainer.Variable` objects with ``x_data`` and calls ``func`` with the :class:`~chainer.Variable`\\ s to get its result as :class:`~chainer.Variable`. Then, it sets ``y_grad`` array to ``grad`` attribute of the result and @@ -802,10 +849,11 @@ def first_order_grad(*inputs): gys = inputs[n_x:] y = _as_tuple(func(*xs)) + _check_outputs_and_grad_outputs(y, gys) # Let all elements of y share the same creator. # See the comment in check_backward. - y, _ = _set_y_grad(y, gys) + y = _apply_grad_setter_func(y, gys) y.backward(enable_double_backprop=True) @@ -852,48 +900,26 @@ def first_order_grad(*inputs): class _GradientSetter(FunctionNode): - input_shape = None - input_dtype = None - - def __init__(self, xp, grad): - self.xp = xp + def __init__(self, grad): self.grad = grad def forward(self, inputs): - self.input_shape = inputs[0].shape - self.input_dtype = inputs[0].dtype - - # output a 0-sized 1-dim array like inputs - # xp can be different from self.xp for ChainerX fallback. - xp = backend.get_array_module(*inputs) - return xp.empty((0,), dtype=inputs[0].dtype), - - def backward(self, indexes, grad_outputs): - if self.grad is None: - grad = (self.xp.ones(self.input_shape, self.input_dtype),) - else: - grad = self.grad - - return tuple( - None if g is None else variable.as_variable(g) - for g in grad) - - -def _set_y_grad(y, y_grad): - xp = backend.get_array_module(*y) - if y_grad is not None: - if len(y) != len(y_grad): - raise ValueError( - 'Upstream gradients must contain equally many elements as ' - 'number of output elements.\n' - 'Actual: {} != {}'.format(len(y), len(y_grad))) - y, = _GradientSetter(xp, y_grad).apply(y) - else: - if len(y) != 1 or y[0].shape != (): - raise ValueError( - 'Function must return a zero-dimensional array of length 1 ' - 'if the upstream gradient is `None`.\n' - 'Actual: {} != 1'.format(len(y))) - y, = _GradientSetter(xp, None).apply(y) - y_grad = (1,) - return y, y_grad + # output a dummy array. + xp = backend.get_array_module(inputs[0]) + return xp.empty((0,), dtype=numpy.float32), + + def backward(self, inputs, grad_outputs): + return self.grad + + +def _apply_grad_setter_func(y, y_grad): + # Applies the `_GradientSetter` function. + # The gradient setter function accepts any number of upstream outputs as + # its inputs, and returns a single output variable with dummy data. + # This variable will be later backward()ed and during backprop, this + # function returns the given gradients (`y_grad`) on its backward. 
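The dummy _GradientSetter above exists so that one backward() call can seed every output with its supplied gradient. A miniature of the same pattern, outside the patch and built around a hypothetical GradSeed node, may help illustrate the idea:

import numpy as np
import chainer
from chainer import FunctionNode

class GradSeed(FunctionNode):
    # Hypothetical miniature of the gradient-setter pattern: forward emits
    # a dummy array, backward hands back the preset gradients for its inputs.
    def __init__(self, seed_grads):
        self.seed_grads = seed_grads

    def forward(self, inputs):
        return np.empty((0,), dtype=np.float32),

    def backward(self, target_input_indexes, grad_outputs):
        return self.seed_grads

x = chainer.Variable(np.array([1.0, 2.0], dtype=np.float32))
y = x * 3
seed = chainer.Variable(np.array([0.5, 0.5], dtype=np.float32))
z, = GradSeed((seed,)).apply((y,))
z.backward()  # x.grad becomes [1.5, 1.5]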
+ assert len(y) == len(y_grad) + assert all(isinstance(y_, chainer.Variable) for y_ in y) + assert all(gy is None or isinstance(gy, chainer.Variable) for gy in y_grad) + y, = _GradientSetter(y_grad).apply(y) + return y diff --git a/chainer/iterators/multiprocess_iterator.py b/chainer/iterators/multiprocess_iterator.py index d5c0460effa5..bacd6f2b40e2 100644 --- a/chainer/iterators/multiprocess_iterator.py +++ b/chainer/iterators/multiprocess_iterator.py @@ -186,6 +186,8 @@ def __exit__(self, exc_type, exc_value, traceback): self.finalize() def __copy__(self): + # This function is implemented for backward compatibility. + # Please use `reset` normally. other = MultiprocessIterator( self.dataset, self.batch_size, self.repeat, shuffle=None, n_processes=self.n_processes, n_prefetch=self.n_prefetch, diff --git a/chainer/link.py b/chainer/link.py index e1717f938252..d9f5d2039682 100644 --- a/chainer/link.py +++ b/chainer/link.py @@ -671,13 +671,11 @@ def zerograds(self): # type: () -> None """Initializes all gradient arrays by zero. - This method can be used for the same purpose of cleargrads, but less - efficient. This method is left for backward compatibility. - - .. deprecated:: v1.15 - Use :meth:`cleargrads` instead. + .. deprecated:: v1.15 + Use the more efficient :meth:`cleargrads` instead. """ + warnings.warn( 'Link.zerograds is deprecated. Use Link.cleargrads instead.', DeprecationWarning) diff --git a/chainer/links/caffe/caffe_function.py b/chainer/links/caffe/caffe_function.py index f03448e2cdf2..3184badfab70 100644 --- a/chainer/links/caffe/caffe_function.py +++ b/chainer/links/caffe/caffe_function.py @@ -183,12 +183,6 @@ def forward(self, inputs, outputs, disable=(), **kwargs): bottom blobs are already computed, then emulates the layer and stores output blobs as :class:`~chainer.Variable` objects. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. - See :func:`chainer.using_config`. - Args: inputs (dict): A dictionary whose key-value pairs indicate initial correspondences between blob names and diff --git a/chainer/links/connection/convolution_2d.py b/chainer/links/connection/convolution_2d.py index d08765f2d4b9..5d880398cb03 100644 --- a/chainer/links/connection/convolution_2d.py +++ b/chainer/links/connection/convolution_2d.py @@ -23,13 +23,6 @@ class Convolution2D(link.Link): can provide a significant performance boost for fixed neural nets. To enable, set `chainer.using_config('autotune', True)` - .. warning:: - - ``deterministic`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('cudnn_deterministic', value`` - (value is either ``True`` or ``False``). - See :func:`chainer.using_config`. - Args: in_channels (int or None): Number of channels of input arrays. If ``None``, parameter initialization will be deferred until the diff --git a/chainer/links/connection/deconvolution_2d.py b/chainer/links/connection/deconvolution_2d.py index 28ee061bf657..3dd6cd112ccb 100644 --- a/chainer/links/connection/deconvolution_2d.py +++ b/chainer/links/connection/deconvolution_2d.py @@ -22,13 +22,6 @@ class Deconvolution2D(link.Link): can provide a significant performance boost for fixed neural nets. To enable, set `chainer.using_config('autotune', True)` - .. warning:: - - ``deterministic`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('cudnn_deterministic', value)`` - (value is either ``True`` or ``False``). - See :func:`chainer.using_config`. 
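For the Link.zerograds deprecation touched above, a short usage note with a toy L.Linear layer (the layer and sizes are only illustrative):

import chainer.links as L

layer = L.Linear(3, 2)
layer.cleargrads()   # preferred: releases gradient arrays
# layer.zerograds()  # deprecated since v1.15; kept only for compatibility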
- Args: in_channels (int or None): Number of channels of input arrays. If ``None``, parameter initialization will be deferred until the diff --git a/chainer/links/connection/gru.py b/chainer/links/connection/gru.py index 51c6bea5e5c6..44fda3806f0b 100644 --- a/chainer/links/connection/gru.py +++ b/chainer/links/connection/gru.py @@ -245,19 +245,6 @@ class GRU(StatefulGRU): This is an alias of :class:`~chainer.links.StatefulGRU`. - .. warning:: - - In Chainer v1, ``GRU`` was *stateless*, - as opposed to the current implementation. - To align with LSTM links, we have changed - the naming convention from Chainer v2 so that the shorthand name - points the stateful links. - You can use :class:`~chainer.links.StatelessGRU` for stateless version, - whose implementation is identical to ``GRU`` in v1. - - See issue `#2537 `_ - for details. - """ def forward(self, *args): diff --git a/chainer/links/connection/mlp_convolution_2d.py b/chainer/links/connection/mlp_convolution_2d.py index e116c4dfdfd4..14b6c44d48b0 100644 --- a/chainer/links/connection/mlp_convolution_2d.py +++ b/chainer/links/connection/mlp_convolution_2d.py @@ -53,12 +53,6 @@ class MLPConvolution2D(link.ChainList): passed to the convolution layers. This option must be specified as a keyword argument. - .. note: - From v2, `conv_init` and `bias_init` arguments must be specified as - keyword arguments only. We impose this restriction to forbid - users to assume the API for v1 and specify `wscale` option, - that had been between `activation` and `conv_init` arguments in v1. - See: `Network in Network `_. Attributes: diff --git a/chainer/links/connection/n_step_gru.py b/chainer/links/connection/n_step_gru.py index da84b027a479..6a7c9666d37d 100644 --- a/chainer/links/connection/n_step_gru.py +++ b/chainer/links/connection/n_step_gru.py @@ -12,12 +12,6 @@ class NStepGRUBase(n_step_rnn.NStepRNNBase): :func:`chainer.links.NStepBiRNN`. This link's behavior depends on argument, ``use_bi_direction``. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -49,12 +43,6 @@ class NStepGRU(NStepGRUBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -91,12 +79,6 @@ class NStepBiGRU(NStepGRUBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. diff --git a/chainer/links/connection/n_step_lstm.py b/chainer/links/connection/n_step_lstm.py index 11654ff85a20..ea76e37102c6 100644 --- a/chainer/links/connection/n_step_lstm.py +++ b/chainer/links/connection/n_step_lstm.py @@ -30,12 +30,6 @@ def forward(self, hx, cx, xs, **kwargs): Calculate all hidden states and cell states. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. 
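The warnings removed in these documentation hunks all pointed readers at the config mechanism. For reference, a minimal sketch of switching to test mode with it (this is the documented replacement, not new API):

import chainer

# Replaces the old per-call ``train`` / ``test`` / ``volatile`` arguments.
with chainer.using_config('train', False), \
        chainer.using_config('enable_backprop', False):
    pass  # run forward passes in test mode without building a graph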
- See :func:`chainer.using_config`. - Args: hx (~chainer.Variable or None): Initial hidden states. If ``None`` is specified zero-vector is used. Its shape is ``(S, B, N)`` @@ -85,12 +79,6 @@ class NStepLSTM(NStepLSTMBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -126,12 +114,6 @@ class NStepBiLSTM(NStepLSTMBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. diff --git a/chainer/links/connection/n_step_rnn.py b/chainer/links/connection/n_step_rnn.py index a8eee9397261..8d9d1f98e279 100644 --- a/chainer/links/connection/n_step_rnn.py +++ b/chainer/links/connection/n_step_rnn.py @@ -36,12 +36,6 @@ class NStepRNNBase(link.ChainList): This link's behavior depends on argument, ``use_bi_direction``. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -131,12 +125,6 @@ def forward(self, hx, xs, **kwargs): Calculate all hidden states and cell states. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. - See :func:`chainer.using_config`. - Args: hx (~chainer.Variable or None): Initial hidden states. If ``None`` is specified zero-vector is used. Its shape is ``(S, B, N)`` @@ -227,12 +215,6 @@ class NStepRNNTanh(NStepRNNBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -270,12 +252,6 @@ class NStepRNNReLU(NStepRNNBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -313,12 +289,6 @@ class NStepBiRNNTanh(NStepRNNBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. @@ -356,12 +326,6 @@ class NStepBiRNNReLU(NStepRNNBase): Users just need to call the link with a list of :class:`chainer.Variable` holding sequences. - .. warning:: - - ``use_cudnn`` argument is not supported anymore since v2. 
- Instead, use ``chainer.using_config('use_cudnn', use_cudnn)``. - See :func:`chainer.using_config`. - Args: n_layers (int): Number of layers. in_size (int): Dimensionality of input vectors. diff --git a/chainer/links/connection/parameter.py b/chainer/links/connection/parameter.py index 716c0b98e0ad..e3c3bc3474a9 100644 --- a/chainer/links/connection/parameter.py +++ b/chainer/links/connection/parameter.py @@ -8,7 +8,7 @@ class Parameter(link.Link): """Link that just holds a parameter and returns it. .. deprecated:: v1.5 - The parameters are stored as variables as of v1.5. Use them directly + The parameters are stored as variables since v1.5. Use them directly instead. Args: diff --git a/chainer/links/model/vision/googlenet.py b/chainer/links/model/vision/googlenet.py index a309e804c051..6c4956aeecc2 100644 --- a/chainer/links/model/vision/googlenet.py +++ b/chainer/links/model/vision/googlenet.py @@ -188,12 +188,6 @@ def forward(self, x, layers=None, **kwargs): Computes all the feature maps specified by ``layers``. - .. warning:: - - ``train`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. - See :func:`chainer.using_config`. - Args: x (~chainer.Variable): Input variable. It should be prepared by ``prepare`` function. @@ -266,25 +260,6 @@ def extract(self, images, layers=None, size=(224, 224), **kwargs): with chainer.using_config('enable_backprop', False): feature = model.extract([image]) - .. warning:: - - ``train`` and ``volatile`` arguments are not supported - anymore since v2. Instead, users should configure - training and volatile modes with ``train`` and - ``enable_backprop``, respectively. - - Note that default behavior of this method is different - between v1 and later versions. Specifically, - the default values of ``train`` arguments in v1 were - ``False`` and ``OFF``, while that of - ``chainer.config.train`` are ``True``. - Therefore, users need to explicitly switch ``train`` - to ``False`` to run the code in test mode to turn off - coputational graph construction. - - See the `upgrade guide `_. - Args: images (iterable of PIL.Image or numpy.ndarray): Input images. layers (list of str): The list of layer names you want to extract. diff --git a/chainer/links/model/vision/resnet.py b/chainer/links/model/vision/resnet.py index 5fa28c430b57..1caa51f8b5f2 100644 --- a/chainer/links/model/vision/resnet.py +++ b/chainer/links/model/vision/resnet.py @@ -171,12 +171,6 @@ def forward(self, x, layers=None, **kwargs): Computes all the feature maps specified by ``layers``. - .. warning:: - - ``test`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', train)``. - See :func:`chainer.using_config`. - Args: x (~chainer.Variable): Input variable. It should be prepared by ``prepare`` function. @@ -235,25 +229,6 @@ def extract(self, images, layers=None, size=(224, 224), **kwargs): with chainer.using_config('enable_backprop', False): feature = model.extract([image]) - .. warning:: - - ``test`` and ``volatile`` arguments are not supported - anymore since v2. Instead, users should configure - training and volatile modes with ``train`` and - ``enable_backprop``, respectively. - - Note that default behavior of this method is different - between v1 and later versions. Specifically, - the default values of ``test`` in v1 were ``True`` (test mode). - But that of ``chainer.config.train`` is also ``True`` - (train mode). 
Therefore, users need to explicitly switch - ``train`` to ``False`` to run the code in test mode and - ``enable_backprop`` to ``False`` to turn off - coputational graph construction. - - See the `upgrade guide `_. - Args: images (iterable of PIL.Image or numpy.ndarray): Input images. layers (list of str): The list of layer names you want to extract. @@ -616,7 +591,7 @@ def __init__(self, in_channels, mid_channels, out_channels, # In the original MSRA ResNet, stride=2 is on 1x1 convolution. # In Facebook ResNet, stride=2 is on 3x3 convolution. - stride_1x1, stride_3x3 = (stride, 1) if downsample_fb else (1, stride) + stride_1x1, stride_3x3 = (1, stride) if downsample_fb else (stride, 1) with self.init_scope(): self.conv1 = Convolution2D( in_channels, mid_channels, 1, stride_1x1, 0, initialW=initialW, diff --git a/chainer/links/model/vision/vgg.py b/chainer/links/model/vision/vgg.py index 6b665c796d22..23700202d260 100644 --- a/chainer/links/model/vision/vgg.py +++ b/chainer/links/model/vision/vgg.py @@ -164,13 +164,6 @@ def forward(self, x, layers=None, **kwargs): Computes all the feature maps specified by ``layers``. - .. warning:: - - ``test`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', False)`` - to run in test mode. - See :func:`chainer.using_config`. - Args: x (~chainer.Variable): Input variable. It should be prepared by ``prepare`` function. @@ -231,25 +224,6 @@ def extract(self, images, layers=None, size=(224, 224), **kwargs): with chainer.using_config('enable_backprop', False): feature = model.extract([image]) - .. warning:: - - ``test`` and ``volatile`` arguments are not supported - anymore since v2. Instead, users should configure - training and volatile modes with ``train`` and - ``enable_backprop``, respectively. - - Note that default behavior of this method is different - between v1 and later versions. Specifically, - the default values of ``test`` in v1 were ``True`` (test mode). - But that of ``chainer.config.train`` is also ``True`` - (train mode). Therefore, users need to explicitly switch - ``train`` to ``False`` to run the code in test mode and - ``enable_backprop`` to ``False`` to turn off - coputational graph construction. - - See the `upgrade guide `_. - Args: images (iterable of PIL.Image or numpy.ndarray): Input images. layers (list of str): The list of layer names you want to extract. diff --git a/chainer/links/normalization/batch_normalization.py b/chainer/links/normalization/batch_normalization.py index ec2367bab13c..f979053b7f39 100644 --- a/chainer/links/normalization/batch_normalization.py +++ b/chainer/links/normalization/batch_normalization.py @@ -251,12 +251,6 @@ def forward(self, x, **kwargs): mean and variance for evaluation during training, and normalizes the input using batch statistics. - .. warning:: - - ``test`` argument is not supported anymore since v2. - Instead, use ``chainer.using_config('train', False)``. - See :func:`chainer.using_config`. - Args: x (Variable): Input variable. finetune (bool): If it is in the training mode and ``finetune`` is diff --git a/chainer/reporter.py b/chainer/reporter.py index 64fed4a10dec..139700af9d96 100644 --- a/chainer/reporter.py +++ b/chainer/reporter.py @@ -141,7 +141,7 @@ def report(self, values, observer=None): observer object if given. .. 
note:: - As of v2.0.0, if a value is of type :class:`~chainer.Variable`, the + If a value is of type :class:`~chainer.Variable`, the variable is copied without preserving the computational graph and the new variable object purged from the graph is stored to the observer. This behavior can be changed by setting diff --git a/chainer/serializer.py b/chainer/serializer.py index fec6ca05a60d..fedb611612bc 100644 --- a/chainer/serializer.py +++ b/chainer/serializer.py @@ -31,7 +31,7 @@ def __call__(self, key, value): ``value`` argument. String values are treated like scalars. .. note:: - As of v2.0.0, serializers and deserializers are required to + Serializers and deserializers are required to correctly handle the ``None`` value. When ``value`` is ``None``, serializers save it in format-dependent ways, and deserializers just return the loaded value. When the saved ``None`` value is diff --git a/chainer/testing/array.py b/chainer/testing/array.py index 2fbef5a8af5f..43c4746aecbd 100644 --- a/chainer/testing/array.py +++ b/chainer/testing/array.py @@ -73,8 +73,8 @@ def as_noncontiguous_array(a): with chainer.using_device(device): ret = xp.empty( (a.shape[0] * 2,) + a.shape[1:], dtype=a.dtype) - utils._setitem(ret, slice(None, None, 2), a) # ret[::2] = a - ret = utils._getitem(ret, slice(None, None, 2)) # ret = ret[::2] + ret[::2] = a + ret = ret[::2] if device.xp is chainerx: assert not ret.is_contiguous else: diff --git a/chainer/training/extensions/computational_graph.py b/chainer/training/extensions/computational_graph.py index 5c6b88bb547a..5d3d02a103a4 100644 --- a/chainer/training/extensions/computational_graph.py +++ b/chainer/training/extensions/computational_graph.py @@ -20,11 +20,11 @@ def dump_graph(root_name, out_name='cg.dot', It only dumps a graph at the first invocation. .. note:: - As of v2.0.0, the computational graph is not kept by default. This + The computational graph is not kept by default. This extension changes this behavior until the first invocation. **It is strongly recommended to use it with the default trigger setting.** - The detailed behavior of this extension since v2.0.0 is as follows. + The detailed behavior of this extension is as follows. 1. In its initializer, it turns on the ``chainer.config.keep_graph_on_report`` flag. diff --git a/chainer/utils/__init__.py b/chainer/utils/__init__.py index 9a1ec5c5e5c8..a1ff964b56bb 100644 --- a/chainer/utils/__init__.py +++ b/chainer/utils/__init__.py @@ -8,8 +8,6 @@ import chainer # import classes and functions -from chainer.utils.array import _getitem # NOQA -from chainer.utils.array import _setitem # NOQA from chainer.utils.array import size_of_shape # NOQA from chainer.utils.array import sum_to # NOQA from chainer.utils.conv import get_conv_outsize # NOQA diff --git a/chainer/utils/array.py b/chainer/utils/array.py index 955749af2b3f..2e0bbb03f942 100644 --- a/chainer/utils/array.py +++ b/chainer/utils/array.py @@ -4,9 +4,7 @@ import six import chainer -from chainer import backend from chainer.backends import cuda -import chainerx def as_vec(x): @@ -65,99 +63,3 @@ def sum_to(x, shape): if lead > 0: y = y.squeeze(lead_axis) return y - - -# Workaround for chainerx.ndarray advanced indexing. -# This function is not differentiable. -# TODO(hvy): Remove this function when chainerx.ndarray.__getitem__ supports -# advanced indexing. 
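For the as_noncontiguous_array change above, a NumPy-only sketch of the allocate-double-and-stride trick that the plain slice assignment now expresses directly (buf and nc are just illustrative names):

import numpy as np

a = np.arange(6, dtype=np.float32).reshape(2, 3)
buf = np.empty((a.shape[0] * 2,) + a.shape[1:], dtype=a.dtype)
buf[::2] = a      # scatter the rows into every other slot
nc = buf[::2]     # strided view: same contents, no longer C-contiguous
assert not nc.flags.c_contiguous
assert np.array_equal(nc, a)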
-def _getitem(arr, key): - if not isinstance(arr, chainerx.ndarray): - return arr[key] - - try: - return arr[key] - except (IndexError, chainerx.DimensionError): - pass - - is_backprop_required = arr.is_backprop_required() - - arr = backend.from_chainerx(arr) - if isinstance(key, chainerx.ndarray): - key = backend.from_chainerx(key) - - if isinstance(arr, cuda.ndarray): - with arr.device: - ret = arr[key] - else: - ret = arr[key] - - # Doing this check after the fallback __getitem__ because the error which - # caused the fallback might not be due to advanced indexing. In such - # case the fallback __getitem__ should also raise the error. - - if is_backprop_required: - raise RuntimeError( - 'ChainerX getitem fallback for advanced indexing is not supported ' - 'for arrays that are connected to a graph.') - - return backend.to_chainerx(ret) - - -# Workaround for chainerx.ndarray advanced indexing. -# This function is not differentiable. -# TODO(hvy): Remove this function when chainer.ndarray.__setitem__ supports -# advanced indexing. -def _setitem(arr, key, value): - """Sets arr[key] to value by falling back to a non-ChainerX arrays. - - Supports both basic and advanced indexing. - - Note: - - With the ``cuda`` backend, the behavior differs from NumPy when - integer arrays in ``slices`` reference the same location - multiple times. In that case, the value that is actually stored - is undefined. - - >>> import chainerx - >>> chainerx.set_default_device('cuda:0') - >>> a = chainerx.zeros((2,), dtype=chainerx.float) - >>> i = chainerx.array([0, 1, 0, 1, 0, 1]) - >>> v = chainerx.arange(6).astype(chainerx.float) - >>> a[i] = v - >>> a # doctest: +SKIP - array([2., 3.], shape=(2,), dtype=float64, device='cuda:0') - - On the other hand, NumPy and ``native`` backend store the value - corresponding to the last index among the indices referencing - duplicate locations. 
- - >>> import numpy - >>> a_cpu = numpy.zeros((2,), dtype=numpy.float) - >>> i_cpu = numpy.array([0, 1, 0, 1, 0, 1]) - >>> v_cpu = numpy.arange(6).astype(numpy.float) - >>> a_cpu[i_cpu] = v_cpu - >>> a_cpu - array([4., 5.]) - - """ - if not isinstance(arr, chainerx.ndarray): - arr[key] = value - return - - if arr.is_backprop_required(): - raise RuntimeError( - 'ChainerX setitem fallback for advanced indexing is not supported ' - 'for arrays that are connected to a graph.') - - arr = backend.from_chainerx(arr) - if isinstance(key, chainerx.ndarray): - key = backend.from_chainerx(key) - if isinstance(value, chainerx.ndarray): - value = backend.from_chainerx(value) - if isinstance(arr, cuda.ndarray): - with arr.device: - arr[key] = value - else: - arr[key] = value diff --git a/chainer/utils/walker_alias.py b/chainer/utils/walker_alias.py index 8822457ba34c..27c7bba73e52 100644 --- a/chainer/utils/walker_alias.py +++ b/chainer/utils/walker_alias.py @@ -108,10 +108,10 @@ def sample_xp(self, xp, shape): pb = ps * len(self.threshold) index = pb.astype(numpy.int32) left_right = ( - chainer.utils._getitem(self.threshold, index) + self.threshold[index] < (pb - index.astype(thr_dtype))) left_right = left_right.astype(numpy.int32) - return chainer.utils._getitem(self.values, index * 2 + left_right) + return self.values[index * 2 + left_right] def sample_gpu(self, shape): ps = cuda.cupy.random.uniform(size=shape, dtype=numpy.float32) diff --git a/chainer/variable.py b/chainer/variable.py index 88a942c27121..d413699a3878 100644 --- a/chainer/variable.py +++ b/chainer/variable.py @@ -22,21 +22,41 @@ import chainerx -def _check_grad_type(func, x, gx): - if x.data is None or gx is None: - # ``x.data is None`` implies that the data array is not retained +def _check_grad_type(func, x, is_node_x, gx, is_var_gx): + if gx is None: return - if not chainer.is_arrays_compatible((gx, x.data)): + x_grad = gx.array if is_var_gx else gx + + # FIXME: avoid `isinstance` + x_data = None if isinstance(x, _ChainerxVariableNodeProps) else x.data + + # TODO(kataoka): Make _update_data_info store the array module. + # ``is_node_x and x_data is None`` implies that the data array is not + # retained. + # ``not is_node_x and x_data is None`` implies that grad of uninitialized + # variable is checked here. + + if x_grad is None: + # TODO(kataoka): This should be an error. + return + elif x_data is None and not is_node_x: + # TODO(kataoka): This should be an error. + return + elif not chainer.is_arrays_compatible((x_grad, x_data)): msg = ('Type of data and grad mismatch\ngrad: %s != data: %s' % - (type(gx), type(x.data))) + (type(x_grad), type(x_data))) typ = TypeError - elif gx.dtype != x.data.dtype: + elif x.dtype is None or x.shape is None: + # unretained Variable(None) + # TODO(kataoka): This should be an error. + return + elif gx.dtype != x.dtype: msg = ('Dtype of data and grad mismatch\ngrad: %s != data: %s' % - (gx.dtype, x.data.dtype)) + (gx.dtype, x.dtype)) typ = TypeError - elif gx.shape != x.data.shape: + elif gx.shape != x.shape: msg = ('Shape of data and grad mismatch\ngrad: %s != data: %s' % - (gx.shape, x.data.shape)) + (gx.shape, x.shape)) typ = ValueError else: return @@ -54,7 +74,7 @@ def _check_grad_type(func, x, gx): Please report this error to the issue tracker with the stack trace, the information of your environment, and your script: https://github.com/chainer/chainer/issues/new. 
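With the ChainerX workaround gone, the alias-table lookup in sample_xp above reduces to plain fancy indexing. A NumPy-only sketch with a hypothetical two-bin table (the table values are made up for illustration):

import numpy as np

threshold = np.array([0.4, 1.0], dtype=np.float32)   # hypothetical table
values = np.array([0, 1, 1, 1], dtype=np.int32)      # (left, right) pairs per bin
ps = np.random.uniform(size=(5,)).astype(np.float32)
pb = ps * len(threshold)
index = pb.astype(np.int32)
left_right = (threshold[index] < (pb - index.astype(np.float32))).astype(np.int32)
sample = values[index * 2 + left_right]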
-'''.format(type(func).__name__, func.label) +''' raise typ(detail + msg) @@ -456,11 +476,6 @@ class Variable(object): * Negation (Arithmetic): ``- a`` (:meth:`__neg__`) * Absolute value: ``abs(a)`` (:meth:`__abs__`) - .. warning:: - - ``volatile`` argument is not supported anymore since v2. - Instead, use :func:`chainer.no_backprop_mode`. - Args: data (numpy.ndarray or cupy.ndarray): Initial data array. name (str): Name of the variable. @@ -901,9 +916,7 @@ def grad(self): @grad.setter def grad(self, g): - if g is not None: - _check_grad_type(None, self, g) - + _check_grad_type(None, self, False, g, False) self._set_grad_without_check(g) def _set_grad_var_without_check(self, gv): @@ -923,8 +936,7 @@ def grad_var(self): @grad_var.setter def grad_var(self, g): - if g is not None: - _check_grad_type(None, self, g.array) + _check_grad_type(None, self, False, g, True) self._set_grad_var_without_check(g) @property @@ -1132,12 +1144,13 @@ def cleargrad(self): def zerograd(self): """Initializes the gradient array by zeros. + Note that the gradient variable is unchained from the computational - graph by this method because this operation breaks the backprop + graph by this method, because this operation breaks the backprop validity. .. deprecated:: v1.15 - Use :meth:`cleargrad` instead. + Use more efficient :meth:`cleargrads` instead. """ warnings.warn( @@ -1322,21 +1335,6 @@ def backward(self, retain_grad=False, enable_double_backprop=False, arr, enable_double_backprop=enable_double_backprop) return - with chainer.using_config('enable_backprop', enable_double_backprop): - self._backward_main(retain_grad, loss_scale) - - def _backward_main(self, retain_grad, loss_scale): - # TODO(sonots): Implement for ChainerX - if self.xp is chainerx: - raise NotImplementedError() - self._node._check_old_style_gradient() - if self.creator_node is None: - return - - cand_funcs = [] - seen_set = set() - grads = _backprop_utils.GradTable(load_if_new=True) - # Initialize error by 1, if this is a loss variable if self.array.size == 1 and self.grad_var is None: if self.array.ndim != 0: @@ -1355,86 +1353,9 @@ def _backward_main(self, retain_grad, loss_scale): self.grad = cuda.cupy.ones_like(self.array) if loss_scale is not None: self.grad *= loss_scale - grads[self._node] = self.grad_var - - def add_cand(cand): - if cand not in seen_set: - # Negate since heapq is min-heap - heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) - seen_set.add(cand) - add_cand(self.creator_node) - leaf_nodes = set() - - while cand_funcs: - _, _, func = heapq.heappop(cand_funcs) - inputs = func.inputs - target_input_indexes = tuple([ - i for i, x in enumerate(inputs) if x.requires_grad - ]) - outputs = [y() for y in func.outputs] # access via weak ref - out_grad = tuple([grads.pop(y) for y in outputs]) - if not target_input_indexes: - continue - - in_data = tuple([x.data for x in inputs]) - out_grad_array = tuple( - [None if g is None else g.array for g in out_grad]) - hooks = chainer.get_function_hooks() - if func._n_local_function_hooks != 0: - hooks = collections.OrderedDict(hooks) - hooks.update(func.local_function_hooks) - hooks = hooks.values() # avoid six for performance - - with cuda.get_device_from_array(*(in_data + out_grad_array)): - for hook in hooks: - hook.backward_preprocess(func, in_data, out_grad_array) - - # Collect the current input gradients. 
- target_inputs = [inputs[i] for i in target_input_indexes] - # Keep the order for the portability, rather than - # in_grad = {x: grads.get_as_list(x) - # for x in set(target_inputs)} - in_grad = collections.OrderedDict() - for x in target_inputs: - if x not in in_grad: - in_grad[x] = grads.get_as_list(x) - # to reduce memory usage - x._set_grad_var_if_available(None) - - _backprop_utils.backprop_step( - func, target_input_indexes, out_grad, in_grad) - - for hook in hooks: - hook.backward_postprocess(func, in_data, out_grad_array) - - for y, gy in six.moves.zip(outputs, out_grad): - if y is not None and y is not self.node: - y._set_grad_var_if_available( - gy if retain_grad else None) - del gy, out_grad # to reduce memory usage - - for x, gx in in_grad.items(): - if not gx: # gradient == None - continue - - for gx_elem in gx: - _check_grad_type(func, x, gx_elem.array) - del gx_elem # to reduce memory usage - - if x.creator_node is None: # leaf - leaf_nodes.add(x) - else: - add_cand(x.creator_node) - del gx, in_grad # to reduce memory usage - - for x in leaf_nodes: - x_var = x.get_variable_or_none() - gx = grads.pop(x) - if x_var is not None: - x_var._set_grad_var_without_check(gx) - x_var._loss_scale = loss_scale - grads.assert_no_grads() + with chainer.using_config('enable_backprop', enable_double_backprop): + _backprop_to_all([self], retain_grad, loss_scale) def reshape(self, *shape): """Returns a variable of a different shape and the same content. @@ -1551,6 +1472,112 @@ def __bool__(self): __hash__ = None # type: tp.Callable[[object], int] +def _backprop_to_all(outputs, retain_grad, loss_scale): + cand_funcs = [] + seen_set = set() + + def add_cand(cand): + if cand not in seen_set: + # Negate since heapq is min-heap + heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) + seen_set.add(cand) + + grads = _backprop_utils.GradTable(load_if_new=True) + + root_nodes = set() + leaf_nodes = set() + + for y_var in outputs: + # TODO(sonots): Implement for ChainerX + if y_var.xp is chainerx: + raise NotImplementedError() + + y = y_var.node + root_nodes.add(y) + grads[y] = y_var.grad_var + + y._check_old_style_gradient() + func = y.creator_node + if func is None: # leaf + leaf_nodes.add(y) + else: + add_cand(func) + + # Fix F812 (Python 2) + y = None + del y + + while cand_funcs: + _, _, func = heapq.heappop(cand_funcs) + inputs = func.inputs + target_input_indexes = tuple([ + i for i, x in enumerate(inputs) if x.requires_grad + ]) + outputs = [y() for y in func.outputs] # access via weak ref + out_grad = tuple([grads.pop(y) for y in outputs]) + if not target_input_indexes: + continue + + in_data = tuple([x.data for x in inputs]) + out_grad_array = tuple( + [None if g is None else g.array for g in out_grad]) + hooks = chainer.get_function_hooks() + if func._n_local_function_hooks != 0: + hooks = collections.OrderedDict(hooks) + hooks.update(func.local_function_hooks) + hooks = hooks.values() # avoid six for performance + + with cuda.get_device_from_array(*(in_data + out_grad_array)): + for hook in hooks: + hook.backward_preprocess(func, in_data, out_grad_array) + + # Collect the current input gradients. 
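Both F.forget (changed earlier in this patch) and Variable.backward now funnel into _backprop_to_all. A minimal end-to-end sketch that exercises both paths, with a toy wrapped function:

import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.random.rand(3, 4).astype(np.float32))
# F.forget recomputes its forward pass on backward and then backprops
# through the recomputed graph from all of its outputs.
y = F.forget(lambda a: F.relu(a) * 2, x)
y.grad = np.ones_like(y.array)
y.backward()
assert x.grad.shape == x.shape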
+ target_inputs = [inputs[i] for i in target_input_indexes] + # Keep the order for the portability, rather than + # in_grad = {x: grads.get_as_list(x) + # for x in set(target_inputs)} + in_grad = collections.OrderedDict() + for x in target_inputs: + if x not in in_grad: + in_grad[x] = grads.get_as_list(x) + # to reduce memory usage + x._set_grad_var_if_available(None) + + _backprop_utils.backprop_step( + func, target_input_indexes, out_grad, in_grad) + + for hook in hooks: + hook.backward_postprocess(func, in_data, out_grad_array) + + for y, gy in six.moves.zip(outputs, out_grad): + if y is not None and y not in root_nodes: + y._set_grad_var_if_available( + gy if retain_grad else None) + del gy, out_grad # to reduce memory usage + + for x, gx in in_grad.items(): + if not gx: # gradient == None + continue + + for gx_elem in gx: + _check_grad_type(func, x, True, gx_elem, True) + del gx_elem # to reduce memory usage + + if x.creator_node is None: # leaf + leaf_nodes.add(x) + else: + add_cand(x.creator_node) + del gx, in_grad # to reduce memory usage + + for x in leaf_nodes: + x_var = x.get_variable_or_none() + gx = grads.pop(x) + if x_var is not None: + x_var._set_grad_var_without_check(gx) + x_var._loss_scale = loss_scale + grads.assert_no_grads() + + class Parameter(Variable): """Parameter variable that can be registered to a link. diff --git a/chainermn/functions/batch_normalization.py b/chainermn/functions/batch_normalization.py index 6b34bc14fb53..a7ce5f9af35a 100644 --- a/chainermn/functions/batch_normalization.py +++ b/chainermn/functions/batch_normalization.py @@ -13,15 +13,6 @@ import six -def _as4darray(arr): - if arr.ndim == 0: - return arr.reshape(1, 1, 1, 1) - elif arr.ndim == 4: - return arr - else: - return arr.reshape(arr.shape[0], -1, 1, 1) - - def _xhat(x, mean, std, expander): x_mu = x - mean[expander] x_mu /= std[expander] diff --git a/chainerx/__init__.py b/chainerx/__init__.py index a431d4426f1b..81ce6b5ccc1c 100644 --- a/chainerx/__init__.py +++ b/chainerx/__init__.py @@ -38,13 +38,17 @@ _global_context = _core.Context() _core.set_global_default_context(_global_context) + # Implements ndarray methods in Python from chainerx import _ndarray - from chainerx import _docs - - # Add workaround implementation for NumPy ndarray-compatible functions _ndarray.populate() + # Temporary workaround implementations that fall back to NumPy/CuPy's + # respective functions. + from chainerx import _fallback_workarounds + _fallback_workarounds.populate() + # Dynamically inject docstrings + from chainerx import _docs _docs.set_docs() else: diff --git a/chainerx/_docs/__init__.py b/chainerx/_docs/__init__.py index d569f485d31b..f1f5a4dd43ca 100644 --- a/chainerx/_docs/__init__.py +++ b/chainerx/_docs/__init__.py @@ -10,7 +10,8 @@ def set_doc(obj, docstring): - if inspect.ismethoddescriptor(obj) or inspect.isroutine(obj): + if ((inspect.ismethoddescriptor(obj) or inspect.isroutine(obj)) + and not inspect.isfunction(obj)): # pybind-generated functions and methods _core._set_pybind_doc(obj, docstring) return diff --git a/chainerx/_fallback_workarounds.py b/chainerx/_fallback_workarounds.py new file mode 100644 index 000000000000..0b665a184622 --- /dev/null +++ b/chainerx/_fallback_workarounds.py @@ -0,0 +1,141 @@ +# This file defines workaround implementation for +# NumPy-compatibility functions that fall back to NumPy/CuPy functions +# for native/cuda devices respecitvely. +# The workaround does not support backprop, and also requires external +# libraries mentioned above. 
+# Functions defined in this file should be considered to have high priority for +# genuine implementations. + +import numpy + +import chainerx + + +try: + import cupy +except Exception: + cupy = None + + +def _to_numpy(array): + assert isinstance(array, chainerx.ndarray) + return chainerx.to_numpy(array, copy=False) + + +def _from_numpy(array): + assert isinstance(array, numpy.ndarray) + return chainerx.array(array, copy=False) + + +def _to_cupy(array): + assert cupy is not None + # Convert to cupy.ndarray on the same device as source array + return cupy.ndarray( + array.shape, + array.dtype, + cupy.cuda.MemoryPointer( + cupy.cuda.UnownedMemory( + array.data_ptr + array.offset, + array.data_size, + array, + array.device.index), + 0), + strides=array.strides) + + +def _from_cupy(array): + assert cupy is not None + assert isinstance(array, cupy.ndarray) + device = chainerx.get_device('cuda', array.device.id) + return chainerx._core._fromrawpointer( + array.data.mem.ptr, + array.shape, + array.dtype, + array.strides, + device, + array.data.ptr - array.data.mem.ptr, + array) + + +def _from_chainerx(array): + # Converts chainerx.ndarray to numpy/cupy.ndarray. + # Objects with other types are kept intact. + if not isinstance(array, chainerx.ndarray): + return array + backend_name = array.device.backend.name + if backend_name == 'native': + return _to_numpy(array) + if backend_name == 'cuda': + if cupy is None: + raise RuntimeError( + 'ChainerX fallback implementation for cuda backend requires ' + 'cupy to be installed.') + return _to_cupy(array) + raise RuntimeError( + 'ChainerX fallback implementation only supports native or cuda ' + 'backends.') + + +def _to_chainerx(array): + # Converts numpy/cupy.ndarray to chainerx.ndarray. + # Objects with other types are kept intact. + if isinstance(array, numpy.ndarray): + return _from_numpy(array) + elif cupy is not None and isinstance(array, cupy.ndarray): + return _from_cupy(array) + return array + + +def populate(): + ndarray = chainerx.ndarray + + # __getitem__ with advanced indexing + old_getitem = ndarray.__getitem__ + + def __getitem__(arr, key): + try: + return old_getitem(arr, key) + except (IndexError, chainerx.DimensionError): + pass + + is_backprop_required = arr.is_backprop_required() + + arr = _from_chainerx(arr) + key = _from_chainerx(key) + + if cupy is not None and isinstance(arr, cupy.ndarray): + with arr.device: + ret = arr[key] + else: + ret = arr[key] + + # Doing this check after the fallback __getitem__ because the error + # which caused the fallback might not be due to advanced indexing. + # In such case the fallback __getitem__ should also raise the error. 
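A sketch of the same native-backend round trip done by hand, assuming ChainerX is built with the native backend available; it mirrors the fallback __getitem__ above for an advanced-indexing case:

import numpy as np
import chainerx as chx

a = chx.arange(6, dtype=chx.float32).reshape(2, 3)
idx = np.array([1, 0, 1])
# Manual fallback: view as NumPy, use advanced indexing there, then wrap
# the result back into a chainerx.ndarray.
out = chx.array(chx.to_numpy(a, copy=False)[idx], copy=False)
assert out.shape == (3, 3)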
+ + if is_backprop_required: + raise RuntimeError( + 'ChainerX getitem fallback for advanced indexing is not ' + 'supported for arrays that are connected to a graph.') + + return _to_chainerx(ret) + + # __setitem__ with advanced indexing + def __setitem__(self, key, value): + if self.is_backprop_required(): + raise RuntimeError( + 'ChainerX setitem fallback for advanced indexing is not ' + 'supported for arrays that are connected to a graph.') + + self = _from_chainerx(self) + key = _from_chainerx(key) + value = _from_chainerx(value) + + if cupy is not None and isinstance(self, cupy.ndarray): + with self.device: + self[key] = value + else: + self[key] = value + + ndarray.__setitem__ = __setitem__ + ndarray.__getitem__ = __getitem__ diff --git a/chainerx/_ndarray.py b/chainerx/_ndarray.py index 06752f1a6a63..c1d724c527f8 100644 --- a/chainerx/_ndarray.py +++ b/chainerx/_ndarray.py @@ -1,11 +1,9 @@ -# This file defines inefficient workaround implementation for -# NumPy ndarray-compatibility functions. This file should ultimately be -# emptied by implementing those functions in more efficient manner. +# This file implements chainerx.ndarray methods that can be defined only in +# Python. import chainerx -# Populates chainerx.ndarray methods in the chainerx namespace def populate(): def clip(self, a_min, a_max): diff --git a/chainerx_cc/.clang-tidy b/chainerx_cc/.clang-tidy index a743a702dc69..0d3fc8064da5 100644 --- a/chainerx_cc/.clang-tidy +++ b/chainerx_cc/.clang-tidy @@ -1,4 +1,4 @@ -Checks: '-*,boost-*,bugprone-*,cert-*,cppcoreguidelines-*,clang-analyzer-*,google-*,misc-*,modernize-*,performance-*,readability-*,-google-runtime-references,-cppcoreguidelines-pro-bounds-pointer-arithmetic' +Checks: '-*,boost-*,bugprone-*,cert-*,cppcoreguidelines-*,clang-analyzer-*,google-*,misc-*,modernize-*,performance-*,readability-*,-google-runtime-references,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-modernize-use-auto' WarningsAsErrors: '*' HeaderFilterRegex: '^chainerx/chainerx/.*' AnalyzeTemporaryDtors: false diff --git a/chainerx_cc/LICENSE.txt b/chainerx_cc/LICENSE.txt index 22731619cb7a..f51c0c0c2144 100644 --- a/chainerx_cc/LICENSE.txt +++ b/chainerx_cc/LICENSE.txt @@ -94,3 +94,73 @@ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +###################################################################### +# ChainerX borrowed the following functions from NumPy. +# - npy_floatbits_to_halfbits, +# - npy_doublebits_to_halfbits, +# - npy_halfbits_to_floatbits, +# - npy_halfbits_to_doublebits, +# https://github.com/numpy/numpy/blob/04c2f33843782cd27a16f919477b1b27c4b01e5a/numpy/core/src/npymath/halffloat.c +# It is included in chainerx/native/native_float16.cc. +###################################################################### +Copyright (c) 2005-2017, NumPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +The NumPy repository and source distributions bundle several libraries that are +compatibly licensed. We list these here. + +Name: Numpydoc +Files: doc/sphinxext/numpydoc/* +License: 2-clause BSD + For details, see doc/sphinxext/LICENSE.txt + +Name: scipy-sphinx-theme +Files: doc/scipy-sphinx-theme/* +License: 3-clause BSD, PSF and Apache 2.0 + For details, see doc/scipy-sphinx-theme/LICENSE.txt + +Name: lapack-lite +Files: numpy/linalg/lapack_lite/* +License: 3-clause BSD + For details, see numpy/linalg/lapack_lite/LICENSE.txt + +Name: tempita +Files: tools/npy_tempita/* +License: BSD derived + For details, see tools/npy_tempita/license.txt + +Name: dragon4 +Files: numpy/core/src/multiarray/dragon4.c +License: One of a kind + For license text, see numpy/core/src/multiarray/dragon4.c diff --git a/chainerx_cc/chainerx/CMakeLists.txt b/chainerx_cc/chainerx/CMakeLists.txt index 3363fdf4683a..b921f2fea9a4 100644 --- a/chainerx_cc/chainerx/CMakeLists.txt +++ b/chainerx_cc/chainerx/CMakeLists.txt @@ -34,6 +34,7 @@ install(FILES dtype.h enum.h error.h + float16.h graph.h hash_combine.h index_iterator.h @@ -73,6 +74,7 @@ add_library(chainerx SHARED device.cc device_id.cc dtype.cc + float16.cc graph.cc numeric.cc numerical_gradient.cc @@ -116,6 +118,7 @@ if(${CHAINERX_BUILD_TEST}) context_test.cc device_test.cc dtype_test.cc + float16_test.cc index_iterator_test.cc indexable_array_test.cc indexer_test.cc diff --git a/chainerx_cc/chainerx/cuda/cuda_conv_test.cc b/chainerx_cc/chainerx/cuda/cuda_conv_test.cc index adb11890f07f..159219023583 100644 --- a/chainerx_cc/chainerx/cuda/cuda_conv_test.cc +++ b/chainerx_cc/chainerx/cuda/cuda_conv_test.cc @@ -47,8 +47,8 @@ TEST(CudaConvTest, FwdAlgoCache) { std::copy(kernel_size.begin(), kernel_size.end(), std::back_inserter(w_shape)); Shape b_shape{out_channels}; - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); + Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f); Array b = testing::BuildArray(b_shape).WithData({-0.2f, 1.3f}); // New parameters should create new auto tuning caches, and same parameters should not. 
@@ -93,8 +93,8 @@ TEST(CudaConvTest, BwdDatadAlgoCache) { std::copy(kernel_size.begin(), kernel_size.end(), std::back_inserter(w_shape)); Shape b_shape{out_channels}; - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); + Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f); Array b = testing::BuildArray(b_shape).WithData({-0.2f, 1.3f}); // New parameters should create new auto tuning caches, and same parameters should not. @@ -137,7 +137,7 @@ TEST(CudaConvTest, BwdFilterAlgoCache) { Shape w_shape{out_channels, in_channels}; std::copy(kernel_size.begin(), kernel_size.end(), std::back_inserter(w_shape)); - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); // New parameters should create new auto tuning caches, and same parameters should not. // ConvGradW is not exposed as routines function, so call CudaDevice::ConvGradWeight directly. diff --git a/chainerx_cc/chainerx/float16.cc b/chainerx_cc/chainerx/float16.cc new file mode 100644 index 000000000000..13d74c7fe7de --- /dev/null +++ b/chainerx_cc/chainerx/float16.cc @@ -0,0 +1,243 @@ +#include "chainerx/float16.h" + +#include + +namespace chainerx { +namespace { + +union UnionFloatUint { +public: + explicit UnionFloatUint(float v) : f{v} {} + explicit UnionFloatUint(uint32_t v) : i{v} {} + float f; + uint32_t i; +}; + +union UnionDoubleUint { +public: + explicit UnionDoubleUint(double v) : f{v} {} + explicit UnionDoubleUint(uint64_t v) : i{v} {} + double f; + uint64_t i; +}; + +// Borrowed from npy_floatbits_to_halfbits +// +// See LICENSE.txt of ChainerX. +uint16_t FloatbitsToHalfbits(uint32_t f) { + uint16_t h_sgn = static_cast((f & 0x80000000U) >> 16); + uint32_t f_exp = (f & 0x7f800000U); + + // Exponent overflow/NaN converts to signed inf/NaN + if (f_exp >= 0x47800000U) { + if (f_exp != 0x7f800000U) { + // Overflow to signed inf + return h_sgn + 0x7c00U; + } + + uint32_t f_sig = (f & 0x007fffffU); + if (f_sig == 0) { + // Signed inf + return h_sgn + 0x7c00U; + } + + // NaN - propagate the flag in the significand... + uint16_t ret = static_cast(0x7c00U + (f_sig >> 13)); + + // ...but make sure it stays a NaN + if (ret == 0x7c00U) { + ++ret; + } + return h_sgn + ret; + } + + // Exponent underflow converts to a subnormal half or signed zero + if (f_exp <= 0x38000000U) { + if (f_exp < 0x33000000U) { + // Signed zero + return h_sgn; + } + + // Make the subnormal significand + f_exp >>= 23; + uint32_t f_sig = (0x00800000U + (f & 0x007fffffU)) >> (113 - f_exp); + + // Handle rounding by adding 1 to the bit beyond half precision + f_sig += 0x00001000U; + uint16_t h_sig = static_cast(f_sig >> 13); + + // If the rounding causes a bit to spill into h_exp, it will increment h_exp from zero to one and h_sig will be zero. This is the + // correct result. 
+ return h_sgn + h_sig; + } + + // Regular case with no overflow or underflow + uint16_t h_exp = static_cast((f_exp - 0x38000000U) >> 13); + + // Handle rounding by adding 1 to the bit beyond half precision + uint32_t f_sig = (f & 0x007fffffU) + 0x00001000U; + uint16_t h_sig = static_cast(f_sig >> 13); + + // If the rounding causes a bit to spill into h_exp, it will increment h_exp by one and h_sig will be zero. This is the correct result. + // h_exp may increment to 15, at greatest, in which case the result overflows to a signed inf. + return h_sgn + h_exp + h_sig; +} + +// Borrowed from npy_doublebits_to_halfbits +// +// See LICENSE.txt of ChainerX. +uint16_t DoublebitsToHalfbits(uint64_t d) { + uint16_t h_sgn = (d & 0x8000000000000000ULL) >> 48; + uint64_t d_exp = (d & 0x7ff0000000000000ULL); + + // Exponent overflow/NaN converts to signed inf/NaN + if (d_exp >= 0x40f0000000000000ULL) { + if (d_exp != 0x7ff0000000000000ULL) { + // Overflow to signed inf + return h_sgn + 0x7c00U; + } + + uint64_t d_sig = (d & 0x000fffffffffffffULL); + if (d_sig == 0) { + // Signed inf + return h_sgn + 0x7c00U; + } + + // NaN - propagate the flag in the significand... + uint16_t ret = static_cast(0x7c00U + (d_sig >> 42)); + + // ...but make sure it stays a NaN + if (ret == 0x7c00U) { + ++ret; + } + return h_sgn + ret; + } + + // Exponent underflow converts to subnormal half or signed zero + if (d_exp <= 0x3f00000000000000ULL) { + if (d_exp < 0x3e60000000000000ULL) { + // Signed zero + return h_sgn; + } + + // Make the subnormal significand + d_exp >>= 52; + uint64_t d_sig = (0x0010000000000000ULL + (d & 0x000fffffffffffffULL)) >> (1009 - d_exp); + + // Handle rounding by adding 1 to the bit beyond half precision + d_sig += 0x0000020000000000ULL; + uint16_t h_sig = static_cast(d_sig >> 42); + + // If the rounding causes a bit to spill into h_exp, it will increment h_exp from zero to one and h_sig will be zero. This is the + // correct result. + return h_sgn + h_sig; + } + + // Regular case with no overflow or underflow + uint16_t h_exp = static_cast((d_exp - 0x3f00000000000000ULL) >> 42); + + // Handle rounding by adding 1 to the bit beyond half precision + uint64_t d_sig = (d & 0x000fffffffffffffULL) + 0x0000020000000000ULL; + uint16_t h_sig = static_cast(d_sig >> 42); + + // If the rounding causes a bit to spill into h_exp, it will increment h_exp by one and h_sig will be zero. This is the correct result. + // h_exp may increment to 15, at greatest, in which case the result overflows to a signed inf. + return h_sgn + h_exp + h_sig; +} + +// Borrowed from npy_halfbits_to_floatbits +// +// See LICENSE.txt of ChainerX. 
+uint32_t HalfbitsToFloatbits(uint16_t h) { + uint16_t h_exp = (h & 0x7c00U); + uint32_t f_sgn = (static_cast(h) & 0x8000U) << 16; + switch (h_exp) { + case 0x0000U: { // 0 or subnormal + uint16_t h_sig = (h & 0x03ffU); + + // Signed zero + if (h_sig == 0) { + return f_sgn; + } + + // Subnormal + h_sig <<= 1; + while ((h_sig & 0x0400U) == 0) { + h_sig <<= 1; + ++h_exp; + } + + uint32_t f_exp = (static_cast(127 - 15 - h_exp)) << 23; + uint32_t f_sig = (static_cast(h_sig & 0x03ffU)) << 13; + return f_sgn + f_exp + f_sig; + } + case 0x7c00U: { // inf or NaN + // All-ones exponent and a copy of the significand + return f_sgn + 0x7f800000U + ((static_cast(h & 0x03ffU)) << 13); + } + default: { // normalized + // Just need to adjust the exponent and shift + return f_sgn + ((static_cast(h & 0x7fffU) + 0x1c000U) << 13); + } + } +} + +// Borrowed from npy_halfbits_to_doublebits +// +// See LICENSE.txt of ChainerX. +uint64_t HalfbitsToDoublebits(uint16_t h) { + uint16_t h_exp = (h & 0x7c00U); + uint64_t d_sgn = (static_cast(h) & 0x8000U) << 48; + switch (h_exp) { + case 0x0000U: { // 0 or subnormal + uint16_t h_sig = (h & 0x03ffU); + + // Signed zero + if (h_sig == 0) { + return d_sgn; + } + + // Subnormal + h_sig <<= 1; + while ((h_sig & 0x0400U) == 0) { + h_sig <<= 1; + ++h_exp; + } + + uint64_t d_exp = (static_cast(1023 - 15 - h_exp)) << 52; + uint64_t d_sig = (static_cast(h_sig & 0x03ffU)) << 42; + return d_sgn + d_exp + d_sig; + } + case 0x7c00U: { // inf or NaN + // All-ones exponent and a copy of the significand + return d_sgn + 0x7ff0000000000000ULL + ((static_cast(h & 0x03ffU)) << 42); + } + default: { // normalized + // Just need to adjust the exponent and shift + return d_sgn + ((static_cast(h & 0x7fffU) + 0xfc000U) << 42); + } + } +} + +uint16_t FloatToHalf(float v) { + return FloatbitsToHalfbits(UnionFloatUint(v).i); // NOLINT(cppcoreguidelines-pro-type-union-access) +} +uint16_t DoubleToHalf(double v) { + return DoublebitsToHalfbits(UnionDoubleUint(v).i); // NOLINT(cppcoreguidelines-pro-type-union-access) +} +float HalfToFloat(uint16_t v) { + return UnionFloatUint(HalfbitsToFloatbits(v)).f; // NOLINT(cppcoreguidelines-pro-type-union-access) +} +double HalfToDouble(uint16_t v) { + return UnionDoubleUint(HalfbitsToDoublebits(v)).f; // NOLINT(cppcoreguidelines-pro-type-union-access) +} + +} // namespace + +Half::Half(float v) : data_{FloatToHalf(v)} {} +Half::Half(double v) : data_{DoubleToHalf(v)} {} + +Half::operator float() const { return HalfToFloat(data_); } +Half::operator double() const { return HalfToDouble(data_); } + +} // namespace chainerx diff --git a/chainerx_cc/chainerx/float16.h b/chainerx_cc/chainerx/float16.h new file mode 100644 index 000000000000..5a444c831ae5 --- /dev/null +++ b/chainerx_cc/chainerx/float16.h @@ -0,0 +1,60 @@ +#pragma once + +#include + +namespace chainerx { + +class Half { +private: + struct FromDataTag {}; + +public: + Half() {} + explicit Half(float v); + explicit Half(double v); + + explicit Half(bool v) : Half{static_cast(v)} {} + explicit Half(int16_t v) : Half{static_cast(v)} {} + explicit Half(uint16_t v) : Half{static_cast(v)} {} + explicit Half(int32_t v) : Half{static_cast(v)} {} + explicit Half(uint32_t v) : Half{static_cast(v)} {} + explicit Half(int64_t v) : Half{static_cast(v)} {} + explicit Half(uint64_t v) : Half{static_cast(v)} {} + + explicit operator float() const; + explicit operator double() const; + + explicit operator bool() const { return static_cast(*this); } + explicit operator int16_t() const { return static_cast(*this); } 
+ explicit operator uint16_t() const { return static_cast(*this); } + explicit operator int32_t() const { return static_cast(*this); } + explicit operator uint32_t() const { return static_cast(*this); } + explicit operator int64_t() const { return static_cast(*this); } + explicit operator uint64_t() const { return static_cast(*this); } + explicit operator signed char() const { return static_cast(*this); } + explicit operator unsigned char() const { return static_cast(*this); } + + bool operator==(const Half& r) const { return static_cast(*this) == static_cast(r); } + bool operator!=(const Half& r) const { return static_cast(*this) != static_cast(r); } + bool operator<(const Half& r) const { return static_cast(*this) < static_cast(r); } + bool operator>(const Half& r) const { return static_cast(*this) > static_cast(r); } + bool operator<=(const Half& r) const { return static_cast(*this) <= static_cast(r); } + bool operator>=(const Half& r) const { return static_cast(*this) >= static_cast(r); } + Half operator-() const { return Half{-static_cast(*this)}; } + Half operator+(const Half& r) const { return Half{static_cast(*this) + static_cast(r)}; } + Half operator-(const Half& r) const { return Half{static_cast(*this) - static_cast(r)}; } + Half operator*(const Half& r) const { return Half{static_cast(*this) * static_cast(r)}; } + Half operator/(const Half& r) const { return Half{static_cast(*this) / static_cast(r)}; } + Half& operator+=(const Half& r) { return *this = *this + r; } + Half& operator-=(const Half& r) { return *this = *this - r; } + Half& operator*=(const Half& r) { return *this = *this * r; } + Half& operator/=(const Half& r) { return *this = *this / r; } + + uint16_t data() const { return data_; } + static Half FromData(uint16_t data) { return Half{data, FromDataTag{}}; } + +private: + explicit Half(uint16_t data, FromDataTag) : data_{data} {} + uint16_t data_; +}; +} // namespace chainerx diff --git a/chainerx_cc/chainerx/float16_test.cc b/chainerx_cc/chainerx/float16_test.cc new file mode 100644 index 000000000000..920fba72cc53 --- /dev/null +++ b/chainerx_cc/chainerx/float16_test.cc @@ -0,0 +1,239 @@ +#include "chainerx/float16.h" + +#include +#include +#include +#include + +#include + +namespace chainerx { +namespace { + +bool IsNan(const Half& half) { + uint16_t exp = half.data() & 0x7c00; + uint16_t frac = half.data() & 0x03ff; + return exp == 0x7c00 && frac != 0x0000; +} + +// Checks if `d` is equal to FromHalf(ToHalf(d)) with tolerance `tol`. +// This function cannot take NaN as a parameter. The cast of NaN is tested in `Float16Nan`. +void CheckToHalfFromHalfNear(double d, double tol) { + Half half{d}; + Half half_float{static_cast(d)}; + EXPECT_EQ(half.data(), half_float.data()); + float f_result = static_cast(half); + double d_result = static_cast(half); + + ASSERT_FALSE(std::isnan(d)); + EXPECT_FALSE(std::isnan(f_result)); + EXPECT_FALSE(std::isnan(d_result)); + EXPECT_FALSE(IsNan(half)); + + if (std::isinf(d)) { + // Signed inf + EXPECT_EQ(d, f_result); + EXPECT_EQ(d, d_result); + } else { + // Absolute error or relative error should be less or equal to tol. + tol = std::max(tol, tol * std::abs(d)); + EXPECT_NEAR(d, f_result, tol); + EXPECT_NEAR(d, d_result, tol); + } +} + +// Checks if `h` is equal to ToHalf(FromHalf(h)) exactly. +// This function cannot take NaN as a parameter. The cast of NaN is tested in `Float16Nan`. 
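+// Note: every finite or infinite binary16 value is exactly representable in binary32 and binary64,
+// so the Half -> float/double -> Half round-trip is expected to reproduce the identical bit pattern.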
+void CheckFromHalfToHalfEq(const Half& half) { + float f = static_cast(half); + double d = static_cast(half); + EXPECT_EQ(d, static_cast(f)); + + ASSERT_FALSE(IsNan(half)); + EXPECT_FALSE(std::isnan(f)); + EXPECT_FALSE(std::isnan(d)); + + EXPECT_EQ(half.data(), Half{f}.data()); + EXPECT_EQ(half.data(), Half{d}.data()); +} + +TEST(NativeFloat16Test, Float16Zero) { + EXPECT_EQ(Half{float{0.0}}.data(), 0x0000); + EXPECT_EQ(Half{float{-0.0}}.data(), 0x8000); + EXPECT_EQ(Half{double{0.0}}.data(), 0x0000); + EXPECT_EQ(Half{double{-0.0}}.data(), 0x8000); + EXPECT_EQ(static_cast(Half::FromData(0x0000)), 0.0); + EXPECT_EQ(static_cast(Half::FromData(0x8000)), -0.0); + EXPECT_EQ(static_cast(Half::FromData(0x0000)), 0.0); + EXPECT_EQ(static_cast(Half::FromData(0x8000)), -0.0); + // Checks if the value is casted to 0.0 or -0.0 + EXPECT_EQ(1 / static_cast(Half::FromData(0x0000)), std::numeric_limits::infinity()); + EXPECT_EQ(1 / static_cast(Half::FromData(0x8000)), -std::numeric_limits::infinity()); + EXPECT_EQ(1 / static_cast(Half::FromData(0x0000)), std::numeric_limits::infinity()); + EXPECT_EQ(1 / static_cast(Half::FromData(0x8000)), -std::numeric_limits::infinity()); +} + +TEST(NativeFloat16Test, Float16Normalized) { + for (double x = 1e-3; x < 1e3; x *= 1.01) { // NOLINT(clang-analyzer-security.FloatLoopCounter) + EXPECT_NE(Half{x}.data() & 0x7c00, 0); + CheckToHalfFromHalfNear(x, 1e-3); + CheckToHalfFromHalfNear(-x, 1e-3); + } + for (uint16_t bit = 0x0400; bit < 0x7c00; ++bit) { + CheckFromHalfToHalfEq(Half::FromData(bit | 0x0000)); + CheckFromHalfToHalfEq(Half::FromData(bit | 0x8000)); + } +} + +TEST(NativeFloat16Test, Float16Denormalized) { + for (double x = 1e-7; x < 1e-5; x += 1e-7) { // NOLINT(clang-analyzer-security.FloatLoopCounter) + // Check if the underflow gap around zero is filled with denormal number. + EXPECT_EQ(Half{x}.data() & 0x7c00, 0x0000); + EXPECT_NE(Half{x}.data() & 0x03ff, 0x0000); + CheckToHalfFromHalfNear(x, 1e-7); + CheckToHalfFromHalfNear(-x, 1e-7); + } + for (uint16_t bit = 0x0000; bit < 0x0400; ++bit) { + CheckFromHalfToHalfEq(Half::FromData(bit | 0x0000)); + CheckFromHalfToHalfEq(Half::FromData(bit | 0x8000)); + } +} + +TEST(NativeFloat16Test, Float16Inf) { + EXPECT_EQ(Half{std::numeric_limits::infinity()}.data(), 0x7c00); + EXPECT_EQ(Half{-std::numeric_limits::infinity()}.data(), 0xfc00); + EXPECT_EQ(Half{std::numeric_limits::infinity()}.data(), 0x7c00); + EXPECT_EQ(Half{-std::numeric_limits::infinity()}.data(), 0xfc00); + EXPECT_EQ(std::numeric_limits::infinity(), static_cast(Half::FromData(0x7c00))); + EXPECT_EQ(-std::numeric_limits::infinity(), static_cast(Half::FromData(0xfc00))); + EXPECT_EQ(std::numeric_limits::infinity(), static_cast(Half::FromData(0x7c00))); + EXPECT_EQ(-std::numeric_limits::infinity(), static_cast(Half::FromData(0xfc00))); +} + +TEST(NativeFloat16Test, Float16Nan) { + for (uint16_t bit = 0x7c01; bit < 0x8000; ++bit) { + EXPECT_TRUE(std::isnan(static_cast(Half::FromData(bit | 0x0000)))); + EXPECT_TRUE(std::isnan(static_cast(Half::FromData(bit | 0x8000)))); + EXPECT_TRUE(std::isnan(static_cast(Half::FromData(bit | 0x0000)))); + EXPECT_TRUE(std::isnan(static_cast(Half::FromData(bit | 0x8000)))); + } + EXPECT_TRUE(IsNan(Half{float{NAN}})); + EXPECT_TRUE(IsNan(Half{double{NAN}})); +} + +// Get the partial set of all Half values for reduction of test execution time. +// The returned list includes the all values whose trailing 8 digits are `0b00000000` or `0b01010101`. +// This list includes all special values (e.g. 
signed zero, infinity) and some of normalized/denormalize numbers and NaN. +std::vector GetFloat16Values() { + std::vector half_values; + half_values.reserve(1 << 9); + // Use uint32_t instead of uint16_t to avoid overflow + for (uint32_t bit = 0x0000; bit <= 0xffff; bit += 0x0100) { + half_values.emplace_back(Half::FromData(bit | 0x0000)); + half_values.emplace_back(Half::FromData(bit | 0x0055)); + } + return half_values; +} + +// Checks if `l` is equal to `r` or both of them are NaN. +void ExpectEqFloat16(const Half& l, const Half& r) { + if (IsNan(l) && IsNan(r)) { + return; + } + EXPECT_EQ(l.data(), r.data()); +} + +TEST(NativeFloat16Test, Float16Neg) { + // Use uint32_t instead of uint16_t to avoid overflow + for (uint32_t bit = 0x0000; bit <= 0xffff; ++bit) { + Half x = Half::FromData(bit); + Half expected{-static_cast(x)}; + ExpectEqFloat16(expected, -x); + } +} + +TEST(NativeFloat16Test, Float16Add) { + for (const Half& x : GetFloat16Values()) { + for (const Half& y : GetFloat16Values()) { + Half expected{static_cast(x) + static_cast(y)}; + ExpectEqFloat16(expected, x + y); + ExpectEqFloat16(expected, y + x); + } + } +} + +TEST(NativeFloat16Test, Float16Subtract) { + for (const Half& x : GetFloat16Values()) { + for (const Half& y : GetFloat16Values()) { + Half expected{static_cast(x) - static_cast(y)}; + ExpectEqFloat16(expected, x - y); + } + } +} + +TEST(NativeFloat16Test, Float16Multiply) { + for (const Half& x : GetFloat16Values()) { + for (const Half& y : GetFloat16Values()) { + Half expected{static_cast(x) * static_cast(y)}; + ExpectEqFloat16(expected, x * y); + ExpectEqFloat16(expected, y * x); + EXPECT_EQ(expected.data(), (x * y).data()); + } + } +} + +TEST(NativeFloat16Test, Float16Divide) { + for (const Half& x : GetFloat16Values()) { + for (const Half& y : GetFloat16Values()) { + Half expected{static_cast(x) / static_cast(y)}; + ExpectEqFloat16(expected, x / y); + } + } +} + +TEST(NativeFloat16Test, Float16AddI) { + for (const Half& x : GetFloat16Values()) { + for (Half y : GetFloat16Values()) { + Half expected{static_cast(y) + static_cast(x)}; + Half z = (y += x); + ExpectEqFloat16(expected, y); + ExpectEqFloat16(expected, z); + } + } +} + +TEST(NativeFloat16Test, Float16SubtractI) { + for (const Half& x : GetFloat16Values()) { + for (Half y : GetFloat16Values()) { + Half expected{static_cast(y) - static_cast(x)}; + Half z = y -= x; + ExpectEqFloat16(expected, y); + ExpectEqFloat16(expected, z); + } + } +} + +TEST(NativeFloat16Test, Float16MultiplyI) { + for (const Half& x : GetFloat16Values()) { + for (Half y : GetFloat16Values()) { + Half expected{static_cast(y) * static_cast(x)}; + Half z = y *= x; + ExpectEqFloat16(expected, y); + ExpectEqFloat16(expected, z); + } + } +} + +TEST(NativeFloat16Test, Float16DivideI) { + for (const Half& x : GetFloat16Values()) { + for (Half y : GetFloat16Values()) { + Half expected{static_cast(y) / static_cast(x)}; + Half z = y /= x; + ExpectEqFloat16(expected, y); + ExpectEqFloat16(expected, z); + } + } +} + +} // namespace +} // namespace chainerx diff --git a/chainerx_cc/chainerx/python/core_module.cc b/chainerx_cc/chainerx/python/core_module.cc index 75babe8cbce8..c61b033a5f12 100644 --- a/chainerx_cc/chainerx/python/core_module.cc +++ b/chainerx_cc/chainerx/python/core_module.cc @@ -1,3 +1,7 @@ +#include +#include +#include + #include #include "chainerx/python/array.h" @@ -57,17 +61,18 @@ void InitChainerxModule(pybind11::module& m) { testing::testing_internal::InitChainerxTestingModule(m_testing); // Modifies __doc__ property of 
a pybind-generated function object. - m.def("_set_pybind_doc", [](py::handle obj, std::string docstring) { + m.def("_set_pybind_doc", [](py::handle obj, const std::string& docstring) { if (!py::isinstance(obj)) { throw py::type_error{"Object is not a function."}; } - // This function is called only sequentially from Python module. - // No need of race guard. - static std::vector* docstrings_keeper = new std::vector{}; + // std::malloc should be used here, since pybind uses std::free to free ml_doc. + auto* c_docstring = static_cast(std::malloc(docstring.size() + 1)); + if (c_docstring == nullptr) { + return; + } - docstrings_keeper->emplace_back(std::move(docstring)); - const char* c_docstring = docstrings_keeper->back().c_str(); + std::strncpy(c_docstring, docstring.c_str(), docstring.size() + 1); auto func = py::cast(obj); auto cfunc = func.cpp_function(); diff --git a/chainerx_cc/chainerx/routines/connection_test.cc b/chainerx_cc/chainerx/routines/connection_test.cc index 2143b1669f77..5ba4346fd1d9 100644 --- a/chainerx_cc/chainerx/routines/connection_test.cc +++ b/chainerx_cc/chainerx/routines/connection_test.cc @@ -56,8 +56,8 @@ TEST_THREAD_SAFE_P(ConnectionTest, Conv2d) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); + Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f); Array b = testing::BuildArray(b_shape).WithData({-0.2f, 1.3f}); Array e = testing::BuildArray(out_shape).WithData( @@ -97,8 +97,8 @@ TEST_THREAD_SAFE_P(ConnectionTest, ConvNd) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0).WithPadding(1); + Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0); Array b = testing::BuildArray(b_shape).WithData({-0.2, 1.3}); Array e = testing::BuildArray(out_shape).WithData( @@ -138,8 +138,8 @@ TEST_THREAD_SAFE_P(ConnectionTest, ConvCoverAll) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); + Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f); Array b = testing::BuildArray(b_shape).WithData({-0.2f, 1.3f}); Array e = @@ -186,8 +186,8 @@ TEST_P(ConnectionTest, ConvBackward) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1)).RequireGrad(); - Array w = 
(*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f)).RequireGrad(); + Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1)).RequireGrad(); + Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f)).RequireGrad(); Array b = (*testing::BuildArray(b_shape).WithData({-0.2f, 1.3f})).RequireGrad(); Array go = testing::BuildArray(out_shape).WithLinearData(-0.1f, 0.1f).WithPadding(1); @@ -234,8 +234,8 @@ TEST_P(ConnectionTest, ConvCoverAllBackward) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1)).RequireGrad(); - Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f)).RequireGrad(); + Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1)).RequireGrad(); + Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f)).RequireGrad(); Array b = (*testing::BuildArray(b_shape).WithData({-0.2f, 1.3f})).RequireGrad(); Array go = testing::BuildArray(out_shape).WithLinearData(-0.1f, 0.1f).WithPadding(1); @@ -278,8 +278,8 @@ TEST_P(ConnectionTest, ConvDoubleBackward) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1)).RequireGrad(); - Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f)).RequireGrad(); + Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1)).RequireGrad(); + Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f)).RequireGrad(); Array b = (*testing::BuildArray(b_shape).WithData({-0.2f, 1.3f})).RequireGrad(); Array go = (*testing::BuildArray(out_shape).WithLinearData(-0.3f, 0.1f).WithPadding(1)).RequireGrad(); @@ -332,8 +332,8 @@ TEST_P(ConnectionTest, ConvCoverAllDoubleBackward) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1)).RequireGrad(); - Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f)).RequireGrad(); + Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1)).RequireGrad(); + Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f)).RequireGrad(); Array b = (*testing::BuildArray(b_shape).WithData({-0.2f, 1.3f})).RequireGrad(); Array go = (*testing::BuildArray(out_shape).WithLinearData(-0.3f, 0.1f).WithPadding(1)).RequireGrad(); @@ -381,8 +381,8 @@ TEST_THREAD_SAFE_P(ConnectionTest, ConvTranspose) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); + Array w = 
testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f); Array b = testing::BuildArray(b_shape).WithData({-0.2f, 1.3f}); Array e = testing::BuildArray(out_shape).WithData( @@ -452,8 +452,8 @@ TEST_THREAD_SAFE_P(ConnectionTest, ConvTransposeOutSize) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1); - Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f); + Array x = testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1); + Array w = testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f); Array b = testing::BuildArray(b_shape).WithData({-0.2f, 1.3f}); Array e = testing::BuildArray(out_shape).WithData( @@ -522,8 +522,8 @@ TEST_P(ConnectionTest, ConvTransposeBackward) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1)).RequireGrad(); - Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f)).RequireGrad(); + Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1)).RequireGrad(); + Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f)).RequireGrad(); Array b = (*testing::BuildArray(b_shape).WithData({-0.2f, 1.3f})).RequireGrad(); Array go = testing::BuildArray(out_shape).WithLinearData(-0.1f, 0.1f).WithPadding(1); @@ -565,8 +565,8 @@ TEST_P(ConnectionTest, ConvTransposeDoubleBackward) { Shape out_shape{batch_size, out_channels}; std::copy(out_dims.begin(), out_dims.end(), std::back_inserter(out_shape)); - Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2, 1.0f).WithPadding(1)).RequireGrad(); - Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2, 1.0f)).RequireGrad(); + Array x = (*testing::BuildArray(x_shape).WithLinearData(-x_shape.GetTotalSize() / 2.0f, 1.0f).WithPadding(1)).RequireGrad(); + Array w = (*testing::BuildArray(w_shape).WithLinearData(-w_shape.GetTotalSize() / 2.0f, 1.0f)).RequireGrad(); Array b = (*testing::BuildArray(b_shape).WithData({-0.2f, 1.3f})).RequireGrad(); Array go = (*testing::BuildArray(out_shape).WithLinearData(-0.3f, 0.1f).WithPadding(1)).RequireGrad(); diff --git a/chainerx_cc/chainerx/routines/manipulation.cc b/chainerx_cc/chainerx/routines/manipulation.cc index 2409a0371afc..a17c62701c79 100644 --- a/chainerx_cc/chainerx/routines/manipulation.cc +++ b/chainerx_cc/chainerx/routines/manipulation.cc @@ -168,8 +168,8 @@ Array Reshape(const Array& a, const Shape& newshape) { Shape reduced_shape{}; Strides reduced_strides{}; if (total_size == 1) { - reduced_shape.push_back(int64_t{1}); - reduced_strides.push_back(item_size); + reduced_shape.emplace_back(int64_t{1}); + reduced_strides.emplace_back(item_size); } else { int8_t i = 0; // Ignore preceding 1-length dimensions @@ -193,8 +193,8 @@ Array Reshape(const Array& a, const Shape& newshape) { reduced_strides.back() = st; } else { // Otherwise, add a new shape and stride. 
- reduced_shape.push_back(dim); - reduced_strides.push_back(st); + reduced_shape.emplace_back(dim); + reduced_strides.emplace_back(st); } } } @@ -209,7 +209,7 @@ Array Reshape(const Array& a, const Shape& newshape) { size_t i_dim = 0; for (int64_t dim : out_shape) { if (dim <= 1) { - strides.push_back(last_stride); + strides.emplace_back(last_stride); continue; } if (i_dim >= reduced_shape.size() || reduced_shape[i_dim] % dim != 0) { @@ -219,7 +219,7 @@ Array Reshape(const Array& a, const Shape& newshape) { } reduced_shape[i_dim] /= dim; last_stride = reduced_shape[i_dim] * reduced_strides[i_dim]; - strides.push_back(last_stride); + strides.emplace_back(last_stride); if (reduced_shape[i_dim] == 1) { ++i_dim; } @@ -412,7 +412,7 @@ Array ConcatenateImpl(const std::vector& arrays, int8_t axis) { } } if (indices.size() < arrays.size() - 1) { - indices.push_back(shape[axis]); + indices.emplace_back(shape[axis]); } } @@ -568,7 +568,7 @@ Array Stack(const std::vector& arrays, int8_t axis) { const Array& gout = *bctx.output_grad(); std::vector gxs = StackGrad(gout, axis); for (size_t i = 0; i < gxs.size(); ++i) { - bctx.input_grad(i) = gxs[i]; + bctx.input_grad(i) = std::move(gxs[i]); } }); } @@ -628,7 +628,8 @@ std::vector Split(const Array& ary, int64_t sections, int8_t axis) { int64_t out_stride = ary.strides()[axis_norm]; int64_t out_offset = ary.offset(); - std::vector out; + std::vector out{}; + out.reserve(sections); for (int64_t i = 0; i < sections; ++i) { out.emplace_back(internal::MakeArray(out_shape, ary.strides(), ary.dtype(), ary.device(), ary.data(), out_offset)); @@ -651,19 +652,19 @@ std::vector Split(const Array& ary, std::vector indices, int8_t indices.emplace_back(in_dim); Shape out_shape = in_shape; - int64_t& out_dim = out_shape[axis_norm]; int64_t out_stride = ary.strides()[axis_norm]; int64_t out_offset = ary.offset(); int64_t slice_start = 0; - std::vector out; + std::vector out{}; + out.reserve(indices.size()); for (int64_t index : indices) { int64_t slice_stop = std::min(in_dim, std::max(int64_t{0}, index)); int64_t slice_step = slice_stop - slice_start; // Update the dimension of interest in the output shape. 
- out_dim = std::max(int64_t{0}, slice_step); + out_shape[axis_norm] = std::max(int64_t{0}, slice_step); out.emplace_back(internal::MakeArray(out_shape, ary.strides(), ary.dtype(), ary.device(), ary.data(), out_offset)); diff --git a/chainerx_cc/chainerx/routines/math.cc b/chainerx_cc/chainerx/routines/math.cc index c28475561867..b84001350caf 100644 --- a/chainerx_cc/chainerx/routines/math.cc +++ b/chainerx_cc/chainerx/routines/math.cc @@ -394,7 +394,7 @@ Array AMax(const Array& a, const OptionalAxes& axis, bool keepdims) { Array reshaped_out{}; if (keepdims) { reshaped_gout = gout; - reshaped_out = out; + reshaped_out = std::move(out); } else { // Add broadcastable dimensions to out and gout // for each one that was reduced in the forward operation diff --git a/chainerx_cc/chainerx/routines/normalization.cc b/chainerx_cc/chainerx/routines/normalization.cc index a187b5b83bd3..0dfe93ec5dcc 100644 --- a/chainerx_cc/chainerx/routines/normalization.cc +++ b/chainerx_cc/chainerx/routines/normalization.cc @@ -111,12 +111,17 @@ Array BatchNorm( BackwardContext& bctx) { const Array& gout = *bctx.output_grad(); - std::array ginputs = fb->Backward(gout.AsGradStopped()); - internal::MakeViewForForwardBackwardOutput(ginputs); - - const Array& gx = ginputs[0]; - const Array& ggamma = ginputs[1]; - const Array& gbeta = ginputs[2]; + Array gx{}; + Array ggamma{}; + Array gbeta{}; + { + std::array ginputs = fb->Backward(gout.AsGradStopped()); + internal::MakeViewForForwardBackwardOutput(ginputs); + + gx = std::move(ginputs[0]); + ggamma = std::move(ginputs[1]); + gbeta = std::move(ginputs[2]); + } CHAINERX_ASSERT(internal::GetArrayBody(gx)->nodes().empty()); CHAINERX_ASSERT(internal::GetArrayBody(ggamma)->nodes().empty()); @@ -169,18 +174,18 @@ Array BatchNorm( Array ggamma2 = r / gamma_reshaped; - bctx2.input_grad(0) = gx2; - bctx2.input_grad(1) = ggamma2; - bctx2.input_grad(2) = ggout2; + bctx2.input_grad(0) = std::move(gx2); + bctx2.input_grad(1) = std::move(ggamma2); + bctx2.input_grad(2) = std::move(ggout2); }); } bb2.Finalize(); } // TODO(niboshi): Assign at once - bctx.input_grad(0) = gx; - bctx.input_grad(1) = ggamma; - bctx.input_grad(2) = gbeta; + bctx.input_grad(0) = std::move(gx); + bctx.input_grad(1) = std::move(ggamma); + bctx.input_grad(2) = std::move(gbeta); }); } bb.Finalize(); diff --git a/chainerx_cc/chainerx/routines/pooling.cc b/chainerx_cc/chainerx/routines/pooling.cc index 1d95aa491a2f..c4310034c9b1 100644 --- a/chainerx_cc/chainerx/routines/pooling.cc +++ b/chainerx_cc/chainerx/routines/pooling.cc @@ -76,12 +76,12 @@ Array MaxPool( } bb3.Finalize(); } - bctx2.input_grad() = ggout; + bctx2.input_grad() = std::move(ggout); }); } bb2.Finalize(); } - bctx1.input_grad() = gx; + bctx1.input_grad() = std::move(gx); } StackVector kernel_size; @@ -132,7 +132,7 @@ Array AveragePool( } bb2.Finalize(); } - bctx.input_grad() = gx; + bctx.input_grad() = std::move(gx); }); } bb1.Finalize(); diff --git a/chainerx_cc/third_party/gsl-lite.cmake b/chainerx_cc/third_party/gsl-lite.cmake index 94ffdab1d15a..d1fce88034c2 100644 --- a/chainerx_cc/third_party/gsl-lite.cmake +++ b/chainerx_cc/third_party/gsl-lite.cmake @@ -4,7 +4,7 @@ project(gsl-lite-download NONE) include(ExternalProject) ExternalProject_Add(gsl-lite GIT_REPOSITORY https://github.com/martinmoene/gsl-lite - GIT_TAG v0.26.0 + GIT_TAG v0.32.0 SOURCE_DIR "${CMAKE_BINARY_DIR}/gsl-lite" BINARY_DIR "" CONFIGURE_COMMAND "" diff --git a/docs/image/model_parallel/averaging.png b/docs/image/model_parallel/averaging.png new file mode 100644 index 
000000000000..f7b91e48142f Binary files /dev/null and b/docs/image/model_parallel/averaging.png differ diff --git a/docs/image/model_parallel/comm_split.png b/docs/image/model_parallel/comm_split.png new file mode 100644 index 000000000000..1f5a0579acf4 Binary files /dev/null and b/docs/image/model_parallel/comm_split.png differ diff --git a/docs/image/model_parallel/communication_as_function.png b/docs/image/model_parallel/communication_as_function.png new file mode 100644 index 000000000000..d7dee61e6d1a Binary files /dev/null and b/docs/image/model_parallel/communication_as_function.png differ diff --git a/docs/image/model_parallel/delegate_variable_0.png b/docs/image/model_parallel/delegate_variable_0.png new file mode 100644 index 000000000000..df2ff4982ecc Binary files /dev/null and b/docs/image/model_parallel/delegate_variable_0.png differ diff --git a/docs/image/model_parallel/delegate_variable_1.png b/docs/image/model_parallel/delegate_variable_1.png new file mode 100644 index 000000000000..94e6bba0a2fd Binary files /dev/null and b/docs/image/model_parallel/delegate_variable_1.png differ diff --git a/docs/image/model_parallel/empty_dataset.png b/docs/image/model_parallel/empty_dataset.png new file mode 100644 index 000000000000..97538d47caf1 Binary files /dev/null and b/docs/image/model_parallel/empty_dataset.png differ diff --git a/docs/image/model_parallel/model_parallel_mlp.png b/docs/image/model_parallel/model_parallel_mlp.png new file mode 100644 index 000000000000..ee0a0684025b Binary files /dev/null and b/docs/image/model_parallel/model_parallel_mlp.png differ diff --git a/docs/image/model_parallel/multi_node_iterator.png b/docs/image/model_parallel/multi_node_iterator.png new file mode 100644 index 000000000000..4a3781758fc2 Binary files /dev/null and b/docs/image/model_parallel/multi_node_iterator.png differ diff --git a/docs/image/model_parallel/parallel_conv.png b/docs/image/model_parallel/parallel_conv.png new file mode 100644 index 000000000000..b834d3674e6f Binary files /dev/null and b/docs/image/model_parallel/parallel_conv.png differ diff --git a/docs/image/model_parallel/pseudo_connect_0.png b/docs/image/model_parallel/pseudo_connect_0.png new file mode 100644 index 000000000000..5586c88e34db Binary files /dev/null and b/docs/image/model_parallel/pseudo_connect_0.png differ diff --git a/docs/image/model_parallel/pseudo_connect_1.png b/docs/image/model_parallel/pseudo_connect_1.png new file mode 100644 index 000000000000..79cb45483232 Binary files /dev/null and b/docs/image/model_parallel/pseudo_connect_1.png differ diff --git a/docs/image/model_parallel/scatter_dataset.png b/docs/image/model_parallel/scatter_dataset.png new file mode 100644 index 000000000000..996ae4c8d7f2 Binary files /dev/null and b/docs/image/model_parallel/scatter_dataset.png differ diff --git a/docs/image/model_parallel/seq2seq_0.png b/docs/image/model_parallel/seq2seq_0.png new file mode 100644 index 000000000000..0f320be497d5 Binary files /dev/null and b/docs/image/model_parallel/seq2seq_0.png differ diff --git a/docs/image/model_parallel/seq2seq_1.png b/docs/image/model_parallel/seq2seq_1.png new file mode 100644 index 000000000000..6790573b5c7a Binary files /dev/null and b/docs/image/model_parallel/seq2seq_1.png differ diff --git a/docs/image/model_parallel/spmd.png b/docs/image/model_parallel/spmd.png new file mode 100644 index 000000000000..6930cc1883fa Binary files /dev/null and b/docs/image/model_parallel/spmd.png differ diff --git a/docs/source/chainermn/index.rst 
b/docs/source/chainermn/index.rst index f2b225207958..15f86ccc2787 100644 --- a/docs/source/chainermn/index.rst +++ b/docs/source/chainermn/index.rst @@ -25,5 +25,6 @@ These examples are based on the `examples of Chainer `__.
diff --git a/docs/source/chainermn/model_parallel/example2_seq2seq.rst b/docs/source/chainermn/model_parallel/example2_seq2seq.rst new file mode 100644 index 000000000000..37d42f680910 --- /dev/null +++ b/docs/source/chainermn/model_parallel/example2_seq2seq.rst @@ -0,0 +1,66 @@
+Example 2: seq2seq
+==================
+
+This example shows how to parallelize models that involve RNNs.
+
+.. figure:: ../../../image/model_parallel/seq2seq_0.png
+   :align: center
+
+The above figure depicts a typical encoder-decoder model, split into an encoder and a decoder that run in two separate processes.
+When ``f`` or ``g`` is a large, memory-hungry model such as a CNN, this kind of model parallelism is useful.
+In the forward computation, the encoder invokes the ``send`` function to send its context vectors, and the decoder invokes ``recv`` to receive them.
+The backward computation must be built with ``pseudo_connect``.
+Since this communication pattern is very common in RNNs, ChainerMN provides ``MultiNodeNStepRNN``, a ready-made utility link that replaces this complicated communication pattern.
+
+.. figure:: ../../../image/model_parallel/seq2seq_1.png
+   :align: center
+   :scale: 50%
+
+``MultiNodeNStepRNN`` can be created with ``create_multi_node_n_step_rnn``::
+
+    rnn = chainermn.links.create_multi_node_n_step_rnn(
+        L.NStepLSTM(n_layers, n_units, n_units, 0.1),
+        comm, rank_in=None, rank_out=1)
+
+where ``comm`` is a ChainerMN communicator (see :ref:`chainermn-communicator`).
+
+The overall model definition can be written as follows::
+
+    class Encoder(chainer.Chain):
+
+        def __init__(self, comm, n_layers, n_units):
+            super(Encoder, self).__init__(
+                # Corresponding decoder LSTM will be invoked on process 1.
+                mn_encoder=chainermn.links.create_multi_node_n_step_rnn(
+                    L.NStepLSTM(n_layers, n_units, n_units, 0.1),
+                    comm, rank_in=None, rank_out=1
+                ),
+            )
+            self.comm = comm
+            self.n_layers = n_layers
+            self.n_units = n_units
+
+        def __call__(self, *xs):
+            exs = f(xs)
+            c, h, _, phi = self.mn_encoder(exs)
+            return phi
+
+    class Decoder(chainer.Chain):
+
+        def __init__(self, comm, n_layers, n_units):
+            super(Decoder, self).__init__(
+                # Corresponding encoder LSTM will be invoked on process 0.
+                mn_decoder=chainermn.links.create_multi_node_n_step_rnn(
+                    L.NStepLSTM(n_layers, n_units, n_units, 0.1),
+                    comm, rank_in=0, rank_out=None),
+            )
+            self.comm = comm
+            self.n_layers = n_layers
+            self.n_units = n_units
+
+        def __call__(self, *ys):
+            c, h, os, _ = self.mn_decoder(ys)
+            # compute loss (omitted)
+
+An example code with a training script is available `here `__.
diff --git a/docs/source/chainermn/model_parallel/example3_parallel_conv.rst b/docs/source/chainermn/model_parallel/example3_parallel_conv.rst new file mode 100644 index 000000000000..7fc01325d69c --- /dev/null +++ b/docs/source/chainermn/model_parallel/example3_parallel_conv.rst @@ -0,0 +1,71 @@
+Example 3: Channel-wise Parallel Convolution
+============================================
+
+This is an example of parallelizing a CNN in a channel-wise manner.
+This parallelization is useful for large batch sizes or high-resolution images.
+
+.. figure:: ../../../image/model_parallel/parallel_conv.png
+   :align: center
+
+The basic strategy is
+
+1. to pick the channels that each process is responsible for,
+2. to apply convolution, and
+3. to use ``allgather`` to combine the outputs of all channels into a single tensor
+
+on each process.
+A parallel convolution model can be implemented like this::
+
+    class ParallelConvolution2D(chainer.links.Convolution2D):
+        def __init__(self, comm, in_channels, out_channels, *args, **kwargs):
+            self.comm = comm
+            self.in_channels = in_channels
+            self.out_channels = out_channels
+            super(ParallelConvolution2D, self).__init__(
+                self._in_channel_size, self._out_channel_size, *args, **kwargs)
+
+        def __call__(self, x):
+            x = x[:, self._channel_indices, :, :]
+            y = super(ParallelConvolution2D, self).__call__(x)
+            ys = chainermn.functions.allgather(self.comm, y)
+            return F.concat(ys, axis=1)
+
+        def _channel_size(self, n_channel):
+            # Return the size of the corresponding channels.
+            n_proc = self.comm.size
+            i_proc = self.comm.rank
+            return n_channel // n_proc + (1 if i_proc < n_channel % n_proc else 0)
+
+        @property
+        def _in_channel_size(self):
+            return self._channel_size(self.in_channels)
+
+        @property
+        def _out_channel_size(self):
+            return self._channel_size(self.out_channels)
+
+        @property
+        def _channel_indices(self):
+            # Return the indices of the corresponding channels.
+            indices = np.arange(self.in_channels)
+            indices = indices[indices % self.comm.size == 0] + self.comm.rank
+            return [i for i in indices if i < self.in_channels]
+
+where ``comm`` is a ChainerMN communicator (see :ref:`chainermn-communicator`).
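+Because the channel assignment is plain index arithmetic, it can be checked without MPI.
+The following standalone sketch mirrors ``_channel_size`` and ``_channel_indices`` for a hypothetical setting of 10 input channels split over 4 processes (the helper functions here are only for illustration and are not part of ChainerMN)::
+
+    import numpy as np
+
+    def channel_size(n_channels, size, rank):
+        # Same arithmetic as ParallelConvolution2D._channel_size above.
+        return n_channels // size + (1 if rank < n_channels % size else 0)
+
+    def channel_indices(n_channels, size, rank):
+        # Same arithmetic as ParallelConvolution2D._channel_indices above.
+        indices = np.arange(n_channels)
+        indices = indices[indices % size == 0] + rank
+        return [int(i) for i in indices if i < n_channels]
+
+    for rank in range(4):
+        print(rank, channel_size(10, 4, rank), channel_indices(10, 4, rank))
+    # 0 3 [0, 4, 8]
+    # 1 3 [1, 5, 9]
+    # 2 2 [2, 6]
+    # 3 2 [3, 7]
+
+The four index lists partition the ten input channels: each process convolves only the channels it owns, and ``allgather`` followed by ``concat`` reassembles the per-process outputs into a single tensor.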
+``ParallelConvolution2D`` can simply replace the original ``Convolution2D``.
+For the first convolution layer, all processes must feed the same images to the model.
+``MultiNodeIterator`` distributes the same batches to all processes every iteration::
+
+    if comm.rank != 0:
+        train = chainermn.datasets.create_empty_dataset(train)
+        test = chainermn.datasets.create_empty_dataset(test)
+
+    train_iter = chainermn.iterators.create_multi_node_iterator(
+        chainer.iterators.SerialIterator(train, args.batchsize), comm)
+    test_iter = chainermn.iterators.create_multi_node_iterator(
+        chainer.iterators.SerialIterator(test, args.batchsize,
+                                         repeat=False, shuffle=False),
+        comm)
+
+An example code with a training script for VGG16 parallelization is available `here `__.
diff --git a/docs/source/chainermn/model_parallel/example4_ensemble.rst b/docs/source/chainermn/model_parallel/example4_ensemble.rst new file mode 100644 index 000000000000..de555e315d08 --- /dev/null +++ b/docs/source/chainermn/model_parallel/example4_ensemble.rst @@ -0,0 +1,64 @@
+Example 4: Ensemble
+===================
+
+Ensembling is a training technique to obtain better classification performance by combining multiple base classifiers.
+An averaging ensemble is one of the simplest examples; it takes the average of all classifier outputs in the test phase.
+Model parallelism and collective communications can effectively help to implement it.
+
+.. 
figure:: ../../../image/model_parallel/averaging.png + :align: center + +The following wrapper makes model parallel averaging ensemble easier:: + + class Averaging(chainer.Chain): + def __init__(self, comm, block): + super(Averaging, self).__init__() + self.comm = comm + with self.init_scope(): + self.block = block + + def __call__(self, x): + y = self.block(x) + + if not chainer.config.train: + y = chainermn.functions.allgather(self.comm, y) + y = F.stack(y, axis=0) + y = F.average(y, axis=0) + + return y + +Then, any links wrapped by ``Averaging`` are ready to be parallelized and averaged:: + + class Model(chainer.Chain): + def __init__(self, comm): + super(Model, self).__init__() + self.comm = comm + with self.init_scope(): + self.l1 = L.Linear(d0, d1) + self.l2 = L.Linear(d1, d2) + self.l3 = Averaging(self.comm, L.Linear(d2, d3)) + + def __call__(self, x): + h = F.relu(self.l1(x)) + h = F.relu(self.l2(h)) + y = F.relu(self.l3(h)) + return y + +From the perspective of model inputs/outputs, the averaged model is compatible with the original model. +Thus, we only need to replace the last layer with the averaged layer. + +In averaging ensemble, each base classifier is trained independently and ensembled in the test phase. +This can be implemented by using ``MultiNodeIterator`` only for the test iterator:: + + # train = (training dataset) + # test = (test dataset) + + if comm.rank != 0: + train = chainermn.datasets.create_empty_dataset(train) + test = chainermn.datasets.create_empty_dataset(test) + + train_iter = chainer.iterators.SerialIterator(train, batchsize) + test_iter = chainermn.iterators.create_multi_node_iterator( + chainer.iterators.SerialIterator(test, batchsize, + repeat=False, shuffle=False), + comm) diff --git a/docs/source/chainermn/model_parallel/index.rst b/docs/source/chainermn/model_parallel/index.rst new file mode 100644 index 000000000000..11123bb8a87c --- /dev/null +++ b/docs/source/chainermn/model_parallel/index.rst @@ -0,0 +1,14 @@ +.. module:: chainermn + +Model Parallel +============== + +.. toctree:: + :maxdepth: 2 + + overview + model_parallel_on_chainermn + example1_simple_mlp + example2_seq2seq + example3_parallel_conv + example4_ensemble diff --git a/docs/source/chainermn/model_parallel/model_parallel_on_chainermn.rst b/docs/source/chainermn/model_parallel/model_parallel_on_chainermn.rst new file mode 100644 index 000000000000..457d257a022e --- /dev/null +++ b/docs/source/chainermn/model_parallel/model_parallel_on_chainermn.rst @@ -0,0 +1,205 @@ +Model Parallel on ChainerMN +=========================== + + +.. _chainermn-communicator: + +Step 1: Communicators +~~~~~~~~~~~~~~~~~~~~~ + +To perform multi-node communications, a *communicator* is needed. +Basic usages are the same with the case of the data parallel, see :doc:`../tutorial/step1_communicators_optimizers`:: + + comm = chainermn.create_communicator() + +If you want to define collective communications among limited number of processes later, it is useful to split the communicator:: + + subcomm = comm.split(comm.rank % 2, comm.rank) + +.. figure:: ../../../image/model_parallel/comm_split.png + :align: center + :scale: 50% + +For further detail about the communicator split, please refer to `MPI tutorial `__. + + + +Step 2: Datasets and Iterators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In model parallel training, all processes belong to at least one of the following dataset input patterns. + +1. model inputs come from datasets, and each process takes different mini-batches +2. 
model inputs come from datasets, and several processes share the same mini-batches +3. model inputs come from other processes + + +1. scatter_dataset +------------------ + +For the first case, you may use ``scatter_dataset`` as is introduced in :doc:`../tutorial/step2_datasets_evaluators`. + +.. figure:: ../../../image/model_parallel/scatter_dataset.png + :align: center + +2. multi node iterator +---------------------- + +For the second case, iterator need to be modified, where ``create_multi_node_iterator`` is useful:: + + train, test = chainer.datasets.get_mnist() + train_iter = chainermn.iterators.create_multi_node_iterator( + chainer.iterators.SerialIterator(train, batchsize), comm) + test_iter = chainermn.iterators.create_multi_node_iterator( + chainer.iterators.SerialIterator(test, batchsize), comm) + +The resulting iterators return the same mini-batches among processes specified by the communicator. + +.. figure:: ../../../image/model_parallel/multi_node_iterator.png + :align: center + +3. empty dataset +---------------- + +For the last case, you may use ``create_empty_dataset``, which returns a dataset with the same number of empty tuples as the original dataset:: + + train, test = chainer.datasets.get_mnist() + train = chainermn.datasets.create_empty_dataset(train) + test = chainermn.datasets.create_empty_dataset(test) + +This input pattern appears in the subsequent examples such as :doc:`example1_simple_mlp`. +Note that datasets are required in Chainer's updater API. The empty dataset can be used as a dummy dataset. + +.. figure:: ../../../image/model_parallel/empty_dataset.png + :align: center + :scale: 40% + + +Step 3: Define Communications +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ChainerMN supports most of the MPI communications *as Chainer functions*, including point-to-point and collective communications. +To know usages of each communication, please refer to :doc:`../reference/index`. + +Example 1: Point-to-point Communication +--------------------------------------- + +This is an example to use point-to-point communications:: + + def __call__(self, x): + h = f(x) + h = chainermn.functions.send(x, comm, rank=1) + return h + +The communication target is specified by ``rank`` parameter. +Note that the return value of ``send`` is often not negligible. +Please refer to :ref:`pseudo-connect`. + + +Example 2: Collective Communication +----------------------------------- + +Here is another example to use collective communications:: + + def __call__(self, x): + h = f(x) + h = chainermn.functions.allgather(comm, h) + h = F.stack(h, axis=0) + h = F.average(h, axis=0) + return h + +This pattern often appears in the averaging ensemble training. + + +.. _pseudo-connect: + +Note: Define-by-Run and Model Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In model-parallel training, a model on each process may become *non-connected* computational graph. +Let's take a look at an example. + +.. figure:: ../../../image/model_parallel/delegate_variable_0.png + :align: center + :scale: 50% + +Naive implementation of a model on process #0 could be:: + + class Model_0(chainer.Chain): + def __call__(self, x): + # first component + z = f(x) + chainermn.functions.send(z, comm, rank=1) + + # second component + z = chainermn.functions.recv(comm, rank=1) + y = h(z) + + return y + +One may notice that there is no connection between the first and second components of computational graph. 
+As we rely on the define-by-run framework, we cannot build a backward path from the second component to the first component.
+In order to build the backward path, a dummy variable, which we call ``delegate_variable``, is needed.
+
+.. figure:: ../../../image/model_parallel/delegate_variable_1.png
+   :align: center
+   :scale: 50%
+
+The variable :math:`\phi` in the above figure is ``delegate_variable``, which is returned by ``send`` and passed as an argument to ``recv``::
+
+    class Model_0(chainer.Chain):
+        def __call__(self, x):
+            # first component
+            z = f(x)
+            phi = chainermn.functions.send(z, comm, rank=1)
+
+            # second component
+            z = chainermn.functions.recv(comm, rank=1, delegate_variable=phi)
+            y = h(z)
+
+            return y
+
+    class Model_1(chainer.Chain):
+        def __call__(self, _):
+            z = chainermn.functions.recv(comm, rank=0)
+            z = g(z)
+            phi = chainermn.functions.send(z, comm, rank=0)
+            return phi
+
+``Model_1`` also needs to return a delegate variable :math:`\phi` so that its computational graph can be backtracked to compute gradients.
+Thus, the backward computation is guaranteed.
+**Otherwise, backward computation will cause deadlock**.
+
+
+Note: Delegate Variable and Pseudo Connect
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As we have just seen, delegate variables must be handled appropriately to avoid potential deadlock.
+However, there are still some pathological cases.
+Let's consider sending variables twice.
+
+.. figure:: ../../../image/model_parallel/pseudo_connect_0.png
+   :align: center
+
+Here, we must guarantee that backward tracking can find both ``send`` calls, but each model can return only one delegate variable.
+``pseudo_connect`` is a special function that combines one delegate variable with another variable.
+
+.. figure:: ../../../image/model_parallel/pseudo_connect_1.png
+   :align: center
+
+In the above case, the variable :math:`\psi` returned by ``pseudo_connect`` behaves as if it were :math:`\phi_2`, while its ``backward`` backtracks both :math:`\phi_1` and :math:`\phi_2`::
+
+    class Model_0(chainer.Chain):
+        def __call__(self, x):
+            z1, z2 = f(x)
+            phi1 = chainermn.functions.send(z1, comm, rank=1)
+            phi2 = chainermn.functions.send(z2, comm, rank=1)
+            psi = chainermn.functions.pseudo_connect(phi1, phi2)
+            return psi
+
+    class Model_1(chainer.Chain):
+        def __call__(self, _):
+            z1 = chainermn.functions.recv(comm, rank=0)
+            z2 = chainermn.functions.recv(comm, rank=0)
+            y = g(z1, z2)
+            return y
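+When a model has to ``send`` more than two variables, the same idea extends by folding ``pseudo_connect`` over the delegate variables.
+The following is only a sketch (the loop is illustrative; it assumes ``f`` returns a list of variables and reuses the two-argument usage shown above)::
+
+    class Model_0(chainer.Chain):
+        def __call__(self, x):
+            zs = f(x)  # suppose f returns a list of variables
+            phis = [chainermn.functions.send(z, comm, rank=1) for z in zs]
+            # Fold every delegate variable into the single variable that is
+            # returned, so that backward can reach every send.
+            psi = phis[0]
+            for phi in phis[1:]:
+                psi = chainermn.functions.pseudo_connect(psi, phi)
+            return psi
+
+The receiving side must then call ``recv`` the same number of times and in the same order, so that every ``send`` is matched.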
diff --git a/docs/source/chainermn/model_parallel/overview.rst b/docs/source/chainermn/model_parallel/overview.rst new file mode 100644 index 000000000000..b5aa79519950 --- /dev/null +++ b/docs/source/chainermn/model_parallel/overview.rst @@ -0,0 +1,64 @@
+Overview
+========
+
+Model Parallelism
+~~~~~~~~~~~~~~~~~
+
+Even though ChainerMN mainly supports the data parallel approach for distributed training, it also has experimental APIs for the *model parallel* approach.
+The model parallel approach splits a given model into subcomponents loaded on several processes.
+This approach is useful in cases where
+
+- a large mini-batch or high-resolution input is needed.
+- the model is too large to run on a single process.
+- a mixture of experts is trained.
+
+.. figure:: ../../../image/parallelism.png
+   :align: center
+
+
+Philosophy
+~~~~~~~~~~
+
+ChainerMN takes the following three approaches to realize model parallelism.
+
+1. Communication as Function
+----------------------------
+
+ChainerMN provides several special functions for communication, such as ``chainermn.functions.bcast`` and ``chainermn.functions.alltoall``, which wrap raw MPI communications.
+Users define communications between processes as Chainer function calls in the model definitions.
+This enables highly flexible communication patterns.
+Moreover, in backward propagation, gradient computations are automatically invoked through the ``backward`` defined in those communication functions.
+
+.. figure:: ../../../image/model_parallel/communication_as_function.png
+   :align: center
+   :scale: 40%
+
+
+2. Synchronous Model Parallel
+-----------------------------
+
+ChainerMN restricts itself to synchronous SGD.
+Though the asynchronous counterpart may seem more computationally efficient, asynchronous SGD often suffers from the stale gradient problem and is harder to debug.
+ChainerMN's synchronous communication model makes SGD simpler.
+
+
+3. Single-Program-Multiple-Data (SPMD)
+--------------------------------------
+
+In principle, ChainerMN supports single-program-multiple-data (SPMD), which means the same program is invoked and different data are used on each process.
+
+.. figure:: ../../../image/model_parallel/spmd.png
+   :align: center
+
+Synchronous model parallelism fits well with the MPI programming style and the SPMD model.
+
+
+References
+~~~~~~~~~~
+
+- `More Effective Distributed ML via a Stale Synchronous Parallel Parameter Server `__
+- `Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer `__
+- `AMPNet: Asynchronous Model-Parallel Training for Dynamic Neural Networks `__
+- `Deep Mixture of Experts via Shallow Embedding `__
+- `Mesh-TensorFlow: Deep Learning for Supercomputers `__
+- `GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism `__
diff --git a/docs/source/compatibility.rst b/docs/source/compatibility.rst index 08ff664c0976..f6d049a8097c 100644 --- a/docs/source/compatibility.rst +++ b/docs/source/compatibility.rst @@ -12,14 +12,6 @@ Developers should read through this documentation before creating pull requests Note that this documentation may contain ambiguities on the level of supported compatibilities. -Targeted Versions ------------------ - -**This policy is applied to Chainer v2.0.0 and higher.** -Note that this policy is not applied to Chainer of lower versions. -For older versions of Chainer, see `the old version of API Compatiblity Policy `_. - - Versioning and Backward Compatibility ------------------------------------- @@ -49,13 +41,6 @@ The following list shows an example of what we can do to reduce the cost (*Note: - When a definition of a link is changed, we try to enable it to deserialize a model dumped with an older version of Chainer. In most cases, we cannot guarantee that a model serialized with a newer version of Chainer is loadable by an older version of Chainer. -.. note:: - - Since Chainer v2, we have stopped adopting any solid processes to break backward compatibilities (e.g. a solid schedule for deprecating and removing a feature) in order to keep the development fast enough to support the cutting-edge research. - **It does not mean we stop taking care of maintainability of user codes.** - We are still paying much attention to not breaking user codes. - - .. 
module:: chainer.utils Experimental APIs diff --git a/docs/source/contribution.rst b/docs/source/contribution.rst index c6a6d8337b47..ef7519e06fcf 100644 --- a/docs/source/contribution.rst +++ b/docs/source/contribution.rst @@ -7,11 +7,6 @@ This is a guide for all contributions to Chainer. The development of Chainer is running on `the official repository at GitHub `_. Anyone that wants to register an issue or to send a pull request should read through this document. -.. note:: - - Many points of this documentation are updated at v2. - We strongly recommend all contributors of v1 to read through the documentation again. - Classification of Contributions ------------------------------- @@ -50,7 +45,7 @@ As for the backward compatibility, see :ref:`compatibility`. Release Cycle ~~~~~~~~~~~~~ -Starting from v2.0.0, we are developing two tracks of versions at the same time. +We develop two tracks of versions at the same time. The first one is the track of **stable versions**, which is a series of revision updates for the latest major version. The second one is the track of **development versions**, which is a series of pre-releases for the upcoming major version. diff --git a/docs/source/examples/cnn.rst b/docs/source/examples/cnn.rst index 2a336c3c152e..840ff93c1308 100644 --- a/docs/source/examples/cnn.rst +++ b/docs/source/examples/cnn.rst @@ -36,6 +36,9 @@ LeNet5 '''''' Here, let's start by defining LeNet5 [LeCun98]_ in Chainer. +In this example, we show +`a simplified version of LeNet5 `_ +introduced in Deep Learning Tutorials. This is a ConvNet model that has 5 layers comprised of 3 convolutional layers and 2 fully-connected layers. This was proposed to classify hand-written digit images in 1998. In Chainer, the model can be written as follows: @@ -89,13 +92,7 @@ When it's in training mode, :meth:`forward` returns the output value of the last layer as is to compute the loss later on, otherwise it returns a prediction result by calculating :meth:`~chainer.functions.softmax`. -.. note:: - - In Chainer v1, if a function or link behaved differently in - training and other modes, it was common that it held an attribute - that represented its running mode or was provided with the mode - from outside as an argument. In Chainer v2, it is recommended to use - the global configuration ``chainer.config.train`` to switch the running mode. +It is recommended to use the global configuration ``chainer.config.train`` to switch the running mode. If you don't want to write ``conv1`` and the other layers more than once, you can also write the same model like in this way: diff --git a/docs/source/guides/gpu.rst b/docs/source/guides/gpu.rst index ae742b57f585..c4f6c30e113d 100644 --- a/docs/source/guides/gpu.rst +++ b/docs/source/guides/gpu.rst @@ -33,7 +33,6 @@ Relationship between Chainer and CuPy .. note:: - From v2.0.0, CuPy is turned into a separate package and repository. Even if you have CUDA installed in your environment, you have to install CuPy separately to use GPUs. See :ref:`install_cuda` for the way to set up CUDA support. diff --git a/docs/source/guides/links.rst b/docs/source/guides/links.rst index 1647d8f77552..a45eff500b50 100644 --- a/docs/source/guides/links.rst +++ b/docs/source/guides/links.rst @@ -71,10 +71,6 @@ It can be done by calling the :meth:`~Link.cleargrads` method. >>> f.cleargrads() -.. note:: - :meth:`~Link.cleargrads` is introduced in v1.15 to replace :meth:`~Link.zerograds` for efficiency. - :meth:`~Link.zerograds` is left only for backward compatibility. 
- Now we can compute the gradients of parameters by simply calling the backward method and access them via the ``grad`` property. .. doctest:: diff --git a/docs/source/guides/models.rst b/docs/source/guides/models.rst index 660b39167964..d3112fe516a6 100644 --- a/docs/source/guides/models.rst +++ b/docs/source/guides/models.rst @@ -60,15 +60,6 @@ It means we can define more complex chains that hold ``MyChain`` objects as thei We often define a single forward method of a link by the ``forward`` operator. Such links and chains are callable and behave like regular functions of Variables. -.. note:: - - In Chainer v1, we could also register the trainable layers - (i.e., :class:`~chainer.Link` s) to the model by putting them to the - :meth:`~chainer.Chain.__init__` of :class:`~chainer.Chain` - or registering them via :meth:`~chainer.Chain.add_link`. - But as these ways are deprecated in Chainer v2, users are recommended - to use the way explained above. - Another way to define a chain is using the :class:`ChainList` class, which behaves like a list of links: .. doctest:: diff --git a/docs/source/reference/debug.rst b/docs/source/reference/debug.rst index bb056fb8dd8c..82effe2baf98 100644 --- a/docs/source/reference/debug.rst +++ b/docs/source/reference/debug.rst @@ -29,18 +29,3 @@ You can check if debug mode is enabled with :func:`chainer.is_debug` function. chainer.is_debug chainer.set_debug - - -Deprecated interface --------------------- - -As of v2.0.0, it is recommended to turn on the debug mode using ``chainer.config.debug``. -See :ref:`configuration` for the way to use the config object. -We leave the reference of the conventional way (which has been available since Chainer v1) as follows. - - -.. autosummary:: - :toctree: generated/ - :nosignatures: - - chainer.DebugMode diff --git a/docs/source/reference/distributions.rst b/docs/source/reference/distributions.rst index 60ee7acea13b..8301498f606c 100644 --- a/docs/source/reference/distributions.rst +++ b/docs/source/reference/distributions.rst @@ -17,10 +17,12 @@ Distributions chainer.distributions.Bernoulli chainer.distributions.Beta chainer.distributions.Categorical + chainer.distributions.Cauchy chainer.distributions.Chisquare chainer.distributions.Dirichlet chainer.distributions.Exponential chainer.distributions.Gamma + chainer.distributions.Geometric chainer.distributions.Gumbel chainer.distributions.Laplace chainer.distributions.LogNormal diff --git a/docs/source/reference/functions.rst b/docs/source/reference/functions.rst index a3d03f15ed80..0741776136ab 100644 --- a/docs/source/reference/functions.rst +++ b/docs/source/reference/functions.rst @@ -22,11 +22,6 @@ Some functions additionally supports scalar arguments. If you are implementing your own functions, please see :doc:`../guides/functions`. -.. note:: - As of v1.5, the concept of parameterized functions are gone, and they are - replaced by corresponding :class:`~chainer.Link` implementations. They are - found in the :mod:`chainer.links` namespace. - .. For contributors that want to update these lists: diff --git a/examples/ptb/train_ptb_custom_loop.py b/examples/ptb/train_ptb_custom_loop.py index 2b919c723482..2c13193f3a31 100755 --- a/examples/ptb/train_ptb_custom_loop.py +++ b/examples/ptb/train_ptb_custom_loop.py @@ -10,7 +10,6 @@ applies an optimizer to update the model. 
""" import argparse -import copy import numpy as np import chainer @@ -56,7 +55,8 @@ def evaluate(model, iter): with configuration.using_config('train', False): # This is optional but can reduce computational overhead. with chainer.using_config('enable_backprop', False): - for batch in copy.copy(iter): + iter.reset() + for batch in iter: x, t = convert.concat_examples(batch, args.gpu) loss = evaluator(x, t) sum_perp += loss.array diff --git a/examples/static_graph_optimizations/ptb/train_ptb_custom_loop.py b/examples/static_graph_optimizations/ptb/train_ptb_custom_loop.py index 4b2199b2ec0d..4d0797c74c9b 100755 --- a/examples/static_graph_optimizations/ptb/train_ptb_custom_loop.py +++ b/examples/static_graph_optimizations/ptb/train_ptb_custom_loop.py @@ -22,7 +22,6 @@ """ from __future__ import print_function import argparse -import copy import numpy as np import random @@ -211,7 +210,8 @@ def evaluate(model, iter): labels = [] lossfun = softmax_cross_entropy.softmax_cross_entropy with configuration.using_config('train', False): - for batch in copy.copy(iter): + iter.reset() + for batch in iter: word, label = convert.concat_examples(batch, args.gpu) words.append(word) labels.append(label) diff --git a/chainerx_cc/scripts/ci/jenkins/Dockerfile.template b/scripts/ci/chainerx/jenkins/Dockerfile.template similarity index 100% rename from chainerx_cc/scripts/ci/jenkins/Dockerfile.template rename to scripts/ci/chainerx/jenkins/Dockerfile.template diff --git a/chainerx_cc/scripts/ci/jenkins/clear-docker-cache.sh b/scripts/ci/chainerx/jenkins/clear-docker-cache.sh similarity index 100% rename from chainerx_cc/scripts/ci/jenkins/clear-docker-cache.sh rename to scripts/ci/chainerx/jenkins/clear-docker-cache.sh diff --git a/chainerx_cc/scripts/ci/jenkins/run.sh b/scripts/ci/chainerx/jenkins/run.sh similarity index 97% rename from chainerx_cc/scripts/ci/jenkins/run.sh rename to scripts/ci/chainerx/jenkins/run.sh index 4465b44c85d9..2efc4af3e291 100644 --- a/chainerx_cc/scripts/ci/jenkins/run.sh +++ b/scripts/ci/chainerx/jenkins/run.sh @@ -45,7 +45,7 @@ add_path PATH "$CONDA_DIR"/bin run_step() { # Runs a single step - bash "$REPO_DIR"/chainerx_cc/scripts/ci/run-step.sh "$@" + bash "$REPO_DIR"/scripts/ci/run-step.sh "$@" } diff --git a/chainerx_cc/scripts/ci/jenkins/run_docker.sh b/scripts/ci/chainerx/jenkins/run_docker.sh similarity index 88% rename from chainerx_cc/scripts/ci/jenkins/run_docker.sh rename to scripts/ci/chainerx/jenkins/run_docker.sh index 4d72fca44a39..36e0ac8e168d 100644 --- a/chainerx_cc/scripts/ci/jenkins/run_docker.sh +++ b/scripts/ci/chainerx/jenkins/run_docker.sh @@ -28,8 +28,8 @@ container_conda_dir="$container_workspace_dir"/conda # Temporary docker build context context_dir="$(mktemp -d)" -cp "$host_repo_dir"/chainerx_cc/scripts/ci/setup-ubuntu.sh "$context_dir"/ -cp "$host_repo_dir"/chainerx_cc/scripts/ci/setup-conda.sh "$context_dir"/ +cp "$host_repo_dir"/scripts/ci/chainerx/setup-ubuntu.sh "$context_dir"/ +cp "$host_repo_dir"/scripts/ci/chainerx/setup-conda.sh "$context_dir"/ sed 's/{{{UID}}}/'"$UID"'/g' "$this_dir"/Dockerfile.template > "$context_dir"/Dockerfile @@ -43,7 +43,7 @@ docker build \ # Boot docker and run test commands -test_command=(bash "$container_repo_dir"/chainerx_cc/scripts/ci/jenkins/run.sh) +test_command=(bash "$container_repo_dir"/scripts/ci/chainerx/jenkins/run.sh) # Kill the docker container upon receiving signal cleanup_container() { diff --git a/chainerx_cc/scripts/ci/setup-conda.sh b/scripts/ci/chainerx/setup-conda.sh similarity index 100% rename 
from chainerx_cc/scripts/ci/setup-conda.sh rename to scripts/ci/chainerx/setup-conda.sh diff --git a/chainerx_cc/scripts/ci/setup-ubuntu.sh b/scripts/ci/chainerx/setup-ubuntu.sh similarity index 100% rename from chainerx_cc/scripts/ci/setup-ubuntu.sh rename to scripts/ci/chainerx/setup-ubuntu.sh diff --git a/chainerx_cc/scripts/ci/run-step.sh b/scripts/ci/run-step.sh similarity index 86% rename from chainerx_cc/scripts/ci/run-step.sh rename to scripts/ci/run-step.sh index c7228f3d2993..2f6688631db2 100644 --- a/chainerx_cc/scripts/ci/run-step.sh +++ b/scripts/ci/run-step.sh @@ -13,7 +13,7 @@ mkdir -p "$(dirname "$CHAINERX_CI_BASH_ENV")" touch "$CHAINERX_CI_BASH_ENV" cmd=' -source "'"$REPO_DIR"'"/chainerx_cc/scripts/ci/steps.sh +source "'"$REPO_DIR"'"/scripts/ci/steps.sh source "$CHAINERX_CI_BASH_ENV" set -x diff --git a/chainerx_cc/scripts/ci/steps.sh b/scripts/ci/steps.sh similarity index 97% rename from chainerx_cc/scripts/ci/steps.sh rename to scripts/ci/steps.sh index d490d4303a07..03701c91e830 100644 --- a/chainerx_cc/scripts/ci/steps.sh +++ b/scripts/ci/steps.sh @@ -16,13 +16,13 @@ mkdir -p "$WORK_DIR" step_setup() { local this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - source "$this_dir"/setup-ubuntu.sh + source "$this_dir"/chainerx/setup-ubuntu.sh } step_setup_conda() { local this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - source "$this_dir"/setup-conda.sh "$DOWNLOAD_DIR"/conda "$CONDA_DIR" + source "$this_dir"/chainerx/setup-conda.sh "$DOWNLOAD_DIR"/conda "$CONDA_DIR" echo 'PATH="$CONDA_DIR"/bin:"$PATH"' >> "$CHAINERX_CI_BASH_ENV" } diff --git a/tests/chainer_tests/distributions_tests/test_cauchy.py b/tests/chainer_tests/distributions_tests/test_cauchy.py new file mode 100644 index 000000000000..0e47864013c5 --- /dev/null +++ b/tests/chainer_tests/distributions_tests/test_cauchy.py @@ -0,0 +1,113 @@ +import numpy + +from chainer.backends import cuda +from chainer import distributions +from chainer import testing +from chainer.testing import array +from chainer.testing import attr +from chainer import utils + + +@testing.parameterize(*testing.product({ + 'shape': [(2, 3), ()], + 'is_variable': [True, False], + 'sample_shape': [(3, 2), ()], +})) +@testing.fix_random() +@testing.with_requires('scipy') +class TestCauchy(testing.distribution_unittest): + + scipy_onebyone = True + + def setUp_configure(self): + from scipy import stats + self.dist = distributions.Cauchy + self.scipy_dist = stats.cauchy + + self.test_targets = set(["batch_shape", "cdf", "entropy", + "event_shape", "icdf", "log_prob", + "support"]) + + loc = utils.force_array( + numpy.random.uniform(-1, 1, self.shape).astype(numpy.float32)) + scale = utils.force_array(numpy.exp( + numpy.random.uniform(-1, 1, self.shape)).astype(numpy.float32)) + self.params = {"loc": loc, "scale": scale} + self.scipy_params = {"loc": loc, "scale": scale} + + def sample_for_test(self): + smp = numpy.random.normal( + size=self.sample_shape + self.shape).astype(numpy.float32) + return smp + + def check_mean(self, is_gpu): + with testing.assert_warns(RuntimeWarning): + if is_gpu: + mean1 = self.gpu_dist.mean.data + else: + mean1 = self.cpu_dist.mean.data + + if self.scipy_onebyone: + mean2 = [] + for one_params in self.scipy_onebyone_params_iter(): + mean2.append(self.scipy_dist.mean(**one_params)) + mean2 = numpy.vstack(mean2).reshape( + self.shape + self.cpu_dist.event_shape) + else: + mean2 = self.scipy_dist.mean(**self.scipy_params) + array.assert_allclose(mean1, mean2) + + def test_mean_cpu(self): + 
self.check_mean(False) + + @attr.gpu + def test_mean_gpu(self): + self.check_mean(True) + + def check_sample(self, is_gpu): + if is_gpu: + smp1 = self.gpu_dist.sample( + sample_shape=(100000,)+self.sample_shape).data + smp1 = cuda.to_cpu(smp1) + else: + smp1 = self.cpu_dist.sample( + sample_shape=(100000,)+self.sample_shape).data + smp2 = self.scipy_dist.rvs( + size=(100000,)+self.sample_shape+self.shape, **self.scipy_params) + testing.assert_allclose(numpy.median(smp1, axis=0), + numpy.median(smp2, axis=0), + atol=3e-2, rtol=3e-2) + + def test_sample_cpu(self): + self.check_sample(False) + + @attr.gpu + def test_sample_gpu(self): + self.check_sample(True) + + def check_variance(self, is_gpu): + with testing.assert_warns(RuntimeWarning): + if is_gpu: + variance1 = self.gpu_dist.variance.data + else: + variance1 = self.cpu_dist.variance.data + + if self.scipy_onebyone: + variance2 = [] + for one_params in self.scipy_onebyone_params_iter(): + variance2.append(self.scipy_dist.var(**one_params)) + variance2 = numpy.vstack(variance2).reshape( + self.shape + self.cpu_dist.event_shape) + else: + variance2 = self.scipy_dist.var(**self.scipy_params) + array.assert_allclose(variance1, variance2) + + def test_variance_cpu(self): + self.check_variance(False) + + @attr.gpu + def test_variance_gpu(self): + self.check_variance(True) + + +testing.run_module(__name__, __file__) diff --git a/tests/chainer_tests/distributions_tests/test_geometric.py b/tests/chainer_tests/distributions_tests/test_geometric.py new file mode 100644 index 000000000000..e2d959039ce5 --- /dev/null +++ b/tests/chainer_tests/distributions_tests/test_geometric.py @@ -0,0 +1,39 @@ +from chainer import distributions +from chainer import testing +import numpy + + +@testing.parameterize(*testing.product({ + 'shape': [(2, 3), ()], + 'is_variable': [True, False], + 'sample_shape': [(3, 2), ()], +})) +@testing.fix_random() +@testing.with_requires('scipy') +class TestGeometric(testing.distribution_unittest): + + scipy_onebyone = True + + def setUp_configure(self): + from scipy import stats + self.dist = distributions.Geometric + self.scipy_dist = stats.geom + + self.test_targets = set([ + "batch_shape", "event_shape", "log_prob", "mean", "sample", + "support", "variance"]) + + p = numpy.random.uniform(0, 1, self.shape).astype(numpy.float32) + self.params = {"p": p} + self.scipy_params = {"p": p} + + self.support = 'positive integer' + self.continuous = False + + def sample_for_test(self): + smp = numpy.random.randint( + 1, 10, self.sample_shape + self.shape).astype(numpy.int32) + return smp + + +testing.run_module(__name__, __file__) diff --git a/tests/chainer_tests/distributions_tests/test_kldivergence.py b/tests/chainer_tests/distributions_tests/test_kldivergence.py index 87e8b9eb98f7..15a24094255a 100644 --- a/tests/chainer_tests/distributions_tests/test_kldivergence.py +++ b/tests/chainer_tests/distributions_tests/test_kldivergence.py @@ -74,6 +74,11 @@ def make_gamma_dist(self, is_gpu=False): params = self.encode_params({"k": k, "theta": theta}, is_gpu) return distributions.Gamma(**params) + def make_geometric_dist(self, is_gpu=False): + p = numpy.random.uniform(0, 1, self.shape).astype(numpy.float32) + params = self.encode_params({"p": p}, is_gpu) + return distributions.Geometric(**params) + def make_gumbel_dist(self, is_gpu=False): loc = numpy.random.uniform(-1, 1, self.shape).astype(numpy.float32) scale = numpy.exp( @@ -229,6 +234,17 @@ def test_gamma_gamma_gpu(self): dist2 = self.make_gamma_dist(True) self.check_kl(dist1, dist2) + 
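For reference, a small usage sketch of the two distributions added in this change, using the parameter names exercised by the new tests (``loc``/``scale`` for ``Cauchy``, ``p`` for ``Geometric``); the ``chainer.kl_divergence`` call mirrors what the Geometric/Geometric KL test added below checks::

    import numpy as np
    import chainer
    from chainer import distributions

    loc = np.zeros((2, 3), dtype=np.float32)
    scale = np.ones((2, 3), dtype=np.float32)
    cauchy = distributions.Cauchy(loc=loc, scale=scale)
    samples = cauchy.sample(sample_shape=(5,))   # shape (5, 2, 3)
    log_p = cauchy.log_prob(samples)

    p1 = np.full((2, 3), 0.3, dtype=np.float32)
    p2 = np.full((2, 3), 0.6, dtype=np.float32)
    kl = chainer.kl_divergence(distributions.Geometric(p=p1),
                               distributions.Geometric(p=p2))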
def test_geometric_geometric_cpu(self): + dist1 = self.make_geometric_dist() + dist2 = self.make_geometric_dist() + self.check_kl(dist1, dist2) + + @attr.gpu + def test_geometric_geometric_gpu(self): + dist1 = self.make_geometric_dist(True) + dist2 = self.make_geometric_dist(True) + self.check_kl(dist1, dist2) + @testing.with_requires('scipy') def test_gumbel_gumbel_cpu(self): dist1 = self.make_gumbel_dist() diff --git a/tests/chainer_tests/functions_tests/array_tests/test_spatial_transformer_sampler.py b/tests/chainer_tests/functions_tests/array_tests/test_spatial_transformer_sampler.py index fd49ffc5e591..b98c3f12ac0c 100644 --- a/tests/chainer_tests/functions_tests/array_tests/test_spatial_transformer_sampler.py +++ b/tests/chainer_tests/functions_tests/array_tests/test_spatial_transformer_sampler.py @@ -42,6 +42,7 @@ def _rotate_BCHW(x): @testing.parameterize(*testing.product({ + 'dtype': [numpy.float16, numpy.float32, numpy.float64], 'use_cudnn': ['always', 'never'], })) class TestSpatialTransformerSampler(unittest.TestCase): @@ -52,11 +53,11 @@ class TestSpatialTransformerSampler(unittest.TestCase): def setUp(self): self.x = numpy.random.uniform( - size=self.in_shape).astype(numpy.float32) + size=self.in_shape).astype(self.dtype) self.grid = numpy.random.uniform( - low=-2., high=2., size=self.grid_shape).astype(numpy.float32) + low=-2., high=2., size=self.grid_shape).astype(self.dtype) self.grads = numpy.random.uniform( - size=self.out_shape).astype(numpy.float32) + size=self.out_shape).astype(self.dtype) def check_forward(self, x, grid): y = functions.spatial_transformer_sampler(x, grid) @@ -90,6 +91,9 @@ def test_backward_gpu(self): cuda.to_gpu(self.grads)) +@testing.parameterize(*testing.product({ + 'dtype': [numpy.float16, numpy.float32, numpy.float64], +})) class TestSpatialTransformerSamplerConsistencyWithCuDNN(unittest.TestCase): in_shape = (2, 2, 4, 4) @@ -97,12 +101,16 @@ class TestSpatialTransformerSamplerConsistencyWithCuDNN(unittest.TestCase): grid_shape = (2, 2, 3, 3) def setUp(self): - self.x = numpy.random.uniform( - size=self.in_shape).astype(numpy.float32) + self.x = numpy.random.uniform(size=self.in_shape).astype(self.dtype) self.grid = numpy.random.uniform( - low=-2, high=2, size=self.grid_shape).astype(numpy.float32) + low=-2, high=2, size=self.grid_shape).astype(self.dtype) self.grads = numpy.random.uniform( - size=self.out_shape).astype(numpy.float32) + size=self.out_shape).astype(self.dtype) + + if self.dtype == numpy.float16: + self.assert_options = {'atol': 1e-2} + else: + self.assert_options = {} def _apply_backward(self, x, grid, grads): x = Variable(x) @@ -125,9 +133,12 @@ def test_consistency_with_cudnn_cpu(self): cuda.to_gpu(self.x), cuda.to_gpu(self.grid), cuda.to_gpu(self.grads)) - testing.assert_allclose(y_cpu.data, y_cudnn.data) - testing.assert_allclose(x_cpu.grad, x_cudnn.grad) - testing.assert_allclose(grid_cpu.grad, grid_cudnn.grad) + testing.assert_allclose( + y_cpu.data, y_cudnn.data, **self.assert_options) + testing.assert_allclose( + x_cpu.grad, x_cudnn.grad, **self.assert_options) + testing.assert_allclose( + grid_cpu.grad, grid_cudnn.grad, **self.assert_options) @attr.gpu @attr.cudnn @@ -141,9 +152,12 @@ def test_consistency_with_cudnn_gpu(self): cuda.to_gpu(self.x), cuda.to_gpu(self.grid), cuda.to_gpu(self.grads)) - testing.assert_allclose(y_gpu.data, y_cudnn.data) - testing.assert_allclose(x_gpu.grad, x_cudnn.grad) - testing.assert_allclose(grid_gpu.grad, grid_cudnn.grad) + testing.assert_allclose( + y_gpu.data, y_cudnn.data, 
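The float16 branch above follows a recurring pattern in these tests: parameterize over dtypes and relax tolerances only for half precision. A stripped-down, self-contained sketch of that pattern (the test class and the computation it checks are illustrative only)::

    import unittest
    import numpy
    from chainer import testing

    @testing.parameterize(*testing.product({
        'dtype': [numpy.float16, numpy.float32, numpy.float64],
    }))
    class TestDtypeTolerance(unittest.TestCase):

        def setUp(self):
            self.x = numpy.random.uniform(-1, 1, (2, 3)).astype(self.dtype)
            # Loosen tolerances only where float16 rounding dominates.
            self.assert_options = (
                {'atol': 1e-2, 'rtol': 1e-2}
                if self.dtype == numpy.float16 else {})

        def test_square_matches_float64(self):
            y = self.x * self.x                             # in self.dtype
            y_ref = self.x.astype(numpy.float64) ** 2       # reference
            testing.assert_allclose(y, y_ref, **self.assert_options)

    testing.run_module(__name__, __file__)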
**self.assert_options) + testing.assert_allclose( + x_gpu.grad, x_cudnn.grad, **self.assert_options) + testing.assert_allclose( + grid_gpu.grad, grid_cudnn.grad, **self.assert_options) @testing.parameterize( diff --git a/tests/chainer_tests/functions_tests/connection_tests/test_n_step_gru.py b/tests/chainer_tests/functions_tests/connection_tests/test_n_step_gru.py index f592e28c80dc..f6b2ffdc490e 100644 --- a/tests/chainer_tests/functions_tests/connection_tests/test_n_step_gru.py +++ b/tests/chainer_tests/functions_tests/connection_tests/test_n_step_gru.py @@ -179,7 +179,7 @@ def call_forward(self, train): def check_call_cudnn_forward_training(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardTraining') as func: + with testing.patch('cupy.cudnn.rnn_forward_training') as func: self.call_forward(True) assert func.called == expect @@ -192,7 +192,7 @@ def test_call_cudnn_forward_training(self): def check_call_cudnn_forward_inference(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardInference') as func: + with testing.patch('cupy.cudnn.rnn_forward_inference') as func: self.call_forward(False) assert func.called == expect @@ -207,7 +207,7 @@ def check_call_cudnn_backward(self, use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) hy, ys = self.call_forward(True) hy.grad = _to_gpu(self.dhy) - with testing.patch('cupy.cuda.cudnn.RNNBackwardWeights') as func: + with testing.patch('cupy.cudnn.rnn_backward_weights') as func: hy.backward() assert func.called == expect @@ -395,7 +395,7 @@ def call_forward(self, train): def check_call_cudnn_forward_training(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardTraining') as func: + with testing.patch('cupy.cudnn.rnn_forward_training') as func: self.call_forward(True) assert func.called == expect @@ -408,7 +408,7 @@ def test_call_cudnn_forward_training(self): def check_call_cudnn_forward_inference(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardInference') as func: + with testing.patch('cupy.cudnn.rnn_forward_inference') as func: self.call_forward(False) assert func.called == expect @@ -423,7 +423,7 @@ def check_call_cudnn_backward(self, use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) hy, ys = self.call_forward(True) hy.grad = _to_gpu(self.dhy) - with testing.patch('cupy.cuda.cudnn.RNNBackwardWeights') as func: + with testing.patch('cupy.cudnn.rnn_backward_weights') as func: hy.backward() assert func.called == expect diff --git a/tests/chainer_tests/functions_tests/connection_tests/test_n_step_lstm.py b/tests/chainer_tests/functions_tests/connection_tests/test_n_step_lstm.py index d0feab03fc30..b9f7abb968c7 100644 --- a/tests/chainer_tests/functions_tests/connection_tests/test_n_step_lstm.py +++ b/tests/chainer_tests/functions_tests/connection_tests/test_n_step_lstm.py @@ -191,7 +191,7 @@ def call_forward(self, train): def check_call_cudnn_forward_training(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardTraining') as func: + with 
testing.patch('cupy.cudnn.rnn_forward_training') as func: self.call_forward(True) assert func.called == expect @@ -204,7 +204,7 @@ def test_call_cudnn_forward_training(self): def check_call_cudnn_forward_inference(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardInference') as func: + with testing.patch('cupy.cudnn.rnn_forward_inference') as func: self.call_forward(False) assert func.called == expect @@ -219,7 +219,7 @@ def check_call_cudnn_backward(self, use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) hy, cy, ys = self.call_forward(True) hy.grad = _to_gpu(self.dhy) - with testing.patch('cupy.cuda.cudnn.RNNBackwardWeights') as func: + with testing.patch('cupy.cudnn.rnn_backward_weights') as func: hy.backward() assert func.called == expect @@ -421,7 +421,7 @@ def call_forward(self, train): def check_call_cudnn_forward_training(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardTraining') as func: + with testing.patch('cupy.cudnn.rnn_forward_training') as func: self.call_forward(True) assert func.called == expect @@ -434,7 +434,7 @@ def test_call_cudnn_forward_training(self): def check_call_cudnn_forward_inference(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardInference') as func: + with testing.patch('cupy.cudnn.rnn_forward_inference') as func: self.call_forward(False) assert func.called == expect @@ -449,7 +449,7 @@ def check_call_cudnn_backward(self, use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) hy, cy, ys = self.call_forward(True) hy.grad = _to_gpu(self.dhy) - with testing.patch('cupy.cuda.cudnn.RNNBackwardWeights') as func: + with testing.patch('cupy.cudnn.rnn_backward_weights') as func: hy.backward() assert func.called == expect diff --git a/tests/chainer_tests/functions_tests/connection_tests/test_n_step_rnn.py b/tests/chainer_tests/functions_tests/connection_tests/test_n_step_rnn.py index 350bc6f52d4a..ecb8eb0aa1de 100644 --- a/tests/chainer_tests/functions_tests/connection_tests/test_n_step_rnn.py +++ b/tests/chainer_tests/functions_tests/connection_tests/test_n_step_rnn.py @@ -204,7 +204,7 @@ def call_forward(self, train): def check_call_cudnn_forward_training(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardTraining') as func: + with testing.patch('cupy.cudnn.rnn_forward_training') as func: self.call_forward(True) assert func.called == expect @@ -217,7 +217,7 @@ def test_call_cudnn_forward_training(self): def check_call_cudnn_forward_inference(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardInference') as func: + with testing.patch('cupy.cudnn.rnn_forward_inference') as func: self.call_forward(False) assert func.called == expect @@ -232,7 +232,7 @@ def check_call_cudnn_backward(self, use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) hy, ys = self.call_forward(True) hy.grad = _to_gpu(self.dhy) - with testing.patch('cupy.cuda.cudnn.RNNBackwardWeights') as func: + with testing.patch('cupy.cudnn.rnn_backward_weights') as func: hy.backward() assert func.called == 
expect @@ -432,7 +432,7 @@ def call_forward(self, train): def check_call_cudnn_forward_training(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardTraining') as func: + with testing.patch('cupy.cudnn.rnn_forward_training') as func: self.call_forward(True) assert func.called == expect @@ -445,7 +445,7 @@ def test_call_cudnn_forward_training(self): def check_call_cudnn_forward_inference(self, use_cudnn): with chainer.using_config('use_cudnn', use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) - with testing.patch('cupy.cuda.cudnn.RNNForwardInference') as func: + with testing.patch('cupy.cudnn.rnn_forward_inference') as func: self.call_forward(False) assert func.called == expect @@ -460,7 +460,7 @@ def check_call_cudnn_backward(self, use_cudnn): expect = chainer.should_use_cudnn('>=auto', 5000) hy, ys = self.call_forward(True) hy.grad = _to_gpu(self.dhy) - with testing.patch('cupy.cuda.cudnn.RNNBackwardWeights') as func: + with testing.patch('cupy.cudnn.rnn_backward_weights') as func: hy.backward() assert func.called == expect diff --git a/tests/chainer_tests/functions_tests/math_tests/test_clip.py b/tests/chainer_tests/functions_tests/math_tests/test_clip.py index 4e63734115f5..c387547ac8e2 100644 --- a/tests/chainer_tests/functions_tests/math_tests/test_clip.py +++ b/tests/chainer_tests/functions_tests/math_tests/test_clip.py @@ -13,34 +13,42 @@ @testing.parameterize(*testing.product({ 'shape': [(3, 2), ()], 'dtype': [numpy.float16, numpy.float32, numpy.float64], + 'x_min_max': [ + (-0.75, 1.53), + (numpy.float32(-0.75), numpy.float32(1.53)), + (-1, 2), + ] })) class TestClip(unittest.TestCase): def setUp(self): - self.x = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) + self.x = numpy.random.uniform(-3, 3, self.shape).astype(self.dtype) # Avoid values around x_min and x_max for stability of numerical # gradient + x_min, x_max = self.x_min_max + x_min = float(x_min) + x_max = float(x_max) + eps = 0.01 for ind in numpy.ndindex(self.x.shape): - if -0.76 < self.x[ind] < -0.74: + if x_min - eps < self.x[ind] < x_min + eps: self.x[ind] = -0.5 - elif 0.74 < self.x[ind] < 0.76: + elif x_max - eps < self.x[ind] < x_max + eps: self.x[ind] = 0.5 self.gy = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) self.ggx = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) - self.x_min = -0.75 - self.x_max = 0.75 def check_forward(self, x_data): + x_min, x_max = self.x_min_max x = chainer.Variable(x_data) - y = functions.clip(x, self.x_min, self.x_max) + y = functions.clip(x, x_min, x_max) self.assertEqual(y.data.dtype, self.dtype) y_expect = self.x.copy() for i in numpy.ndindex(self.x.shape): - if self.x[i] < self.x_min: - y_expect[i] = self.x_min - elif self.x[i] > self.x_max: - y_expect[i] = self.x_max + if self.x[i] < x_min: + y_expect[i] = x_min + elif self.x[i] > x_max: + y_expect[i] = x_max testing.assert_allclose(y_expect, y.data) @@ -53,7 +61,8 @@ def test_forward_gpu(self): def check_backward(self, x_data, y_grad): def f(x): - return functions.clip(x, self.x_min, self.x_max) + x_min, x_max = self.x_min_max + return functions.clip(x, x_min, x_max) gradient_check.check_backward( f, x_data, y_grad, dtype=numpy.float64) @@ -67,7 +76,8 @@ def test_backward_gpu(self): def check_double_backward(self, x_data, y_grad, gx_grad): def f(x): - return functions.clip(x, self.x_min, self.x_max) + x_min, x_max = self.x_min_max + return functions.clip(x, x_min, 
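The repeated one-line edits above all retarget ``testing.patch`` from the raw ``cupy.cuda.cudnn`` bindings to the higher-level ``cupy.cudnn`` wrappers that Chainer now calls. Condensed, the pattern these tests use to verify whether a cuDNN RNN kernel is actually invoked looks like the following; ``run_rnn_forward`` is a stand-in for the tests' ``call_forward`` helpers::

    import chainer
    from chainer import testing

    def check_cudnn_called(use_cudnn, run_rnn_forward):
        with chainer.using_config('use_cudnn', use_cudnn):
            # 5000 is the lowest cuDNN version providing the RNN functions.
            expect = chainer.should_use_cudnn('>=auto', 5000)
            with testing.patch('cupy.cudnn.rnn_forward_training') as func:
                run_rnn_forward()
            assert func.called == expect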
x_max) gradient_check.check_double_backward( f, x_data, y_grad, gx_grad, dtype=numpy.float64, atol=1e-3) @@ -87,7 +97,7 @@ def setUp(self): self.x = numpy.random.uniform(-1, 1, (3, 2)).astype(numpy.float32) def test_invalid_interval(self): - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): functions.clip(self.x, 1.0, -1.0) diff --git a/tests/chainer_tests/functions_tests/normalization_tests/test_batch_normalization.py b/tests/chainer_tests/functions_tests/normalization_tests/test_batch_normalization.py index eba965aa9a9e..2477bcfd6223 100644 --- a/tests/chainer_tests/functions_tests/normalization_tests/test_batch_normalization.py +++ b/tests/chainer_tests/functions_tests/normalization_tests/test_batch_normalization.py @@ -449,7 +449,7 @@ def forward(self): def test_call_cudnn_forward(self): with chainer.using_config('use_cudnn', self.use_cudnn): with testing.patch( - 'cupy.cuda.cudnn.batchNormalizationForwardTraining' + 'cupy.cudnn.batch_normalization_forward_training' ) as func: self.forward() self.assertEqual(func.called, self.expect) @@ -459,7 +459,7 @@ def test_call_cudnn_backward(self): y = self.forward() y.grad = self.gy with testing.patch( - 'cupy.cuda.cudnn.batchNormalizationBackward' + 'cupy.cudnn.batch_normalization_backward' ) as func: y.backward() self.assertEqual(func.called, self.expect) diff --git a/tests/chainer_tests/test_variable.py b/tests/chainer_tests/test_variable.py index c9dad49323d5..b301e523c54a 100644 --- a/tests/chainer_tests/test_variable.py +++ b/tests/chainer_tests/test_variable.py @@ -4,6 +4,7 @@ import re import sys import unittest +import warnings import mock import numpy as np @@ -2070,13 +2071,15 @@ class TestVariableBackwardError(unittest.TestCase): def setUp(self): self.x = np.array([1], np.float32) - def check_type_mismatch(self, x_data): + def check_type_mismatch(self, x_data, retain): xp = backend.get_array_module(x_data) class DummyFunction(chainer.Function): label = 'dummy_function' def forward(self, inputs): + if not retain: + self.retain_inputs(()) return xp.array(1, np.float32), def backward(self, inputs, grads): @@ -2088,19 +2091,28 @@ def backward(self, inputs, grads): y.backward() def test_type_mismatch_cpu(self): - self.check_type_mismatch(self.x) + self.check_type_mismatch(self.x, True) + + def test_type_mismatch_unretain_cpu(self): + self.check_type_mismatch(self.x, False) @attr.gpu def test_type_mismatch_gpu(self): - self.check_type_mismatch(cuda.to_gpu(self.x)) + self.check_type_mismatch(cuda.to_gpu(self.x), True) + + @attr.gpu + def test_type_mismatch_unretain_gpu(self): + self.check_type_mismatch(cuda.to_gpu(self.x), False) - def check_dtype_mismatch(self, x_data): + def check_dtype_mismatch(self, x_data, retain): xp = backend.get_array_module(x_data) class DummyFunction(chainer.Function): label = 'dummy_function' def forward(self, inputs): + if not retain: + self.retain_inputs(()) return xp.array(1, np.float32), def backward(self, inputs, grads): @@ -2112,19 +2124,28 @@ def backward(self, inputs, grads): y.backward() def test_dtype_mismatch_cpu(self): - self.check_dtype_mismatch(self.x) + self.check_dtype_mismatch(self.x, True) + + def test_dtype_mismatch_unretain_cpu(self): + self.check_dtype_mismatch(self.x, False) @attr.gpu def test_dtype_mismatch_gpu(self): - self.check_dtype_mismatch(cuda.to_gpu(self.x)) + self.check_dtype_mismatch(cuda.to_gpu(self.x), True) - def check_shape_mismatch(self, x_data): + @attr.gpu + def test_dtype_mismatch_unretain_gpu(self): + 
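As the widened parameterization above exercises, ``functions.clip`` accepts Python floats, plain ints, and NumPy scalars as bounds, and a reversed interval is now rejected with ``ValueError`` instead of ``AssertionError``. A short usage sketch::

    import numpy as np
    import chainer.functions as F

    x = np.random.uniform(-3, 3, (3, 2)).astype(np.float32)
    y = F.clip(x, -0.75, 1.53)     # elementwise clip into [-0.75, 1.53]
    y_int = F.clip(x, -1, 2)       # integer bounds are accepted as well

    try:
        F.clip(x, 1.0, -1.0)       # x_min > x_max
    except ValueError:
        pass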
self.check_dtype_mismatch(cuda.to_gpu(self.x), False) + + def check_shape_mismatch(self, x_data, retain): xp = backend.get_array_module(x_data) class DummyFunction(chainer.Function): label = 'dummy_function' def forward(self, inputs): + if not retain: + self.retain_inputs(()) return xp.array(1, np.float32), def backward(self, inputs, grads): @@ -2136,11 +2157,18 @@ def backward(self, inputs, grads): y.backward() def test_shape_mismatch_cpu(self): - self.check_shape_mismatch(self.x) + self.check_shape_mismatch(self.x, True) + + def test_shape_mismatch_unretain_cpu(self): + self.check_shape_mismatch(self.x, False) @attr.gpu def test_shape_mismatch_gpu(self): - self.check_shape_mismatch(cuda.to_gpu(self.x)) + self.check_shape_mismatch(cuda.to_gpu(self.x), True) + + @attr.gpu + def test_shape_mismatch_unretain_gpu(self): + self.check_shape_mismatch(cuda.to_gpu(self.x), False) class TestVariableBackwardErrorTraceback(unittest.TestCase): @@ -2548,7 +2576,7 @@ class TestVariableDoubleBackward(unittest.TestCase): def test_default_backward(self): x = chainer.Variable(np.empty((), np.float32)) - y = x * 2 + y = x * 2 # x.grad_var will be different from y.grad_var y.backward() assert x.grad_var is not y.grad_var assert x.grad_var.creator is None @@ -2592,11 +2620,14 @@ class TestVariableDoubleBackwardOneElementScalar(unittest.TestCase): def test_default_backward(self): x = chainer.Variable(np.empty(1, np.float32)) - y = F.identity(x) + y = x * 2 # x.grad_var will be different from y.grad_var with testing.assert_warns(DeprecationWarning): y.backward() assert x.grad_var.creator is None - x.grad_var.backward() + with warnings.catch_warnings(): + # ok to be warned that x.grad_var is old-styled scalar + warnings.simplefilter('ignore', DeprecationWarning) + x.grad_var.backward() assert y.grad_var.grad_var is None def test_raise_double_backprop(self): @@ -2605,7 +2636,10 @@ def test_raise_double_backprop(self): with testing.assert_warns(DeprecationWarning): y.backward(enable_double_backprop=True) with pytest.raises(RuntimeError): - x.grad_var.backward() + with warnings.catch_warnings(): + # ok to be warned that x.grad_var is old-styled scalar + warnings.simplefilter('ignore', DeprecationWarning) + x.grad_var.backward() def test_raise_double_backprop_2(self): x = chainer.Variable(np.empty(1, np.float32)) @@ -2614,7 +2648,10 @@ def test_raise_double_backprop_2(self): with testing.assert_warns(DeprecationWarning): y.backward(enable_double_backprop=True) with pytest.raises(RuntimeError): - x.grad_var.backward() + with warnings.catch_warnings(): + # ok to be warned that x.grad_var is old-styled scalar + warnings.simplefilter('ignore', DeprecationWarning) + x.grad_var.backward() def test_grad_raise_double_backprop(self): x = chainer.Variable(np.empty(1, np.float32))
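The new ``retain`` flag in the tests above hinges on ``Function.retain_inputs``: called inside ``forward`` with an empty tuple, it tells Chainer not to keep any inputs for the backward pass. A minimal sketch of an old-style ``Function`` using it (the doubling function itself is only illustrative)::

    import numpy as np
    import chainer

    class DoubleWithoutRetain(chainer.Function):

        def forward(self, inputs):
            # Retain nothing; backward() below does not touch `inputs`.
            self.retain_inputs(())
            x, = inputs
            return 2 * x,

        def backward(self, inputs, grad_outputs):
            gy, = grad_outputs
            return 2 * gy,

    x = chainer.Variable(np.array([1.0, 2.0], dtype=np.float32))
    y = DoubleWithoutRetain()(x)
    y.grad = np.ones_like(y.array)
    y.backward()
    assert np.allclose(x.grad, 2.0)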