diff --git a/chainer/functions/activation/tree_lstm.py b/chainer/functions/activation/tree_lstm.py
index 1ede35fa89f9..0b96dcebd98f 100644
--- a/chainer/functions/activation/tree_lstm.py
+++ b/chainer/functions/activation/tree_lstm.py
@@ -198,7 +198,7 @@ def tree_lstm(*inputs):
     This function implements TreeLSTM units both for
     N-ary TreeLSTM and Child-Sum TreeLSTM.
     Let the children cell states
-    :math:`c_{\\text{1}}, c_{\\text{2}}, \dots, c_{\\text{N}}`,
+    :math:`c_{\\text{1}}, c_{\\text{2}}, \\dots, c_{\\text{N}}`,
     and the incoming signal :math:`x`.
 
     First, the incoming signal :math:`x` is split into (3 + N) arrays
diff --git a/chainer/functions/connection/convolution_nd.py b/chainer/functions/connection/convolution_nd.py
index 06a08b341abf..45e1ccb7b7a3 100644
--- a/chainer/functions/connection/convolution_nd.py
+++ b/chainer/functions/connection/convolution_nd.py
@@ -302,7 +302,7 @@ def convolution_nd(x, W, b=None, stride=1, pad=0, cover_all=False):
 
     .. math::
 
-       l_n = (d_n + 2p_n - k_n) / s_n + 1 \ \ (n = 1, ..., N)
+       l_n = (d_n + 2p_n - k_n) / s_n + 1 \\ \\ (n = 1, ..., N)
 
     If ``cover_all`` option is ``True``, the filter will cover the all
     spatial locations. So, if the last stride of filter does not cover the
@@ -312,7 +312,7 @@ def convolution_nd(x, W, b=None, stride=1, pad=0, cover_all=False):
 
     .. math::
 
-       l_n = (d_n + 2p_n - k_n + s_n - 1) / s_n + 1 \ \ (n = 1, ..., N)
+       l_n = (d_n + 2p_n - k_n + s_n - 1) / s_n + 1 \\ \\ (n = 1, ..., N)
 
     The N-dimensional convolution function is defined as follows.
 
diff --git a/chainer/functions/connection/deconvolution_nd.py b/chainer/functions/connection/deconvolution_nd.py
index 8bc4a3042ffa..6dd937556780 100644
--- a/chainer/functions/connection/deconvolution_nd.py
+++ b/chainer/functions/connection/deconvolution_nd.py
@@ -313,7 +313,7 @@ def deconvolution_nd(x, W, b=None, stride=1, pad=0, outsize=None):
 
     .. math::
 
-       l_n = s_n (d_n - 1)  + k_n - 2 p_n \ \ (n = 1, ..., N)
+       l_n = s_n (d_n - 1)  + k_n - 2 p_n \\ \\ (n = 1, ..., N)
 
     If ``outsize`` option is given, the output size is determined by
     ``outsize``. In this case, the ``outsize`` :math:`(l_1, l_2, ..., l_N)`
@@ -321,7 +321,8 @@ def deconvolution_nd(x, W, b=None, stride=1, pad=0, outsize=None):
 
     .. math::
 
-       d_n = \\lfloor (l_n + 2p_n - k_n) / s_n \\rfloor + 1 \ \ (n = 1, ..., N)
+       d_n = \\lfloor (l_n + 2p_n - k_n) / s_n \\rfloor + 1 \\ \\ \
+       (n = 1, ..., N)
 
     Args:
         x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
diff --git a/chainer/functions/loss/vae.py b/chainer/functions/loss/vae.py
index 2b4b2c6a707e..7ca0bdb19cd0 100644
--- a/chainer/functions/loss/vae.py
+++ b/chainer/functions/loss/vae.py
@@ -65,7 +65,8 @@ def bernoulli_nll(x, y, reduce='sum'):
 
     .. math::
 
-        -\\log B(x; p) = -\\sum_i \{x_i \\log(p_i) + (1 - x_i)\\log(1 - p_i)\},
+        -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + \
+        (1 - x_i)\\log(1 - p_i)\\},
 
     where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid
     function, and :math:`B(x; p)` is a Bernoulli distribution.
diff --git a/chainer/training/extensions/parameter_statistics.py b/chainer/training/extensions/parameter_statistics.py
index 77e606c49246..3d419e3499b8 100644
--- a/chainer/training/extensions/parameter_statistics.py
+++ b/chainer/training/extensions/parameter_statistics.py
@@ -10,8 +10,8 @@ class ParameterStatistics(extension.Extension):
     """Trainer extension to report parameter statistics.
 
     Statistics are collected and reported for a given :class:`~chainer.Link`
-    or an iterable of :class:`~chainer.Link`\s. If a link contains child links,
-    the statistics are reported separately for each child.
+    or an iterable of :class:`~chainer.Link`\\ s. If a link contains child
+    links, the statistics are reported separately for each child.
 
     Any function that takes a one-dimensional :class:`numpy.ndarray` or a
     :class:`cupy.ndarray` and outputs a single or multiple real numbers can be