diff --git a/chainer/computational_graph.py b/chainer/computational_graph.py
index c88c91b3bfef..85b150978aff 100644
--- a/chainer/computational_graph.py
+++ b/chainer/computational_graph.py
@@ -115,11 +115,11 @@ def _to_dot(self):
     def dump(self, format='dot'):
         """Dumps graph as a text.

-        Args
+        Args:
            format(str): The graph language name of the output.
            Currently, it must be 'dot'.

-        Returns
+        Returns:
            str: The graph in specified format.

        """
diff --git a/chainer/functions/activation/hard_sigmoid.py b/chainer/functions/activation/hard_sigmoid.py
index 94b39096bddb..693d1e2c9d8b 100644
--- a/chainer/functions/activation/hard_sigmoid.py
+++ b/chainer/functions/activation/hard_sigmoid.py
@@ -8,7 +8,7 @@
 class HardSigmoid(function.Function):

-    """Hard-sigmoid funciton."""
+    """Hard-sigmoid function."""

     def check_type_forward(self, in_types):
         type_check.expect(in_types.size() == 1)
diff --git a/chainer/functions/activation/lstm.py b/chainer/functions/activation/lstm.py
index d1ff1a1d3f50..012850eb6c8b 100644
--- a/chainer/functions/activation/lstm.py
+++ b/chainer/functions/activation/lstm.py
@@ -180,9 +180,9 @@ def lstm(c_prev, x):
     This function supports variable length inputs. The mini-batch size of
     the current input must be equal to or smaller than that of the previous
     one. When mini-batch size of ``x`` is smaller than that of ``c``, this
-    funciton only updates ``c[0:len(x)]`` and doesn't change the rest of ``c``,
+    function only updates ``c[0:len(x)]`` and doesn't change the rest of ``c``,
     ``c[len(x):]``.
-    So, please sort input sequneces in descending order of lengths before
+    So, please sort input sequences in descending order of lengths before
     applying the function.

     Args:
diff --git a/chainer/functions/activation/prelu.py b/chainer/functions/activation/prelu.py
index b206199e26e2..fbd25a986ef6 100644
--- a/chainer/functions/activation/prelu.py
+++ b/chainer/functions/activation/prelu.py
@@ -88,7 +88,7 @@ def prelu(x, W):

     For example :math:`W` has the shape of :math:`(2, 3, 4)`,
     :math:`x` must have the shape of :math:`(B, 2, 3, 4, S1, ..., SN)`
-    where B is batchsize and the number of trailing S's
+    where B is batch size and the number of trailing S's
     is arbitrary non-negative integer.

     Args:
diff --git a/chainer/functions/array/separate.py b/chainer/functions/array/separate.py
index 3421088539af..ebafa155cc90 100644
--- a/chainer/functions/array/separate.py
+++ b/chainer/functions/array/separate.py
@@ -13,7 +13,7 @@ def separate(x, axis=0):

     Args:
         x (chainer.Variable): Variable to be separated.
-        axis (int): Axis alog which variables are separated.
+        axis (int): Axis along which variables are separated.

     Returns:
         tuple of chainer.Variable: Output variables.
diff --git a/chainer/functions/loss/contrastive.py b/chainer/functions/loss/contrastive.py
index 2cfaf5573d09..94c50945a7bd 100644
--- a/chainer/functions/loss/contrastive.py
+++ b/chainer/functions/loss/contrastive.py
@@ -70,7 +70,7 @@ def contrastive(x0, x1, y, margin=1):

     It takes a pair of variables and a label as inputs. The label is 1 when
     those two input variables are similar, or 0 when they are dissimilar. Let
-    :math:`N` and :math:`K` denote mini-batchsize and the dimension of input
+    :math:`N` and :math:`K` denote mini-batch size and the dimension of input
     variables, respectively. The shape of both input variables should be
     ``(N, K)``.

@@ -86,12 +86,12 @@

     Args:
         x0 (~chainer.Variable): The first input variable. The shape should be
-            (N, K), where N denotes the minibatch size, and K denotes the
+            (N, K), where N denotes the mini-batch size, and K denotes the
             dimension of x0.
         x1 (~chainer.Variable): The second input variable. The shape should
             be the same as x0.
         y (~chainer.Variable): Labels. All values should be 0 or 1. The shape
-            should be ``(N,)``, where N denotes the minibatch size.
+            should be ``(N,)``, where N denotes the mini-batch size.
         margin (float): A parameter for contrastive loss. It should be
             positive value.
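As a rough illustration of the ``(N, K)`` shape convention documented above for ``contrastive`` (the array sizes and values below are made up; plain NumPy arrays are accepted in place of Variables)::

    import numpy as np
    import chainer.functions as F

    # Two pairs (N = 2) of 3-dimensional feature vectors (K = 3).
    x0 = np.array([[1.0, 0.0, 0.0], [0.5, 0.5, 0.0]], dtype=np.float32)
    x1 = np.array([[0.9, 0.1, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32)
    # y[i] is 1 for a similar pair and 0 for a dissimilar pair.
    y = np.array([1, 0], dtype=np.int32)

    loss = F.contrastive(x0, x1, y, margin=1.0)  # scalar Variable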
diff --git a/chainer/functions/loss/hinge.py b/chainer/functions/loss/hinge.py
index 47b17839d44c..d6eb2a08a1d8 100644
--- a/chainer/functions/loss/hinge.py
+++ b/chainer/functions/loss/hinge.py
@@ -96,7 +96,7 @@ def hinge(x, t, norm='L1'):
         L = \\frac{1}{N} \\sum_{n=1}^N \\sum_{k=1}^K \\left[
         \\max(0, 1 - \\delta\\{l_n = k\\} t_{nk}) \\right]^p

-    where :math:`N` denotes the batchsize, :math:`K` is the number of
+    where :math:`N` denotes the batch size, :math:`K` is the number of
     classes of interest,

     .. math::
diff --git a/chainer/functions/loss/triplet.py b/chainer/functions/loss/triplet.py
index f8f8a00dca7b..ecd7d3a848ae 100644
--- a/chainer/functions/loss/triplet.py
+++ b/chainer/functions/loss/triplet.py
@@ -64,7 +64,7 @@ def triplet(anchor, positive, negative, margin=0.2):
     It takes a triplet of variables as inputs, :math:`a`, :math:`p` and
     :math:`n`: anchor, positive example and negative example respectively.
     The triplet defines a relative similarity between samples.
-    Let :math:`N` and :math:`K` denote mini-batchsize and the dimension of
+    Let :math:`N` and :math:`K` denote mini-batch size and the dimension of
     input variables, respectively. The shape of all input variables should
     be :math:`(N, K)`.

diff --git a/chainer/functions/normalization/l2_normalization.py b/chainer/functions/normalization/l2_normalization.py
index 0f3f8611dc04..61733c4715dd 100644
--- a/chainer/functions/normalization/l2_normalization.py
+++ b/chainer/functions/normalization/l2_normalization.py
@@ -92,7 +92,7 @@ def normalize(x, eps=1e-5):

     This function implements L2 normalization on a 1D vector. No reduction
     is done along batch axis. Let :math:`x` be an input vector of dimension
-    :math:`(N, K)`, where :math:`N` and :math:`K` denote mini-batchsize and the
+    :math:`(N, K)`, where :math:`N` and :math:`K` denote mini-batch size and the
     dimension of the input variable. Then, this function computes an output
     vector :math:`y` by the following equation:

diff --git a/chainer/links/connection/lstm.py b/chainer/links/connection/lstm.py
index 3745532130a5..e0f9dddbb66b 100644
--- a/chainer/links/connection/lstm.py
+++ b/chainer/links/connection/lstm.py
@@ -100,7 +100,7 @@ class LSTM(LSTMBase):
     When mini-batch size of ``i``-th input is smaller than that of the
     previous input, this link only updates ``c[0:len(x)]`` and
     ``h[0:len(x)]`` and doesn't change the rest of ``c`` and ``h``.
-    So, please sort input sequneces in descending order of lengths before
+    So, please sort input sequences in descending order of lengths before
     applying the function.

     Args:
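As a rough illustration of the sorted-sequence convention described in the ``LSTM`` link docstring above (the sizes and random inputs are made up)::

    import numpy as np
    import chainer.links as L

    lstm = L.LSTM(3, 4)  # 3-dimensional inputs, 4 hidden units
    lstm.reset_state()

    # Three sequences of lengths 3, 2 and 1, already sorted in descending
    # order of length, so the mini-batch only shrinks from step to step.
    steps = [
        np.random.randn(3, 3).astype(np.float32),  # step 0: all three sequences
        np.random.randn(2, 3).astype(np.float32),  # step 1: the two longest
        np.random.randn(1, 3).astype(np.float32),  # step 2: the longest only
    ]
    for x in steps:
        h = lstm(x)  # only c[0:len(x)] and h[0:len(x)] of the stored state change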
diff --git a/chainer/links/normalization/batch_normalization.py b/chainer/links/normalization/batch_normalization.py
index bc2d484b868c..f1ca32c7f054 100644
--- a/chainer/links/normalization/batch_normalization.py
+++ b/chainer/links/normalization/batch_normalization.py
@@ -21,7 +21,7 @@ class BatchNormalization(link.Link):

     In fine-tuning mode, it accumulates the input to compute *population
     statistics*. In order to correctly compute the population statistics, a
-    user must use this mode to feed mini batches running through whole training
+    user must use this mode to feed mini-batches running through whole training
     dataset.

     In testing mode, it uses pre-computed population statistics to normalize
diff --git a/chainer/training/extensions/print_report.py b/chainer/training/extensions/print_report.py
index 301415fe89ec..e7c3df3b4d6c 100644
--- a/chainer/training/extensions/print_report.py
+++ b/chainer/training/extensions/print_report.py
@@ -16,7 +16,7 @@ class PrintReport(extension.Extension):
         log_report (str or LogReport): Log report to accumulate the
             observations. This is either the name of a LogReport extensions
             registered to the trainer, or a LogReport instance to use
-            interanlly.
+            internally.
         out: Stream to print the bar. Standard output is used by default.

     """
diff --git a/docs/source/contribution.rst b/docs/source/contribution.rst
index cd6583922287..738553df2f5e 100644
--- a/docs/source/contribution.rst
+++ b/docs/source/contribution.rst
@@ -100,11 +100,11 @@ To check your code, use ``autopep8`` and ``flake8`` command installed by ``hacki

   $ autopep8 --global-config .pep8 path/to/your/code.py
   $ flake8 path/to/your/code.py

-To check Cython code, use ``.flake8.cython`` config file::
+To check Cython code, use ``.flake8.cython`` configuration file::

   $ flake8 --config=.flake8.cython path/to/your/cython/code.pyx

-The autopep8 supports automatically correct Python code to conform to the PEP 8 style guide::
+The ``autopep8`` supports automatically correct Python code to conform to the PEP 8 style guide::

   $ autopep8 --in-place --global-config .pep8 path/to/your/code.py
diff --git a/docs/source/reference/iterators.rst b/docs/source/reference/iterators.rst
index a8041cc13728..60b2fa1c0306 100644
--- a/docs/source/reference/iterators.rst
+++ b/docs/source/reference/iterators.rst
@@ -5,9 +5,9 @@
 Iterator examples
 =================

-Chainer provides some iterators that implement typical strategies to create minibatches by iterating over datasets.
+Chainer provides some iterators that implement typical strategies to create mini-batches by iterating over datasets.
 :class:`SerialIterator` is the simplest one, which extract mini batches in the main thread.
-:class:`MultiprocessIterator` is a parallelized version of :class:`SerialIterator`. It maintains worker subprocesses to load the next mini batch in parallel.
+:class:`MultiprocessIterator` is a parallelized version of :class:`SerialIterator`. It maintains worker subprocesses to load the next mini-batch in parallel.


 SerialIterator
diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt
index 57b5b7a796d1..79224946c4f5 100644
--- a/docs/source/spelling_wordlist.txt
+++ b/docs/source/spelling_wordlist.txt
@@ -46,6 +46,7 @@ deserialized
 deserializes
 deserializer
 deserializers
+deserializing
 differentiable
 dimensionalities
 dimensionality
@@ -61,7 +62,8 @@ finalizer
 functionalities
 gaussian
 GPU
-grayscale
+grey
+greyscale
 gzip
 hdf
 huber
diff --git a/docs/source/tutorial/basic.rst b/docs/source/tutorial/basic.rst
index a6acfbdbe7ce..03500f4b907a 100644
--- a/docs/source/tutorial/basic.rst
+++ b/docs/source/tutorial/basic.rst
@@ -336,7 +336,7 @@ When we want to train neural networks, we have to run *training loops* that upda
 A typical training loop consists of following procedures:

 1. Iterations over training datasets
-2. Preprocessing of extracted minibatches
+2. Preprocessing of extracted mini-batches
 3. Forward/backward computations of the neural networks
 4. Parameter updates
 5. Evaluations of the current parameters on validation datasets
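As a rough sketch of these five steps written out by hand (the single-layer model, SGD optimizer, and random stand-in data are illustrative only, not part of the tutorial)::

    import numpy as np
    import chainer.links as L
    from chainer import optimizers

    # Random stand-ins for a real training set of 784-dimensional inputs.
    x_data = np.random.randn(1000, 784).astype(np.float32)
    t_data = np.random.randint(0, 10, size=1000).astype(np.int32)

    model = L.Classifier(L.Linear(784, 10))
    optimizer = optimizers.SGD()
    optimizer.setup(model)

    batch_size = 100
    for epoch in range(5):
        perm = np.random.permutation(len(x_data))   # 1. iterate over the dataset
        for i in range(0, len(x_data), batch_size):
            idx = perm[i:i + batch_size]
            x, t = x_data[idx], t_data[idx]         # 2. extract a mini-batch
            loss = model(x, t)                      # 3. forward computation
            model.cleargrads()
            loss.backward()                         # 3. backward computation
            optimizer.update()                      # 4. update the parameters
        # 5. evaluation on a validation set would go here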
@@ -415,7 +415,7 @@ This MNIST example is also found in the `examples/mnist