diff --git a/docs/layers/core.md b/docs/layers/core.md
index 575e739..7bc2159 100644
--- a/docs/layers/core.md
+++ b/docs/layers/core.md
@@ -16,7 +16,7 @@ An alternative to Keras [Dropout](https://keras.io/layers/core/#dropout) which s
 gandlf.layers.BatchSimilarity(similarity='exp_l1')
 ````

-Calculates the minibatch similarities, a trick introduced in [Improved Techniques for Training GANs](https://arxiv.org/abs/1606.03498). These similarities can be added as features for the existing layer by using a Merge layer. The layer outputs a Tensor with shape `(batch_size, num_similarities)` for 2D tensors, `(batch_size, None, num_similarities)` for 3D Tensors, and so on.
+Calculates the minibatch similarities, a trick introduced in [Improved Techniques for Training GANs](https://arxiv.org/abs/1606.03498). These similarities can be added as features for the existing layer by using a Merge layer. The layer takes as input a 2D Tensor with shape `(batch_size, num_features)` and outputs a Tensor with shape `(batch_size, num_similarities)`, where `num_similarities` is the total number of computed similarities.

 In order to make this layer linear time with respect to the batch size, instead of doing a pairwise comparison between each pair of samples in the batch, for each sample a random sample is uniformly selected with which to do pairwise comparison.

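+For example, the similarity features can be concatenated onto an existing 2D hidden layer (a sketch, assuming the Keras 1.x functional API and a tensor `hidden` of shape `(batch_size, num_features)`; the new `n` argument repeats a similarity with a fresh random pairing each time):
+
+````python
+sims = gandlf.layers.BatchSimilarity(similarity=['exp_l1', 'l2'], n=[2, 1])(hidden)
+hidden = keras.layers.merge([hidden, sims], mode='concat')  # 3 extra features
+````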
diff --git a/docs/layers/wrappers.md b/docs/layers/wrappers.md
index 563262c..008b2e5 100644
--- a/docs/layers/wrappers.md
+++ b/docs/layers/wrappers.md
@@ -8,7 +8,7 @@ gandlf.layers.Residual(layer, merge_mode='sum')
 ````

 Applies a residual to any Keras layer or model, so long as its inputs are the same dimension as its outputs. Useful for implementing residual architectures.

-The provided `layer` has the have the same input and output dimensions. Given an input `x`, the output is:
+The provided `layer` has to have the same input and output dimensions. Given an input `x`, the output is:

 ````python
 output = merge_mode(x, layer(x))
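+
+# With the default merge_mode='sum', this is the standard residual
+# connection: output = x + layer(x).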
diff --git a/examples/mnist_rnn_gan.py b/examples/mnist_rnn_gan.py
index c53cd50..50aec40 100644
--- a/examples/mnist_rnn_gan.py
+++ b/examples/mnist_rnn_gan.py
@@ -87,7 +87,7 @@ def build_discriminator(mode):
     """Builds the discriminator model."""

     image = keras.layers.Input((28, 28, 1), name='real_data')
-    flat = keras.layers.Reshape((28, 28))(image)
+    rnn_input = keras.layers.Reshape((28, 28))(image)

     rnn_1 = keras.layers.LSTM(128, return_sequences=True)
     rnn_2 = keras.layers.LSTM(1, return_sequences=False, name='pred_fake')
@@ -100,22 +100,22 @@ def build_discriminator(mode):
         rnn_1_attn = gandlf.layers.RecurrentAttention1D(rnn_1, embedded)
         rnn_2_attn = gandlf.layers.RecurrentAttention1D(rnn_2, embedded,
                                                         name='pred_fake')
-        pred_fake = rnn_2_attn(rnn_1_attn(flat))
+        pred_fake = rnn_2_attn(rnn_1_attn(rnn_input))

         return keras.models.Model(input=[image, input_class],
                                   output=pred_fake)

     elif mode == '2d':  # Pay attention to whole image.
         ref_image = keras.layers.Input((28, 28, 1), name='ref_image_dis')
-        flat = keras.layers.Reshape((28, 28))(ref_image)
-        rnn_1_attn = gandlf.layers.RecurrentAttention2D(rnn_1, flat)
-        rnn_2_attn = gandlf.layers.RecurrentAttention2D(rnn_2, flat,
+        attn_reshaped = keras.layers.Reshape((28, 28))(ref_image)
+        rnn_1_attn = gandlf.layers.RecurrentAttention2D(rnn_1, attn_reshaped)
+        rnn_2_attn = gandlf.layers.RecurrentAttention2D(rnn_2, attn_reshaped,
                                                         name='pred_fake')
-        pred_fake = rnn_2_attn(rnn_1_attn(flat))
+        pred_fake = rnn_2_attn(rnn_1_attn(rnn_input))

         return keras.models.Model(input=[image, ref_image],
                                   output=pred_fake)

     else:
-        pred_fake = rnn_2(rnn_1(flat))
+        pred_fake = rnn_2(rnn_1(rnn_input))
         return keras.models.Model(input=image, output=pred_fake)
diff --git a/gandlf/layers/core.py b/gandlf/layers/core.py
index 2c23217..1bbe4de 100644
--- a/gandlf/layers/core.py
+++ b/gandlf/layers/core.py
@@ -26,9 +26,8 @@ class BatchSimilarity(keras.layers.Layer):
     """Calculates intrabatch similarity, for minibatch discrimination.

     The minibatch similarities can be added as features for the existing
-    layer by using a Merge layer. The layer outputs a Tensor with shape
-    (batch_size, num_similarities) for 2D tensors, (batch_size, None,
-    num_similarities) for 3D tensors, and so on.
+    layer by using a Merge layer. The layer only works for inputs with shape
+    (batch_size, num_features). Inputs with more dimensions can be flattened.

     In order to make this layer linear time with respect to the batch size,
     instead of doing a pairwise comparison between each pair of samples in
@@ -40,38 +39,49 @@ class BatchSimilarity(keras.layers.Layer):
         possible types. Alternatively, it can be a function which takes
         two tensors as inputs and returns their similarity. A list or
         tuple of similarities will apply all the similarities.
+        n: int or list of ints (one for each similarity), number of times to
+            repeat each similarity, drawing a new random sample for the
+            pairwise comparison each time.

     Reference: "Improved Techniques for Training GANs"
         https://arxiv.org/abs/1606.03498
     """

-    def __init__(self, similarity='exp_l1', **kwargs):
-        if isinstance(similarity, (list, tuple)):
-            self.similarities = [similarities.get(s) for s in similarity]
-        else:
-            self.similarities = [similarities.get(similarity)]
+    def __init__(self, similarity='exp_l1', n=1, **kwargs):
+        if not isinstance(similarity, (list, tuple)):
+            similarity = [similarity]
+        if not isinstance(n, (list, tuple)):
+            n = [n for _ in similarity]
+
+        self.similarities = [similarities.get(s) for s in similarity]
+        self.n = n
+
         super(BatchSimilarity, self).__init__(**kwargs)

     def build(self, input_shape):
-        if len(input_shape) < 2:
+        if len(input_shape) != 2:
             raise ValueError('The input to a BatchSimilarity layer must be '
-                             'at least 2D. Got %d dims.' % len(input_shape))
+                             '2D. Got %d dims.' % len(input_shape))

     def call(self, x, mask=None):
         sims = []
-        for sim in self.similarities:
-            batch_size = K.shape(x)[0]
-            idx = K.random_uniform((batch_size,), low=0, high=batch_size,
-                                   dtype='int32')
-            x_shuffled = K.gather(x, idx)
-            sims.append(sim(x, x_shuffled))
+        for n, sim in zip(self.n, self.similarities):
+            for _ in range(n):
+                batch_size = K.shape(x)[0]
+                idx = K.random_uniform((batch_size,), low=0, high=batch_size,
+                                       dtype='int32')
+                x_shuffled = K.gather(x, idx)
+                pair_sim = sim(x, x_shuffled)
+                for _ in range(K.ndim(x) - 1):
+                    pair_sim = K.expand_dims(pair_sim, dim=1)
+                sims.append(pair_sim)

         return K.concatenate(sims, axis=-1)

     def get_output_shape_for(self, input_shape):
-        if len(input_shape) < 2:
+        if len(input_shape) != 2:
             raise ValueError('The input to a BatchSimilarity layer must be '
-                             'at least 2D. Got %d dims.' % len(input_shape))
+                             '2D. Got %d dims.' % len(input_shape))

         output_shape = list(input_shape)
-        output_shape[-1] = len(self.similarities)
+        output_shape[-1] = sum(self.n)
         return tuple(output_shape)
diff --git a/gandlf/similarities.py b/gandlf/similarities.py
index 474059b..505e983 100644
--- a/gandlf/similarities.py
+++ b/gandlf/similarities.py
@@ -1,13 +1,3 @@
-"""
-Guidelines: All similarities take two tensors and return a tensor with
-shape (batch_size, 1 ... 1), which is the sample-wise similarity between the
-two tensors.
-
-Each similarity function is:
-  - Symmetric, so similarity(a, b) == similarity(b, a)
-  - Monotonically increasing with respect to `a - b`, for a >= b
-"""
-
 import keras
 import keras.backend as K
 from keras.utils.generic_utils import get_from_module
@@ -28,13 +18,13 @@ def exp_l2(a, b):


 def l1(a, b):
     """L1 similarity. Maximum is 0 (a == b), minimum is -inf."""
-    return -K.sum(K.abs(a - b), axis=range(1, K.ndim(a)), keepdims=True)
+    return -K.sum(K.abs(a - b), axis=-1)


 def l2(a, b):
     """L2 similarity. Maximum is 0 (a == b), minimum is -inf."""
-    return -K.sum(K.square(a - b), axis=range(1, K.ndim(a)), keepdims=True)
+    return -K.sum(K.square(a - b), axis=-1)


 def cosine(a, b):
@@ -42,19 +32,19 @@ def cosine(a, b):
     a = K.l2_normalize(a)
     b = K.l2_normalize(b)

-    return 1 - K.mean(a * b, axis=range(1, K.ndim(a)), keepdims=True)
+    return 1 - K.mean(a * b, axis=-1)


 def sigmoid(a, b):
     """Sigmoid similarity. Maximum is 1 (a == b), minimum is 0."""
-    return K.sigmoid(K.sum(a * b, axis=range(1, K.ndim(a)), keepdims=True))
+    return K.sigmoid(K.sum(a * b, axis=-1))


 def euclidean(a, b):
     """Euclidean similarity. Maximum is 1 (a == b), minimum is 0 (a == -b)."""
-    x = K.sum(K.square(a - b), axis=range(1, K.ndim(a)), keepdims=True)
+    x = K.sum(K.square(a - b), axis=-1)

     return 1. / (1. + x)
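+
+
+# Note: with the 2D inputs that BatchSimilarity passes in, each similarity
+# above takes two tensors of shape (batch_size, num_features) and returns a
+# tensor of shape (batch_size,); BatchSimilarity expands this to
+# (batch_size, 1) before concatenating the features. For example (a sketch):
+#
+#     K.eval(l1(K.variable([[1., 2.]]), K.variable([[3., 4.]])))
+#     # -> array([-4.], dtype=float32)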