diff --git a/performer/fast_attention/tensorflow/fast_attention.py b/performer/fast_attention/tensorflow/fast_attention.py
index 32e6926dd11..8362bf639cd 100644
--- a/performer/fast_attention/tensorflow/fast_attention.py
+++ b/performer/fast_attention/tensorflow/fast_attention.py
@@ -169,13 +169,14 @@ def softmax_kernel_transformation(data,
   """
   data_normalizer = 1.0 / (
       tf.math.sqrt(tf.math.sqrt(tf.dtypes.cast(data.shape[-1], tf.float32))))
+  data = data_normalizer * data
   ratio = 1.0 / tf.math.sqrt(
       tf.dtypes.cast(projection_matrix.shape[0], tf.float32))
   data_dash = tf.einsum("blhd,md->blhm", data, projection_matrix)
   diag_data = tf.math.square(data)
   diag_data = tf.math.reduce_sum(
       diag_data, axis=tf.keras.backend.ndim(data) - 1)
-  diag_data = (diag_data / 2.0) * data_normalizer * data_normalizer
+  diag_data = diag_data / 2.0
   diag_data = tf.expand_dims(diag_data, axis=tf.keras.backend.ndim(data) - 1)
   if is_query:
     last_dims_t = (len(data_dash.shape) - 1,)
diff --git a/performer/models/slim_performer/pytorch/train.py b/performer/models/slim_performer/pytorch/train.py
index 7bfc1c9d4bd..33bc34f7983 100644
--- a/performer/models/slim_performer/pytorch/train.py
+++ b/performer/models/slim_performer/pytorch/train.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Example of training the SLiMPerformer on PennTreeBank and Enwik8 data."""
+"""Example of training the SLiMPerformer on PennTreeBank and Enwik8 data, as well as the copy task."""
 import collections
 import gzip
 import os