From 23f75541946884167364a5cc513699661dcfe8ff Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Date: Fri, 21 Sep 2018 16:06:32 -0300
Subject: [PATCH] internal/bytealg: improve performance of IndexByte for ppc64x

Use addi+lvx instruction fusion and remove register dependencies in
the main loop to improve performance.

benchmark                      old ns/op     new ns/op     delta
BenchmarkIndexByte/10-192      9.86          9.75          -1.12%
BenchmarkIndexByte/32-192      15.6          11.2          -28.21%
BenchmarkIndexByte/4K-192      155           97.6          -37.03%
BenchmarkIndexByte/4M-192      171790        129650        -24.53%
BenchmarkIndexByte/64M-192     6530982       5018424       -23.16%

benchmark                      old MB/s     new MB/s     speedup
BenchmarkIndexByte/10-192      1013.72      1025.76      1.01x
BenchmarkIndexByte/32-192      2049.47      2868.01      1.40x
BenchmarkIndexByte/4K-192      26422.69     41975.67     1.59x
BenchmarkIndexByte/4M-192      24415.17     32350.74     1.33x
BenchmarkIndexByte/64M-192     10275.46     13372.50     1.30x

Change-Id: Iedf17f01f374d58e85dcd6a972209bfcb7eb6063
Reviewed-on: https://go-review.googlesource.com/137415
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
---
 src/internal/bytealg/indexbyte_ppc64x.s | 28 ++++++++++++++-----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
index ccf897d99c451..61b33bc9cb27b 100644
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -38,14 +38,14 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
 	BR	indexbytebody<>(SB)
 
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-	DCBT	(R3)		// Prepare cache line.
 	MOVD	R3,R17		// Save base address for calculating the index later.
 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
 	ADD	R4,R3,R7	// Last acceptable address in R7.
+	DCBT	(R8)		// Prepare cache line.
 
 	RLDIMI	$16,R5,$32,R5
-	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
 	MOVD	$-1,R9
 	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
 	RLDIMI	$32,R5,$0,R5
@@ -56,7 +56,7 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #else
 	SRD	R6,R9,R9	// Same for Big Endian
 #endif
-	BLE	small_string	// Jump to the small string case if it's <32 bytes.
+	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.
 
 	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
 	// in V0, V1 and V10, then branch to the preloop.
@@ -97,7 +97,7 @@ qw_align:
 	LVSL	  (R0+R0),V11
 	VSLB	  V11,V10,V10
 	VSPLTB	  $7,V1,V1	// Replicate byte across V1
-	CMPU	  R4, $64	// If len <= 64, don't use the vectorized loop
+	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
 	BLE	  tail
 
 	// We will load 4 quardwords per iteration in the loop, so check for
@@ -131,7 +131,7 @@ qw_align:
 	// 64-byte aligned. Prepare for the main loop.
 preloop:
 	CMPU	R4,$64
-	BLE	tail	      // If len <= 64, don't use the vectorized loop
+	BLE	tail	      // If len ≤ 64, don't use the vectorized loop
 
 	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
 	// per loop iteration. The last doubleword is in R10, so our loop counter
@@ -140,30 +140,34 @@ preloop:
 	SRD	$6,R6,R9      // Loop counter in R9
 	MOVD	R9,CTR
 
+	ADD	$-64,R8,R8   // Adjust index for loop entry
 	MOVD	$16,R11      // Load offsets for the vector loads
 	MOVD	$32,R9
 	MOVD	$48,R7
 
 	// Main loop we will load 64 bytes per iteration
 loop:
+	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
 	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
-	LVX	    (R11+R8),V3
-	LVX	    (R9+R8),V4
-	LVX	    (R7+R8),V5
+	LVX	    (R8+R11),V3
 	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
 	VCMPEQUB    V1,V3,V7
+
+	LVX	    (R8+R9),V4
+	LVX	    (R8+R7),V5
 	VCMPEQUB    V1,V4,V8
 	VCMPEQUB    V1,V5,V9
+
 	VOR	    V6,V7,V11	      // Compress the result in a single vector
 	VOR	    V8,V9,V12
-	VOR	    V11,V12,V11
-	VCMPEQUBCC  V0,V11,V11	      // Check for byte
+	VOR	    V11,V12,V13
+	VCMPEQUBCC  V0,V13,V14	      // Check for byte
 	BGE	    CR6,found
-	ADD	    $64,R8,R8
 	BC	    16,0,loop	      // bdnz loop
 
-	// Handle the tailing bytes or R4 <= 64
+	// Handle the tailing bytes or R4 ≤ 64
 	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
 tail:
 	CMPU	    R4,$0
 	BEQ	    notfound