Skip to content

Commit

Permalink
asm/f64: add asm routine for sum
Browse files Browse the repository at this point in the history
  • Loading branch information
Kunde21 committed Aug 1, 2018
1 parent 1c02c29 commit b35a355
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 5 deletions.
6 changes: 1 addition & 5 deletions floats/floats.go
Original file line number Diff line number Diff line change
Expand Up @@ -899,11 +899,7 @@ func SubTo(dst, s, t []float64) []float64 {

// Sum returns the sum of the elements of the slice.
// An empty slice has a sum of 0.
//
// NOTE(review): this hunk is rendered without +/- markers. Per the hunk
// header (1 addition & 5 deletions), the accumulation loop below is the
// removed code and the f64.Sum call is the line that replaces it.
func Sum(s []float64) float64 {
// Removed by this commit: accumulate the elements in index order.
var sum float64
for _, val := range s {
sum += val
}
return sum
// Added by this commit: delegate the summation to f64.Sum.
return f64.Sum(s)
}

// Within returns the first index i where s[i] <= v < s[i+1]. Within panics if:
Expand Down
7 changes: 7 additions & 0 deletions internal/asm/f64/stubs_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,10 @@ func ScalInc(alpha float64, x []float64, n, incX uintptr)
// idst += incDst
// }
func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr)

// Sum returns the sum of the elements of x. An empty x yields 0.
// It is declared without a body here; the amd64 implementation is the
// assembly routine in sum_amd64.s. The reference behavior is
//  var sum float64
//  for i := range x {
//      sum += x[i]
//  }
//  return sum
// The assembly routine keeps several partial sums and combines them at the
// end, so its floating-point rounding may differ from the loop above.
func Sum(x []float64) float64
13 changes: 13 additions & 0 deletions internal/asm/f64/stubs_noasm.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,16 @@ func LinfDist(s, t []float64) float64 {
}
return norm
}

// Sum returns the sum of the elements of x, adding them one at a time in
// index order starting from zero. An empty (or nil) x yields 0.
func Sum(x []float64) float64 {
	total := 0.0
	for _, v := range x {
		total += v
	}
	return total
}
61 changes: 61 additions & 0 deletions internal/asm/f64/stubs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ func TestLinfDist(t *testing.T) {
expect: 6,
},
} {

sg_ln, tg_ln := 4+j%2, 4+j%3
v.s, v.t = guardVector(v.s, s_gd, sg_ln), guardVector(v.t, t_gd, tg_ln)
s_lc, t_lc := v.s[sg_ln:len(v.s)-sg_ln], v.t[tg_ln:len(v.t)-tg_ln]
Expand All @@ -572,3 +573,63 @@ func TestLinfDist(t *testing.T) {
}
}
}

// TestSum runs Sum over a table of inputs (empty, single element, mixed
// signs, NaN/Inf, and lengths up to 31) and compares each result with the
// expected total. Every input is wrapped by guardVector and the wrapped
// slice is re-checked with isValidGuard after the call.
func TestSum(t *testing.T) {
	// guard is the sentinel value handed to guardVector and isValidGuard.
	var guard float64 = -1

	type sumCase struct {
		src    []float64
		expect float64
	}
	cases := []sumCase{
		{src: []float64{}, expect: 0},
		{src: []float64{1}, expect: 1},
		{src: []float64{nan}, expect: nan},
		{src: []float64{1, 2, 3}, expect: 6},
		{src: []float64{1, -4, 3}, expect: 0},
		{src: []float64{1, 2, 3, 4}, expect: 10},
		{src: []float64{1, 1, nan, 1, 1}, expect: nan},
		{src: []float64{inf, 4, nan, -inf, 9}, expect: nan},
		{
			src:    []float64{1, 1, 1, 1, 9, 1, 1, 1, 2, 1, 1, 1, 1, 1, 5, 1},
			expect: 29,
		},
		{
			src:    []float64{1, 1, 1, 1, 9, 1, 1, 1, 2, 1, 1, 1, 1, 1, 5, 11, 1, 1, 1, 9, 1, 1, 1, 2, 1, 1, 1, 1, 1, 5, 1},
			expect: 67,
		},
	}

	for j, v := range cases {
		// Alternate the guard length between 4 and 5 from case to case.
		// NOTE(review): presumably this shifts the data's starting offset
		// inside the padded slice — confirm against guardVector.
		pad := 4 + j%2
		padded := guardVector(v.src, guard, pad)

		// Sum only the interior, leaving pad elements untouched at each end.
		got := Sum(padded[pad : len(padded)-pad])
		if !same(got, v.expect) {
			t.Errorf("Test %d Sum error Got: %v Expected: %v", j, got, v.expect)
		}
		if !isValidGuard(padded, guard, pad) {
			t.Errorf("Test %d Guard violated in src vector %v %v", j, padded[:pad], padded[len(padded)-pad:])
		}
	}
}
100 changes: 100 additions & 0 deletions internal/asm/f64/sum_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!appengine,!safe

#include "textflag.h"

// Register assignments used by Sum:
//   X_PTR  base address of x
//   IDX    index of the next element of x to read
//   LEN    len(x), later the number of 16-element loop iterations left
//   TAIL   scratch for the alignment check, later the element count whose
//          low bits drive the tail handling
//   SUM, SUM_1, SUM_2, SUM_3  partial sums; SUM also carries the result
#define X_PTR SI
#define IDX AX
#define LEN CX
#define TAIL BX
#define SUM X0
#define SUM_1 X1
#define SUM_2 X2
#define SUM_3 X3

// func Sum(x []float64) float64
//
// Sum returns the sum of the elements of x; an empty x sums to 0.
// Elements are accumulated into four two-lane partial sums that are folded
// together at the end, so the additions are not performed in strict index
// order.
TEXT ·Sum(SB), NOSPLIT, $0
	MOVQ x_base+0(FP), X_PTR // X_PTR = &x[0]
	MOVQ x_len+8(FP), LEN    // LEN = len(x)
	XORQ IDX, IDX            // i = 0
	PXOR SUM, SUM            // SUM = 0
	CMPQ LEN, $0             // if LEN == 0 { return 0 }
	JE   sum_end

	PXOR SUM_1, SUM_1 // clear the remaining partial sums
	PXOR SUM_2, SUM_2
	PXOR SUM_3, SUM_3

	MOVQ X_PTR, TAIL // Check memory alignment
	ANDQ $15, TAIL   // TAIL = &x[0] % 16
	JZ   no_trim     // if TAIL == 0 { goto no_trim }

	// Align on 16-byte boundary: consume x[0] with a scalar add so the
	// packed adds below start at an aligned address.
	ADDSD (X_PTR), SUM // SUM[0] += x[0]
	INCQ  IDX          // i++
	DECQ  LEN          // LEN--
	JZ    sum_end      // if LEN == 0 { return } (flags come from DECQ LEN)

no_trim:
	MOVQ LEN, TAIL      // TAIL = number of elements still to add
	SHRQ $4, LEN        // LEN = floor( TAIL / 16 )
	JZ   sum_tail_start // if LEN == 0 { goto sum_tail_start }

sum_loop: // sum 16x wide do {
	ADDPD (X_PTR)(IDX*8), SUM      // SUM   += x[i:i+2]
	ADDPD 16(X_PTR)(IDX*8), SUM_1  // SUM_1 += x[i+2:i+4]
	ADDPD 32(X_PTR)(IDX*8), SUM_2  // SUM_2 += x[i+4:i+6]
	ADDPD 48(X_PTR)(IDX*8), SUM_3  // SUM_3 += x[i+6:i+8]
	ADDPD 64(X_PTR)(IDX*8), SUM    // SUM   += x[i+8:i+10]
	ADDPD 80(X_PTR)(IDX*8), SUM_1  // SUM_1 += x[i+10:i+12]
	ADDPD 96(X_PTR)(IDX*8), SUM_2  // SUM_2 += x[i+12:i+14]
	ADDPD 112(X_PTR)(IDX*8), SUM_3 // SUM_3 += x[i+14:i+16]
	ADDQ  $16, IDX                 // i += 16
	DECQ  LEN
	JNZ   sum_loop                 // } while --LEN > 0

sum_tail_start: // Fewer than 16 elements remain; the low 4 bits of TAIL select the steps below
	TESTQ $8, TAIL
	JZ    sum_tail4 // if TAIL & 8 == 0 { goto sum_tail4 }

	ADDPD (X_PTR)(IDX*8), SUM     // SUM   += x[i:i+2]
	ADDPD 16(X_PTR)(IDX*8), SUM_1 // SUM_1 += x[i+2:i+4]
	ADDPD 32(X_PTR)(IDX*8), SUM_2 // SUM_2 += x[i+4:i+6]
	ADDPD 48(X_PTR)(IDX*8), SUM_3 // SUM_3 += x[i+6:i+8]
	ADDQ  $8, IDX                 // i += 8

sum_tail4:
	ADDPD SUM_3, SUM   // fold four partial sums into two
	ADDPD SUM_2, SUM_1

	TESTQ $4, TAIL
	JZ    sum_tail2 // if TAIL & 4 == 0 { goto sum_tail2 }

	ADDPD (X_PTR)(IDX*8), SUM     // SUM   += x[i:i+2]
	ADDPD 16(X_PTR)(IDX*8), SUM_1 // SUM_1 += x[i+2:i+4]
	ADDQ  $4, IDX                 // i += 4

sum_tail2:
	ADDPD SUM_1, SUM // fold two partial sums into one

	TESTQ $2, TAIL
	JZ    sum_tail1 // if TAIL & 2 == 0 { goto sum_tail1 }

	ADDPD (X_PTR)(IDX*8), SUM // SUM += x[i:i+2]
	ADDQ  $2, IDX             // i += 2

sum_tail1:
	HADDPD SUM, SUM // SUM[0] = SUM[0] + SUM[1]

	TESTQ $1, TAIL
	JZ    sum_end // if TAIL & 1 == 0 { return }

	ADDSD (X_PTR)(IDX*8), SUM // SUM[0] += x[i]

sum_end: // return sum
	MOVSD SUM, ret+24(FP)
	RET

0 comments on commit b35a355

Please sign in to comment.