-
Notifications
You must be signed in to change notification settings - Fork 6
/
sse_amd64.s
126 lines (106 loc) · 1.95 KB
/
sse_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include "textflag.h"
// SetupSlices loads the arguments of a func(dst, src []uint32) from the
// argument frame and leaves:
//   AX = &dst[0], BX = &src[0], CX = min(len(dst), len(src))
// BX temporarily holds len(src) for the comparison before it receives the
// src data pointer. The last line deliberately has no continuation
// backslash, so the macro body ends here no matter what line follows it.
#define SetupSlices \
	MOVQ dst_len+8(FP), CX; \
	MOVQ src_len+32(FP), BX; \
	CMPQ CX, BX; \
	CMOVQGT BX, CX; \
	MOVQ dst_base+0(FP), AX; \
	MOVQ src_base+24(FP), BX;
// Load4x fills the four registers V0..V3 with the 64 consecutive bytes that
// start at the address held in PTR (16 bytes per register, unaligned loads).
#define Load4x(PTR, V0, V1, V2, V3) \
	MOVOU 0(PTR), V0; \
	MOVOU 16(PTR), V1; \
	MOVOU 32(PTR), V2; \
	MOVOU 48(PTR), V3;
// Store4x writes the four registers V0..V3 to the 64 consecutive bytes that
// start at the address held in PTR (16 bytes per register, unaligned stores).
#define Store4x(PTR, V0, V1, V2, V3) \
	MOVOU V0, 0(PTR); \
	MOVOU V1, 16(PTR); \
	MOVOU V2, 32(PTR); \
	MOVOU V3, 48(PTR);
// Apply4x issues the two-operand instruction INSN four times, on the fixed
// register pairs (X0,X4), (X1,X5), (X2,X6) and (X3,X7). The second register
// of each pair is the destination operand.
#define Apply4x(INSN) \
	INSN X0, X4; \
	INSN X1, X5; \
	INSN X2, X6; \
	INSN X3, X7;
// func AddU32_SSE(dst, src []uint32)
//
// In-place element-wise addition: dst[i] += src[i] for every i below
// min(len(dst), len(src)). After SetupSlices, AX walks dst, BX walks src and
// CX counts the elements still to be processed. DX is scalar scratch.
TEXT ·AddU32_SSE(SB),NOSPLIT,$0-48
	SetupSlices

block16:
	// Consume 16 elements (64 bytes) per pass while at least 16 remain.
	SUBQ $16, CX
	JL   tail
	Load4x(BX, X0, X1, X2, X3)
	Load4x(AX, X4, X5, X6, X7)
	Apply4x(PADDL)
	Store4x(AX, X4, X5, X6, X7)
	ADDQ $64, AX
	ADDQ $64, BX
	JMP  block16

tail:
	// The failed SUBQ left CX in [-16, -1]; undo it to get the 0..15 leftovers.
	ADDQ $16, CX
	JZ   finished

single:
	// One element at a time for the remainder.
	MOVL (BX), DX
	ADDL DX, (AX)
	ADDQ $4, AX
	ADDQ $4, BX
	DECQ CX
	JNZ  single

finished:
	RET
// func SubU32_SSE(dst, src []uint32)
//
// In-place element-wise subtraction: dst[i] -= src[i] for every i below
// min(len(dst), len(src)). After SetupSlices, AX walks dst, BX walks src and
// CX counts the elements still to be processed. DX is scalar scratch.
//
// The frame spec declares the 48 bytes of arguments (two slice headers), the
// same as AddU32_SSE.
TEXT ·SubU32_SSE(SB),NOSPLIT,$0-48
	SetupSlices

vector:
	// Consume 16 elements (64 bytes) per pass while at least 16 remain.
	SUBQ $16, CX
	JL   trailing
	Load4x(BX, X0, X1, X2, X3)
	Load4x(AX, X4, X5, X6, X7)
	Apply4x(PSUBL)                  // X4..X7 (dst) -= X0..X3 (src)
	Store4x(AX, X4, X5, X6, X7)
	ADDQ $64, BX
	ADDQ $64, AX
	JMP  vector

trailing:
	// CX is in [-16, -1] here. Adding 17 yields leftovers+1, so the DECQ at
	// the top of the scalar loop reaches zero exactly when nothing is left.
	ADDQ $17, CX

elem:
	DECQ CX
	JZ   done
	MOVL (BX), DX
	SUBL DX, (AX)                   // dst[i] -= src[i]
	ADDQ $4, BX
	ADDQ $4, AX
	JMP  elem

done:
	RET
// MulLoU32 computes DST[i] = low 32 bits of DST[i] * SRC[i] for the four
// uint32 lanes of two XMM registers, using SSE2 instructions only. SRC is
// left unchanged; T0 and T1 are clobbered.
// PMULULQ multiplies only the low dword of each 64-bit half, so the even
// lanes are multiplied in place and the odd lanes are first shifted down
// into T0/T1. The low 32 bits of both products are then merged back.
#define MulLoU32(SRC, DST, T0, T1) \
	MOVO DST, T0; \
	MOVO SRC, T1; \
	PSRLQ $32, T0; \
	PSRLQ $32, T1; \
	PMULULQ SRC, DST; \
	PMULULQ T1, T0; \
	PSLLQ $32, DST; \
	PSRLQ $32, DST; \
	PSLLQ $32, T0; \
	POR T0, DST;

// func MulU32_SSE(dst, src []uint32)
//
// In-place element-wise multiplication modulo 2^32: dst[i] *= src[i] for
// every i below min(len(dst), len(src)). After SetupSlices, AX walks dst,
// BX walks src and CX counts the elements still to be processed. DX is
// scalar scratch; X8 and X9 are vector scratch.
//
// The vector path performs a true 32-bit multiply per lane (a 16-bit packed
// multiply such as PMULLW would corrupt uint32 data), so it matches the
// scalar IMULL tail.
TEXT ·MulU32_SSE(SB),NOSPLIT,$0-48
	SetupSlices

vector:
	// Consume 16 elements (64 bytes) per pass while at least 16 remain.
	SUBQ $16, CX
	JL   trailing
	Load4x(BX, X0, X1, X2, X3)
	Load4x(AX, X4, X5, X6, X7)
	MulLoU32(X0, X4, X8, X9)
	MulLoU32(X1, X5, X8, X9)
	MulLoU32(X2, X6, X8, X9)
	MulLoU32(X3, X7, X8, X9)
	Store4x(AX, X4, X5, X6, X7)
	ADDQ $64, BX
	ADDQ $64, AX
	JMP  vector

trailing:
	// CX is in [-16, -1] here. Adding 17 yields leftovers+1, so the DECQ at
	// the top of the scalar loop reaches zero exactly when nothing is left.
	ADDQ $17, CX

elem:
	DECQ  CX
	JZ    done
	MOVL  (BX), DX
	IMULL (AX), DX                  // DX = src[i] * dst[i] (low 32 bits)
	MOVL  DX, (AX)
	ADDQ  $4, BX
	ADDQ  $4, AX
	JMP   elem

done:
	RET