-
Notifications
You must be signed in to change notification settings - Fork 392
/
ARM64arrayCopy.spp
366 lines (346 loc) · 9.79 KB
/
ARM64arrayCopy.spp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
/*******************************************************************************
* Copyright (c) 2021, 2021 IBM Corp. and others
*
* This program and the accompanying materials are made available under
* the terms of the Eclipse Public License 2.0 which accompanies this
* distribution and is available at http://eclipse.org/legal/epl-2.0
* or the Apache License, Version 2.0 which accompanies this distribution
* and is available at https://www.apache.org/licenses/LICENSE-2.0.
*
* This Source Code may also be made available under the following Secondary
* Licenses when the conditions for such availability set forth in the
* Eclipse Public License, v. 2.0 are satisfied: GNU General Public License,
* version 2 with the GNU Classpath Exception [1] and GNU General Public
* License, version 2 with the OpenJDK Assembly Exception [2].
*
* [1] https://www.gnu.org/software/classpath/license.html
* [2] http://openjdk.java.net/legal/assembly-exception.html
*
* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception
*******************************************************************************/
// Logical source file name reported to the assembler.
.file "ARM64ArrayCopy.s"
// Exported symbols: the direction-selecting entry point, the two
// direction-specific entry points, and the forward (fw) / backward (bw)
// entry points for halfword, word, doubleword and quadword copies.
.global __arrayCopy
.global __forwardArrayCopy
.global __backwardArrayCopy
.global __fwHalfWordArrayCopy
.global __fwWordArrayCopy
.global __fwDoubleWordArrayCopy
.global __fwQuadWordArrayCopy
.global __bwHalfWordArrayCopy
.global __bwWordArrayCopy
.global __bwDoubleWordArrayCopy
.global __bwQuadWordArrayCopy
// Everything below is emitted into the code section.
.text
// NOTE(review): with GNU as on AArch64 the operand is a power of two, so this
// requests 4-byte alignment (one A64 instruction) - confirm for other assemblers.
.align 2
// Forward-copy entry points for callers that already know the alignment of the
// arrays (halfword, word, doubleword or quadword). Each entry performs only the
// two trivial-exit checks and then joins the generic forward copy path at the
// stage matching that alignment, skipping the alignment checks before it.
//
// in:    x0 - length in bytes
//        x1 - src addr
//        x2 - dst addr
// trash: x3, x4 (used by the shared copy path)
//
// NOTE(review): the fwBothAlignN stages can copy N bytes without re-checking
// x0, so these entries presumably require a length that is a non-zero multiple
// of N and addresses that are both N-byte aligned - confirm against callers.
__fwHalfWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // srcAddr == dstAddr: nothing to copy
        b       fwBothAlign2            // continue at the 2-byte aligned stage
// Forward copy entry for arrays known to be word (4-byte) aligned.
// in: x0 - length in bytes, x1 - src addr, x2 - dst addr
__fwWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // srcAddr == dstAddr: nothing to copy
        b       fwBothAlign4            // continue at the 4-byte aligned stage
// Forward copy entry for arrays known to be doubleword (8-byte) aligned.
// in: x0 - length in bytes, x1 - src addr, x2 - dst addr
__fwDoubleWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // srcAddr == dstAddr: nothing to copy
        b       fwBothAlign8            // continue at the 8-byte aligned stage
// Forward copy entry for arrays known to be quadword (16-byte) aligned.
// in: x0 - length in bytes, x1 - src addr, x2 - dst addr
//
// NOTE(review): fwBothAlign16 lies after the 16-byte length check of the
// quadword loop, so the first 16 bytes are copied unconditionally; presumably
// callers guarantee a length of at least 16 - confirm.
__fwQuadWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // srcAddr == dstAddr: nothing to copy
        b       fwBothAlign16           // continue inside the 16-byte copy loop
// Backward-copy entry points for callers that already know the alignment of the
// arrays (halfword, word, doubleword or quadword). Each entry performs only the
// two trivial-exit checks and then joins the generic backward copy path at the
// stage matching that alignment, skipping the alignment checks before it.
//
// in:    x0 - length in bytes
//        x1 - src addr + length
//        x2 - dst addr + length
// trash: x3, x4 (used by the shared copy path)
//
// NOTE(review): the bwBothAlignN stages can copy N bytes without re-checking
// x0, so these entries presumably require a length that is a non-zero multiple
// of N and end addresses that are both N-byte aligned - confirm against callers.
__bwHalfWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // same end address, hence srcAddr == dstAddr
        b       bwBothAlign2            // continue at the 2-byte aligned stage
// Backward copy entry for arrays known to be word (4-byte) aligned.
// in: x0 - length in bytes, x1 - src addr + length, x2 - dst addr + length
__bwWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // same end address, hence srcAddr == dstAddr
        b       bwBothAlign4            // continue at the 4-byte aligned stage
// Backward copy entry for arrays known to be doubleword (8-byte) aligned.
// in: x0 - length in bytes, x1 - src addr + length, x2 - dst addr + length
__bwDoubleWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // same end address, hence srcAddr == dstAddr
        b       bwBothAlign8            // continue at the 8-byte aligned stage
// Backward copy entry for arrays known to be quadword (16-byte) aligned.
// in: x0 - length in bytes, x1 - src addr + length, x2 - dst addr + length
//
// NOTE(review): bwBothAlign16 lies after the 16-byte length check of the
// quadword loop, so the first 16 bytes are copied unconditionally; presumably
// callers guarantee a length of at least 16 - confirm.
__bwQuadWordArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // same end address, hence srcAddr == dstAddr
        b       bwBothAlign16           // continue inside the 16-byte copy loop
// Generic entry point: chooses the copy direction (forward or backward) from
// the distance between the two addresses, then runs the matching routine.
//
// in:    x0 - length in bytes
//        x1 - src addr
//        x2 - dst addr
// trash: x3, x4
__arrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        subs    x3, x2, x1              // x3 = dstAddr - srcAddr, flags set from the result
        b.eq    finished                // srcAddr == dstAddr: nothing to copy
        // Unsigned comparison: when dstAddr is below srcAddr the difference
        // wraps to a large unsigned value, so the forward path is taken.
        cmp     x0, x3
        b.hi    adjustAddressForBackwardCopy    // length exceeds dstAddr - srcAddr: copy backward
        // Forward copy case: fall through into __forwardArrayCopy.
// This assembler function can be called during runtime,
// instead of emitting these instructions through functions.
// It copies x0 bytes from x1 to x2 in ascending address order. dstAddr is
// raised one alignment step at a time (2, 4, 8, 16 bytes); after each step the
// routine switches to the widest copy loop that srcAddr's alignment allows.
// x0 always holds the number of bytes still to be copied.
//
// in:    x0 - length in bytes
//        x1 - src addr
//        x2 - dst addr
// trash: x3, x4
__forwardArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // srcAddr == dstAddr: nothing to copy

        // Step 1: make dstAddr 2-byte aligned.
        tst     x2, #1
        b.eq    fwDstAlign2             // dstAddr is already 2-byte aligned
        ldrb    w3, [x1], #1
        sub     x0, x0, #1
        strb    w3, [x2], #1
fwDstAlign2:
        cmp     x0, #2
        b.lt    fwByteCopy              // fewer than 2 bytes remaining
        tst     x1, #1
        b.ne    fwByteCopyLoop          // srcAddr is not 2-byte aligned

        // Step 2: make dstAddr 4-byte aligned.
        // fwBothAlign2 is also the target of __fwHalfWordArrayCopy.
fwBothAlign2:
        tst     x2, #2
        b.eq    fwDstAlign4             // dstAddr is already 4-byte aligned
        ldrh    w3, [x1], #2
        sub     x0, x0, #2
        strh    w3, [x2], #2
fwDstAlign4:
        cmp     x0, #4
        b.lt    fwHalfWordCopy          // fewer than 4 bytes remaining
        tst     x1, #2
        b.ne    fwHalfWordCopyLoop      // srcAddr is not 4-byte aligned

        // Step 3: make dstAddr 8-byte aligned.
        // fwBothAlign4 is also the target of __fwWordArrayCopy.
fwBothAlign4:
        tst     x2, #4
        b.eq    fwDstAlign8             // dstAddr is already 8-byte aligned
        ldr     w3, [x1], #4
        sub     x0, x0, #4
        str     w3, [x2], #4
fwDstAlign8:
        cmp     x0, #8
        b.lt    fwWordCopy              // fewer than 8 bytes remaining
        tst     x1, #4
        b.ne    fwWordCopyLoop          // srcAddr is not 8-byte aligned

        // Step 4: make dstAddr 16-byte aligned.
        // fwBothAlign8 is also the target of __fwDoubleWordArrayCopy.
fwBothAlign8:
        tst     x2, #8
        b.eq    fwDstAlign16            // dstAddr is already 16-byte aligned
        ldr     x3, [x1], #8
        sub     x0, x0, #8
        str     x3, [x2], #8
fwDstAlign16:
        tst     x1, #8
        b.ne    fwDoubleWordCopyLoop    // srcAddr is not 16-byte aligned

        // Main loop, 16 bytes per iteration: both addresses are 16-byte aligned.
fwQuadWordCopyLoop:
        cmp     x0, #16
        b.lt    fwDoubleWordCopy        // fewer than 16 bytes remaining
        // fwBothAlign16 is also the target of __fwQuadWordArrayCopy, which
        // therefore enters below the length check above.
fwBothAlign16:
        ldp     x3, x4, [x1], #16
        sub     x0, x0, #16
        stp     x3, x4, [x2], #16
        b       fwQuadWordCopyLoop

        // Loop, 8 bytes per iteration: both addresses are 8-byte aligned.
fwDoubleWordCopyLoop:
        cmp     x0, #8
        b.lt    fwWordCopy              // fewer than 8 bytes remaining
        ldr     x3, [x1], #8
        sub     x0, x0, #8
        str     x3, [x2], #8
        b       fwDoubleWordCopyLoop

        // Loop, 4 bytes per iteration: both addresses are 4-byte aligned.
fwWordCopyLoop:
        cmp     x0, #4
        b.lt    fwHalfWordCopy          // fewer than 4 bytes remaining
        ldr     w3, [x1], #4
        sub     x0, x0, #4
        str     w3, [x2], #4
        b       fwWordCopyLoop

        // Loop, 2 bytes per iteration: both addresses are 2-byte aligned.
fwHalfWordCopyLoop:
        cmp     x0, #2
        b.lt    fwByteCopy              // fewer than 2 bytes remaining
        ldrh    w3, [x1], #2
        sub     x0, x0, #2
        strh    w3, [x2], #2
        b       fwHalfWordCopyLoop

        // Loop, 1 byte per iteration: used when srcAddr is odd.
fwByteCopyLoop:
        cbz     x0, finished            // all bytes copied
        ldrb    w3, [x1], #1
        sub     x0, x0, #1
        strb    w3, [x2], #1
        b       fwByteCopyLoop

        // Tail: at most one 8-, 4-, 2- and 1-byte copy, in that order. Each
        // stage falls through to the next smaller one.
fwDoubleWordCopy:
        cmp     x0, #8
        b.lt    fwWordCopy              // fewer than 8 bytes remaining
        ldr     x3, [x1], #8
        sub     x0, x0, #8
        str     x3, [x2], #8
fwWordCopy:
        cmp     x0, #4
        b.lt    fwHalfWordCopy          // fewer than 4 bytes remaining
        ldr     w3, [x1], #4
        sub     x0, x0, #4
        str     w3, [x2], #4
fwHalfWordCopy:
        cmp     x0, #2
        b.lt    fwByteCopy              // fewer than 2 bytes remaining
        ldrh    w3, [x1], #2
        sub     x0, x0, #2
        strh    w3, [x2], #2
fwByteCopy:
        cbz     x0, finished            // no byte left over
        ldrb    w3, [x1], #1
        sub     x0, x0, #1
        strb    w3, [x2], #1

        // Common return point, shared with the other entry points in this file.
finished:
        ret
// Reached from __arrayCopy when a backward copy is required: move both
// pointers to the end of their regions, the form __backwardArrayCopy takes.
adjustAddressForBackwardCopy:
        add     x1, x1, x0              // x1 = srcAddr + length
        add     x2, x2, x0              // x2 = dstAddr + length
        // Fall through into __backwardArrayCopy.
// This assembler function can be called during runtime,
// instead of emitting these instructions through functions.
// It copies x0 bytes in descending address order: x1 and x2 point just past
// the end of the source and destination, and every access pre-decrements them.
// The end of dst is lowered one alignment step at a time (2, 4, 8, 16 bytes);
// after each step the routine switches to the widest copy loop that the
// alignment of x1 allows. x0 always holds the number of bytes still to copy.
//
// in:    x0 - length in bytes
//        x1 - src addr + length
//        x2 - dst addr + length
// trash: x3, x4
__backwardArrayCopy:
        cbz     x0, finished            // length is zero: nothing to copy
        cmp     x1, x2
        b.eq    finished                // same end address, hence srcAddr == dstAddr

        // Step 1: make the dst pointer 2-byte aligned.
        tst     x2, #1
        b.eq    bwDstAlign2             // dst pointer is already 2-byte aligned
        ldrb    w3, [x1, #-1]!
        sub     x0, x0, #1
        strb    w3, [x2, #-1]!
bwDstAlign2:
        cmp     x0, #2
        b.lt    bwByteCopy              // fewer than 2 bytes remaining
        tst     x1, #1
        b.ne    bwByteCopyLoop          // src pointer is not 2-byte aligned

        // Step 2: make the dst pointer 4-byte aligned.
        // bwBothAlign2 is also the target of __bwHalfWordArrayCopy.
bwBothAlign2:
        tst     x2, #2
        b.eq    bwDstAlign4             // dst pointer is already 4-byte aligned
        ldrh    w3, [x1, #-2]!
        sub     x0, x0, #2
        strh    w3, [x2, #-2]!
bwDstAlign4:
        cmp     x0, #4
        b.lt    bwHalfWordCopy          // fewer than 4 bytes remaining
        tst     x1, #2
        b.ne    bwHalfWordCopyLoop      // src pointer is not 4-byte aligned

        // Step 3: make the dst pointer 8-byte aligned.
        // bwBothAlign4 is also the target of __bwWordArrayCopy.
bwBothAlign4:
        tst     x2, #4
        b.eq    bwDstAlign8             // dst pointer is already 8-byte aligned
        ldr     w3, [x1, #-4]!
        sub     x0, x0, #4
        str     w3, [x2, #-4]!
bwDstAlign8:
        cmp     x0, #8
        b.lt    bwWordCopy              // fewer than 8 bytes remaining
        tst     x1, #4
        b.ne    bwWordCopyLoop          // src pointer is not 8-byte aligned

        // Step 4: make the dst pointer 16-byte aligned.
        // bwBothAlign8 is also the target of __bwDoubleWordArrayCopy.
bwBothAlign8:
        tst     x2, #8
        b.eq    bwDstAlign16            // dst pointer is already 16-byte aligned
        ldr     x3, [x1, #-8]!
        sub     x0, x0, #8
        str     x3, [x2, #-8]!
bwDstAlign16:
        tst     x1, #8
        b.ne    bwDoubleWordCopyLoop    // src pointer is not 16-byte aligned

        // Main loop, 16 bytes per iteration: both pointers are 16-byte aligned.
bwQuadWordCopyLoop:
        cmp     x0, #16
        b.lt    bwDoubleWordCopy        // fewer than 16 bytes remaining
        // bwBothAlign16 is also the target of __bwQuadWordArrayCopy, which
        // therefore enters below the length check above.
bwBothAlign16:
        ldp     x3, x4, [x1, #-16]!
        sub     x0, x0, #16
        stp     x3, x4, [x2, #-16]!
        b       bwQuadWordCopyLoop

        // Loop, 8 bytes per iteration: both pointers are 8-byte aligned.
bwDoubleWordCopyLoop:
        cmp     x0, #8
        b.lt    bwWordCopy              // fewer than 8 bytes remaining
        ldr     x3, [x1, #-8]!
        sub     x0, x0, #8
        str     x3, [x2, #-8]!
        b       bwDoubleWordCopyLoop

        // Loop, 4 bytes per iteration: both pointers are 4-byte aligned.
bwWordCopyLoop:
        cmp     x0, #4
        b.lt    bwHalfWordCopy          // fewer than 4 bytes remaining
        ldr     w3, [x1, #-4]!
        sub     x0, x0, #4
        str     w3, [x2, #-4]!
        b       bwWordCopyLoop

        // Loop, 2 bytes per iteration: both pointers are 2-byte aligned.
bwHalfWordCopyLoop:
        cmp     x0, #2
        b.lt    bwByteCopy              // fewer than 2 bytes remaining
        ldrh    w3, [x1, #-2]!
        sub     x0, x0, #2
        strh    w3, [x2, #-2]!
        b       bwHalfWordCopyLoop

        // Loop, 1 byte per iteration: used when the src pointer is odd.
bwByteCopyLoop:
        cbz     x0, finished            // all bytes copied
        ldrb    w3, [x1, #-1]!
        sub     x0, x0, #1
        strb    w3, [x2, #-1]!
        b       bwByteCopyLoop

        // Tail: at most one 8-, 4-, 2- and 1-byte copy, in that order. Each
        // stage falls through to the next smaller one.
bwDoubleWordCopy:
        cmp     x0, #8
        b.lt    bwWordCopy              // fewer than 8 bytes remaining
        ldr     x3, [x1, #-8]!
        sub     x0, x0, #8
        str     x3, [x2, #-8]!
bwWordCopy:
        cmp     x0, #4
        b.lt    bwHalfWordCopy          // fewer than 4 bytes remaining
        ldr     w3, [x1, #-4]!
        sub     x0, x0, #4
        str     w3, [x2, #-4]!
bwHalfWordCopy:
        cmp     x0, #2
        b.lt    bwByteCopy              // fewer than 2 bytes remaining
        ldrh    w3, [x1, #-2]!
        sub     x0, x0, #2
        strh    w3, [x2, #-2]!
bwByteCopy:
        cbz     x0, finished            // no byte left over
        ldrb    w3, [x1, #-1]!
        sub     x0, x0, #1
        strb    w3, [x2, #-1]!
        ret