@@ -155,34 +155,27 @@ cpout: retl ! get outta here
155155 .text ; \
156156 .align 4
157157
158- #define EXT(start,end) \
159- .section __ex_table,ALLOC; \
160- .align 4 ; \
161- .word start, 0 , end, cc_fault; \
162- .text ; \
163- .align 4
164-
165158 /* This aligned version typically executes in 8.5 superscalar cycles; this
166159 * is the best I can do. I say 8.5 because the final add will pair with
167160 * the next ldd in the main unrolled loop. Thus the pipe is always full.
168161 * If you change these macros (including order of instructions),
169162 * please check the fixup code below as well.
170163 */
/* CSUMCOPY_BIGCHUNK_ALIGNED: copy the 32 bytes at [src + off] to [dst + off]
 * and add all eight words into sum.
 *   src, dst - base address registers; off - constant byte offset
 *   sum      - running checksum register, updated in place
 *   t0..t7   - scratch registers; (t0,t1) (t2,t3) (t4,t5) (t6,t7) serve as
 *              the register pairs of the doubleword ldd/std accesses
 * Every add is an addxcc, so the carry flag on entry is consumed and a carry
 * may still be pending on exit; the caller folds it in afterwards.
 * Loads, stores and adds are deliberately interleaved - keep the order.
 * In the updated lines each memory access is wrapped in EX(), which is
 * defined outside this excerpt - presumably it gives that access its own
 * exception-table entry; confirm against the EX definition.
 */
171164#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
172- ldd [src + off + 0x00 ], t0; \
173- ldd [src + off + 0x08 ], t2; \
165+ EX( ldd [src + off + 0x00 ], t0); \
166+ EX( ldd [src + off + 0x08 ], t2); \
174167 addxcc t0, sum, sum; \
175- ldd [src + off + 0x10 ], t4; \
168+ EX( ldd [src + off + 0x10 ], t4); \
176169 addxcc t1, sum, sum; \
177- ldd [src + off + 0x18 ], t6; \
170+ EX( ldd [src + off + 0x18 ], t6); \
178171 addxcc t2, sum, sum; \
179- std t0, [dst + off + 0x00 ]; \
172+ EX( std t0, [dst + off + 0x00 ]); \
180173 addxcc t3, sum, sum; \
181- std t2, [dst + off + 0x08 ]; \
174+ EX( std t2, [dst + off + 0x08 ]); \
182175 addxcc t4, sum, sum; \
183- std t4, [dst + off + 0x10 ]; \
176+ EX( std t4, [dst + off + 0x10 ]); \
184177 addxcc t5, sum, sum; \
185- std t6, [dst + off + 0x18 ]; \
178+ EX( std t6, [dst + off + 0x18 ]); \
186179 addxcc t6, sum, sum; \
187180 addxcc t7, sum, sum;
188181
@@ -191,39 +184,39 @@ cpout: retl ! get outta here
191184 * Viking MXCC into streaming mode. Ho hum...
192185 */
/* CSUMCOPY_BIGCHUNK: copy the 32 bytes at [src + off] to [dst + off] and add
 * all eight words into sum.
 *   src, dst - base address registers; off - constant byte offset
 *   sum      - running checksum register, updated in place
 *   t0..t7   - scratch registers; filled pairwise by the four ldd loads
 * Unlike the _ALIGNED variant, the data is written back with eight
 * word-sized st instructions instead of four std - presumably because dst
 * is not known to be doubleword aligned here; TODO confirm.
 * Every add is an addxcc, so the carry flag on entry is consumed and a carry
 * may still be pending on exit; the caller folds it in afterwards.
 * In the updated lines each memory access is wrapped in EX(), which is
 * defined outside this excerpt - presumably it gives that access its own
 * exception-table entry; confirm against the EX definition.
 */
193186#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
194- ldd [src + off + 0x00 ], t0; \
195- ldd [src + off + 0x08 ], t2; \
196- ldd [src + off + 0x10 ], t4; \
197- ldd [src + off + 0x18 ], t6; \
198- st t0, [dst + off + 0x00 ]; \
187+ EX( ldd [src + off + 0x00 ], t0); \
188+ EX( ldd [src + off + 0x08 ], t2); \
189+ EX( ldd [src + off + 0x10 ], t4); \
190+ EX( ldd [src + off + 0x18 ], t6); \
191+ EX( st t0, [dst + off + 0x00 ]); \
199192 addxcc t0, sum, sum; \
200- st t1, [dst + off + 0x04 ]; \
193+ EX( st t1, [dst + off + 0x04 ]); \
201194 addxcc t1, sum, sum; \
202- st t2, [dst + off + 0x08 ]; \
195+ EX( st t2, [dst + off + 0x08 ]); \
203196 addxcc t2, sum, sum; \
204- st t3, [dst + off + 0x0c ]; \
197+ EX( st t3, [dst + off + 0x0c ]); \
205198 addxcc t3, sum, sum; \
206- st t4, [dst + off + 0x10 ]; \
199+ EX( st t4, [dst + off + 0x10 ]); \
207200 addxcc t4, sum, sum; \
208- st t5, [dst + off + 0x14 ]; \
201+ EX( st t5, [dst + off + 0x14 ]); \
209202 addxcc t5, sum, sum; \
210- st t6, [dst + off + 0x18 ]; \
203+ EX( st t6, [dst + off + 0x18 ]); \
211204 addxcc t6, sum, sum; \
212- st t7, [dst + off + 0x1c ]; \
205+ EX( st t7, [dst + off + 0x1c ]); \
213206 addxcc t7, sum, sum;
214207
215208 /* Yuck, 6 superscalar cycles... */
/* CSUMCOPY_LASTCHUNK: copy 16 bytes and add their four words into sum.
 * Addresses count backwards from the base registers: the chunk spans
 * [src - off - 0x08] .. [src - off + 0x07], and likewise for dst.
 *   src, dst - base address registers; off - constant byte offset
 *   sum      - running checksum register, updated in place
 *   t0..t3   - scratch registers; (t0,t1) and (t2,t3) are filled by the two
 *              ldd loads and written back with four word-sized st
 * Every add is an addxcc, so the carry flag on entry is consumed and a carry
 * may still be pending on exit; the caller folds it in afterwards.
 * In the updated lines each memory access is wrapped in EX(), which is
 * defined outside this excerpt - presumably it gives that access its own
 * exception-table entry; confirm against the EX definition.
 */
216209#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \
217- ldd [src - off - 0x08 ], t0; \
218- ldd [src - off - 0x00 ], t2; \
210+ EX( ldd [src - off - 0x08 ], t0); \
211+ EX( ldd [src - off - 0x00 ], t2); \
219212 addxcc t0, sum, sum; \
220- st t0, [dst - off - 0x08 ]; \
213+ EX( st t0, [dst - off - 0x08 ]); \
221214 addxcc t1, sum, sum; \
222- st t1, [dst - off - 0x04 ]; \
215+ EX( st t1, [dst - off - 0x04 ]); \
223216 addxcc t2, sum, sum; \
224- st t2, [dst - off - 0x00 ]; \
217+ EX( st t2, [dst - off - 0x00 ]); \
225218 addxcc t3, sum, sum; \
226- st t3, [dst - off + 0x04 ];
219+ EX( st t3, [dst - off + 0x04 ]) ;
227220
228221 /* Handle the end cruft code out of band for better cache patterns. */
229222cc_end_cruft:
@@ -331,7 +324,6 @@ __csum_partial_copy_sparc_generic:
331324 CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20 ,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
332325 CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40 ,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
333326 CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60 ,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
334- 10: EXT(5b, 10b) ! note for exception handling
335327 sub %g1, 128 , %g1 ! detract from length
336328 addx %g0, %g7, %g7 ! add in last carry bit
337329 andcc %g1, 0xffffff80 , %g0 ! more to csum?
@@ -356,8 +348,7 @@ cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
356348 CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28 ,%g2,%g3,%g4,%g5)
357349 CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18 ,%g2,%g3,%g4,%g5)
358350 CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08 ,%g2,%g3,%g4,%g5)
359- 12: EXT(cctbl, 12b) ! note for exception table handling
360- addx %g0, %g7, %g7
351+ 12: addx %g0, %g7, %g7
361352 andcc %o3, 0xf , %g0 ! check for low bits set
362353ccte: bne cc_end_cruft ! something left, handle it out of band
363354 andcc %o3, 8 , %g0 ! begin checks for that code
@@ -367,7 +358,6 @@ ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o
367358 CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20 ,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
368359 CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40 ,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
369360 CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60 ,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
370- 11: EXT(ccdbl, 11b) ! note for exception table handling
371361 sub %g1, 128 , %g1 ! detract from length
372362 addx %g0, %g7, %g7 ! add in last carry bit
373363 andcc %g1, 0xffffff80 , %g0 ! more to csum?
0 commit comments