diff --git a/src/gfxlib2/x86/gfx_mmx.s b/src/gfxlib2/x86/gfx_mmx.s
index a2e1a5083..6f7c217a3 100644
--- a/src/gfxlib2/x86/gfx_mmx.s
+++ b/src/gfxlib2/x86/gfx_mmx.s
@@ -29,35 +29,39 @@ FUNC(fb_hMemCpyMMX)
 	movl	%esp, %ebp
 	pushl	%esi
 	pushl	%edi
+
+	/* For best results, source and destination should be aligned to a
+	** 4 byte memory address and the length should be a multiple of 4 bytes
+	*/
 
-	movl	ARG1, %edi
-	movl	ARG2, %esi
-	movl	ARG3, %ecx
-	shrl	$1, %ecx
-	jnc	memcpy_skip_1
-	movsb
+	movl	ARG1, %edi		/* edi = dst */
+	movl	ARG2, %esi		/* esi = src */
+	movl	ARG3, %ecx		/* ecx = len */
+	shrl	$1, %ecx		/* ecx = len \ 2 */
+	jnc	memcpy_skip_1		/* if len mod 2 = 0, no 1 byte copy */
+	movsb				/* copy 1 byte */
 LABEL(memcpy_skip_1)
-	shrl	$1, %ecx
-	jnc	memcpy_skip_2
-	movsw
+	shrl	$1, %ecx		/* ecx = len \ 4 */
+	jnc	memcpy_skip_2		/* if (len \ 2) mod 2 = 0, no 2 byte copy */
+	movsw				/* copy 2 bytes */
 LABEL(memcpy_skip_2)
-	shrl	$1, %ecx
-	jnc	memcpy_skip_4
-	movsl
+	shrl	$1, %ecx		/* ecx = len \ 8 */
+	jnc	memcpy_skip_4		/* if (len \ 4) mod 2 = 0, no 4 byte copy */
+	movsl				/* copy 4 bytes */
 LABEL(memcpy_skip_4)
 	orl	%ecx, %ecx
-	jz	memcpy_end
+	jz	memcpy_end		/* if nothing left to copy, skip */
 LABEL(memcpy_loop)
-	addl	$8, %edi
-	movq	(%esi), %mm0
-	addl	$8, %esi
-	movq	%mm0, -8(%edi)
-	decl	%ecx
-	jnz	memcpy_loop
+	addl	$8, %edi		/* dst += 8 */
+	movq	(%esi), %mm0		/* mm0 = *src (8 bytes) */
+	addl	$8, %esi		/* src += 8 */
+	movq	%mm0, -8(%edi)		/* *(dst - 8) = mm0 */
+	decl	%ecx			/* qwords left -= 1 */
+	jnz	memcpy_loop		/* loop until all copied */
 LABEL(memcpy_end)
 	emms
@@ -74,37 +78,37 @@ FUNC(fb_hMemSetMMX)
 	pushl	%edi
 
 	movl	ARG1, %edi		/* edi = dst */
-	movl	ARG2, %eax		/* esi = sc */
+	movl	ARG2, %eax		/* eax = fill byte (Aa) */
 	movl	ARG3, %ecx		/* ecx = len */
-	movb	%al, %ah
-	movw	%ax, %dx
-	shll	$16, %eax
-	movw	%dx, %ax
-	shrl	$1, %ecx
-	jnc	memset_skip_1
-	stosb
+	movb	%al, %ah		/* eax = | ?? ?? Aa Aa | */
+	movw	%ax, %dx		/* edx = | ?? ?? Aa Aa | */
+	shll	$16, %eax		/* eax = | Aa Aa ?? ?? | */
+	movw	%dx, %ax		/* eax = | Aa Aa Aa Aa | */
+	shrl	$1, %ecx		/* ecx = len \ 2 */
+	jnc	memset_skip_1		/* if len mod 2 = 0, no 1 byte store */
+	stosb				/* store 1 byte */
 LABEL(memset_skip_1)
-	shrl	$1, %ecx
-	jnc	memset_skip_2
-	stosw
+	shrl	$1, %ecx		/* ecx = len \ 4 */
+	jnc	memset_skip_2		/* if (len \ 2) mod 2 = 0, no 2 byte store */
+	stosw				/* store 2 bytes */
 LABEL(memset_skip_2)
-	shrl	$1, %ecx
-	jnc	memset_skip_4
-	stosl
+	shrl	$1, %ecx		/* ecx = len \ 8 */
+	jnc	memset_skip_4		/* if (len \ 4) mod 2 = 0, no 4 byte store */
+	stosl				/* store 4 bytes */
 LABEL(memset_skip_4)
 	orl	%ecx, %ecx
-	jz	memset_end
-	movd	%eax, %mm0
-	punpckldq	%mm0, %mm0
+	jz	memset_end		/* if nothing left to fill, skip */
+	movd	%eax, %mm0		/* mm0 = ____ | ____ | AaAa | AaAa */
+	punpckldq	%mm0, %mm0	/* mm0 = AaAa | AaAa | AaAa | AaAa */
 LABEL(memset_loop)
-	movq	%mm0, (%edi)
-	addl	$8, %edi
-	decl	%ecx
-	jnz	memset_loop
+	movq	%mm0, (%edi)		/* *dst = mm0 */
+	addl	$8, %edi		/* dst += 8 */
+	decl	%ecx			/* qwords left -= 1 */
+	jnz	memset_loop		/* loop until all filled */
 LABEL(memset_end)
 	emms
@@ -120,7 +124,7 @@ FUNC(fb_hPixelSet2MMX)
 	pushl	%edi
 
 	movl	ARG1, %edi		/* edi = dst */
-	movl	ARG2, %eax		/* esi = sc */
+	movl	ARG2, %eax		/* eax = fill pixel (2 bytes) */
 	movl	ARG3, %ecx		/* ecx = len */
 	movw	%ax, %dx
 	shll	$16, %eax
@@ -167,7 +171,7 @@ FUNC(fb_hPixelSet4MMX)
 	pushl	%edi
 
 	movl	ARG1, %edi		/* edi = dst */
-	movl	ARG2, %eax		/* esi = sc */
+	movl	ARG2, %eax		/* eax = fill pixel (4 bytes) */
 	movl	ARG3, %ecx		/* ecx = len */
 	shrl	$1, %ecx
 	jnc	pixelset4_skip_1
@@ -217,7 +221,7 @@ FUNC(fb_hPixelSetAlpha4MMX)
 	pushl	%edi
 	pushl	%ebx
 
-	movl	ARG2, %esi		/* esi = sc */
+	movl	ARG2, %esi		/* esi = fill color (4 bytes) */
 	movl	ARG1, %edi		/* edi = dst */
 	movd	%esi, %mm6		/* mm6 = esi */
 	movl	ARG3, %ecx		/* ecx = len */
diff --git a/src/gfxlib2/x86/gfx_put_alpha_mmx.s b/src/gfxlib2/x86/gfx_put_alpha_mmx.s
index 8af3c1b2f..540be604b 100644
--- a/src/gfxlib2/x86/gfx_put_alpha_mmx.s
+++ b/src/gfxlib2/x86/gfx_put_alpha_mmx.s
@@ -14,92 +14,113 @@ FUNC(fb_hPutAlpha4MMX)
 	pushl	%esi
 	pushl	%edi
 	pushl	%ebx
+
+	/* ARG1: unsigned char *src */
+	/* ARG2: unsigned char *dest */
+	/* ARG3: int w */
+	/* ARG4: int h */
+	/* ARG5: int src_pitch */
+	/* ARG6: int dest_pitch */
+	/* ARG7: int alpha */
+	/* ARG8: BLENDER *blender */
+	/* ARG9: void *param */
 
-	movl	ARG3, %ebx
-	shll	$2, %ebx
-	movl	ARG4, %edx
-	subl	%ebx, ARG5
-	movl	%edx, LOCAL1
-	movl	ARG1, %esi
-	movl	ARG6, %edx
-	movl	ARG2, %edi
-	subl	%ebx, %edx
-	movl	%edx, LOCAL2
+	movl	ARG3, %ebx		/* ebx = w */
+	shll	$2, %ebx		/* ebx = w * 4 */
+	movl	ARG4, %edx		/* edx = h */
+	subl	%ebx, ARG5		/* src_pitch = src_pitch - w * 4 */
+	movl	%edx, LOCAL1		/* LOCAL1 = h */
+	movl	ARG1, %esi		/* esi = src */
+	movl	ARG6, %edx		/* edx = dest_pitch */
+	movl	ARG2, %edi		/* edi = dst */
+	subl	%ebx, %edx		/* edx = dest_pitch - w * 4 */
+	movl	%edx, LOCAL2		/* LOCAL2 = dest_pitch - w * 4 */
 	movq	GLOBL(__fb_gfx_rb_32), %mm5
 LABEL(alpha4_y_loop)
-	movl	ARG3, %ecx
-	shrl	$1, %ecx
-	jnc	alpha4_skip_1
-	addl	$4, %edi
-	lodsl
-	movl	%eax, LOCAL3
-	movl	-4(%edi), %ebx
-	movl	%eax, %ecx
-	movl	%ebx, %edx
-	andl	$MASK_RB_32, %eax
-	andl	$MASK_RB_32, %edx
-	shrl	$24, LOCAL3
-	subl	%edx, %eax
-	imull	LOCAL3
-	xchg	%eax, %ecx
-	movl	%ebx, %edx
-	andl	$MASK_GA_32, %eax
-	andl	$MASK_GA_32, %edx
-	subl	%edx, %eax
-	shrl	$8, %eax
-	imull	LOCAL3
-	shrl	$8, %ecx
-	movl	%ebx, %edx
-	andl	$MASK_RB_32, %ebx
-	andl	$MASK_GA_32, %edx
-	addl	%ecx, %ebx
-	addl	%edx, %eax
-	andl	$MASK_RB_32, %ebx
-	andl	$MASK_GA_32, %eax
-	orl	%ebx, %eax
-	movl	%eax, -4(%edi)
+	movl	ARG3, %ecx		/* ecx = w */
+	shrl	$1, %ecx		/* ecx = w \ 2 */
+	jnc	alpha4_skip_1		/* if w mod 2 = 0, no single leading pixel */
+	addl	$4, %edi		/* dst += 4 */
+	lodsl				/* eax = *src */
+	movl	%eax, LOCAL3		/* LOCAL3 = *src */
+	movl	-4(%edi), %ebx		/* ebx = *dst */
+	movl	%eax, %ecx		/* ecx = *src */
+	movl	%ebx, %edx		/* edx = *dst */
+	andl	$MASK_RB_32, %eax	/* eax = __sr__sb (srb) */
+	andl	$MASK_RB_32, %edx	/* edx = __dr__db (drb) */
+	shrl	$24, LOCAL3		/* LOCAL3 = ______aa (*src alpha) */
+	subl	%edx, %eax		/* eax = __sr__sb - __dr__db */
+	imull	LOCAL3			/* eax = (srb - drb) * a */
+	xchg	%eax, %ecx		/* ecx = (srb - drb) * a, eax = *src */
+	movl	%ebx, %edx		/* edx = *dst */
+	andl	$MASK_GA_32, %eax	/* eax = sa__sg__ (sga) */
+	andl	$MASK_GA_32, %edx	/* edx = da__dg__ (dga) */
+	subl	%edx, %eax		/* eax = sa__sg__ - da__dg__ */
+	shrl	$8, %eax		/* eax = (sga - dga) >> 8 */
+	imull	LOCAL3			/* eax = ((sga - dga) >> 8) * a */
+	shrl	$8, %ecx		/* ecx = ((srb - drb) * a) >> 8 */
+	movl	%ebx, %edx		/* edx = *dst */
+	andl	$MASK_RB_32, %ebx	/* ebx = __dr__db */
+	andl	$MASK_GA_32, %edx	/* edx = da__dg__ */
+	addl	%ecx, %ebx		/* ebx += ((srb - drb) * a) >> 8 */
+	addl	%edx, %eax		/* eax += da__dg__ */
+	andl	$MASK_RB_32, %ebx	/* ebx = __rr__bb */
+	andl	$MASK_GA_32, %eax	/* eax = aa__gg__ */
+	orl	%ebx, %eax		/* eax = aarrggbb */
+	movl	%eax, -4(%edi)		/* *(dst - 4) = aarrggbb */
 LABEL(alpha4_skip_1)
-	movl	ARG3, %ecx
-	shrl	$1, %ecx
-	jz	alpha4_next_line
+	movl	ARG3, %ecx		/* ecx = w */
+	shrl	$1, %ecx		/* ecx = w \ 2 */
+	jz	alpha4_next_line	/* if w \ 2 = 0 next line */
+
+
+	movq	%mm6, %mm0		/* mm0 = ssss ssss | ssss ssss */
+	movq	(%edi), %mm1		/* mm1 = dddd dddd | dddd dddd */
+	movq	%mm0, %mm2		/* mm2 = ssss ssss | ssss ssss */
+	movq	%mm0, %mm3		/* mm3 = ssss ssss | ssss ssss */
+	movq	%mm1, %mm4		/* mm4 = dddd dddd | dddd dddd */
+	psrld	$24, %mm2		/* mm2 = ____ | __aa | ____ | __aa */
+	psrlw	$8, %mm3		/* mm3 = __sa | __sg | __sa | __sg */
+	psrlw	$8, %mm4		/* mm4 = __da | __dg | __da | __dg */
+	packssdw	%mm2, %mm2	/* mm2 = __aa | __aa | __aa | __aa */
 LABEL(alpha4_x_loop)
-	movq	(%esi), %mm0
-	movq	(%edi), %mm1
-	movq	%mm0, %mm2
-	movq	%mm0, %mm3
-	movq	%mm1, %mm4
-	psrld	$24, %mm2
-	psrlw	$8, %mm3
-	psrlw	$8, %mm4
-	packssdw	%mm2, %mm2
-	pand	%mm5, %mm0
-	pand	%mm5, %mm1
-	punpcklwd	%mm2, %mm2
-	psubw	%mm1, %mm0
-	psubw	%mm4, %mm3
-	pmullw	%mm2, %mm0
-	pmullw	%mm2, %mm3
-	psraw	$8, %mm0
-	psraw	$8, %mm3
-	paddw	%mm1, %mm0
-	paddw	%mm4, %mm3
-	pand	%mm5, %mm0
-	psllw	$8, %mm3
-	addl	$8, %edi
-	por	%mm3, %mm0
-	addl	$8, %esi
-	movq	%mm0, -8(%edi)
-	decl	%ecx
-	jnz	alpha4_x_loop
+	movq	(%esi), %mm0		/* mm0 = ssss ssss | ssss ssss */
+	movq	(%edi), %mm1		/* mm1 = dddd dddd | dddd dddd */
+	movq	%mm0, %mm2		/* mm2 = ssss ssss | ssss ssss */
+	movq	%mm0, %mm3		/* mm3 = ssss ssss | ssss ssss */
+	movq	%mm1, %mm4		/* mm4 = dddd dddd | dddd dddd */
+	psrld	$24, %mm2		/* mm2 = ____ | __sa | ____ | __sa */
+	psrlw	$8, %mm3		/* mm3 = __sa | __sg | __sa | __sg */
+	psrlw	$8, %mm4		/* mm4 = __da | __dg | __da | __dg */
+	packssdw	%mm2, %mm2	/* mm2 = __sa | __sa | __sa | __sa */
+	pand	%mm5, %mm0		/* mm0 = __sr | __sb | __sr | __sb */
+	pand	%mm5, %mm1		/* mm1 = __dr | __db | __dr | __db */
+	punpcklwd	%mm2, %mm2	/* mm2 = __sa | __sa | __sa | __sa */
+	psubw	%mm1, %mm0		/* mm0 = sr-dr| sb-db| sr-dr| sb-db */
+	psubw	%mm4, %mm3		/* mm3 = sa-da| sg-dg| sa-da| sg-dg */
+	pmullw	%mm2, %mm0		/* mm0 = rr** | bb** | rr** | bb** */
+	pmullw	%mm2, %mm3		/* mm3 = aa** | gg** | aa** | gg** */
+	psraw	$8, %mm0		/* mm0 = __rr | __bb | __rr | __bb */
+	psraw	$8, %mm3		/* mm3 = __aa | __gg | __aa | __gg */
+	paddw	%mm1, %mm0		/* mm0 = rr++ | bb++ | rr++ | bb++ */
+	paddw	%mm4, %mm3		/* mm3 = aa++ | gg++ | aa++ | gg++ */
+	pand	%mm5, %mm0		/* mm0 = __rr | __bb | __rr | __bb */
+	psllw	$8, %mm3		/* mm3 = aa__ | gg__ | aa__ | gg__ */
+	addl	$8, %edi		/* dst += 8 */
+	por	%mm3, %mm0		/* mm0 = aarr ggbb | aarr ggbb */
+	addl	$8, %esi		/* src += 8 */
+	movq	%mm0, -8(%edi)		/* *(dst - 8) = two ARGB pixels */
+	decl	%ecx			/* pixel pairs -= 1 */
+	jnz	alpha4_x_loop		/* if pairs != 0 goto alpha4_x_loop */
 LABEL(alpha4_next_line)
-	addl	ARG5, %esi
-	addl	LOCAL2, %edi
-	decl	LOCAL1
-	jnz	alpha4_y_loop
+	addl	ARG5, %esi		/* src += src_pitch - w * 4 */
+	addl	LOCAL2, %edi		/* dst += dest_pitch - w * 4 */
+	decl	LOCAL1			/* h -= 1 */
+	jnz	alpha4_y_loop		/* if h != 0 goto alpha4_y_loop */
 	emms
 	popl	%ebx
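Note (not part of the patch): the blend that both the scalar head and the MMX inner loop of fb_hPutAlpha4MMX perform is, per 8-bit channel, dst = dst + (((src - dst) * src_alpha) >> 8), with the MMX path doing two pixels at once in 16-bit lanes split by the red/blue and alpha/green masks. Below is a minimal C sketch of that per-pixel formula as a cross-check for the comments above; the function name is made up for illustration, and the >> 8 on a possibly negative difference assumes an arithmetic right shift, mirroring psraw.

#include <stdint.h>

/* Hypothetical scalar reference for one AARRGGBB pixel: each channel of dst
** moves toward the corresponding channel of src by (src - dst) * alpha / 256,
** where alpha is the source pixel's top byte. */
static uint32_t blend_pixel_sketch(uint32_t src, uint32_t dst)
{
	int a = (int)(src >> 24);                      /* source alpha, 0..255 */
	uint32_t out = 0;

	for (int shift = 0; shift < 32; shift += 8) {
		int s = (int)((src >> shift) & 0xFFu);     /* source channel */
		int d = (int)((dst >> shift) & 0xFFu);     /* destination channel */
		int c = d + (((s - d) * a) >> 8);          /* scale signed difference by alpha */
		out |= ((uint32_t)c & 0xFFu) << shift;     /* result stays within 0..255 */
	}
	return out;
}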