Skip to content

Commit

Permalink
gfxlib2: add comments to x86/.s files
Browse files Browse the repository at this point in the history
- add comments only (no code changes)
- src/gfxlib2/x86/gfx_mmx.s:
- src/gfxlib2/x86/gfx_put_alpha_mmx.s
  • Loading branch information
jayrm committed Dec 22, 2023
1 parent dace83c commit 79b5fbd
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 119 deletions.
90 changes: 47 additions & 43 deletions src/gfxlib2/x86/gfx_mmx.s
Expand Up @@ -29,35 +29,39 @@ FUNC(fb_hMemCpyMMX)
movl %esp, %ebp
pushl %esi
pushl %edi

/* For best results, source and destination should be aligned to
** 4 byte memory address and length should be a multiple of 4 bytes
*/

movl ARG1, %edi
movl ARG2, %esi
movl ARG3, %ecx
shrl $1, %ecx
jnc memcpy_skip_1
movsb
movl ARG1, %edi /* edi = dst */
movl ARG2, %esi /* esi = src */
movl ARG3, %ecx /* ecx = len */
shrl $1, %ecx /* ecx = len \ 2 */
jnc memcpy_skip_1 /* if len mod 2 = 0, no 1 byte copy */
movsb /* copy 1 byte */

LABEL(memcpy_skip_1)
shrl $1, %ecx
jnc memcpy_skip_2
movsw
shrl $1, %ecx /* ecx = len \ 4 */
jnc memcpy_skip_2 /* if (len \ 2) mod 2 = 0, no 2 byte copy */
movsw /* copy 2 bytes */

LABEL(memcpy_skip_2)
shrl $1, %ecx
jnc memcpy_skip_4
movsl
shrl $1, %ecx /* ecx = len \ 8 */
jnc memcpy_skip_4 /* if (len \ 4) mod 2 = 0, no 4 byte copy */
movsl /* copy 4 bytes */

LABEL(memcpy_skip_4)
orl %ecx, %ecx
jz memcpy_end
jz memcpy_end /* if nothing left to copy, skip */

LABEL(memcpy_loop)
addl $8, %edi
movq (%esi), %mm0
addl $8, %esi
movq %mm0, -8(%edi)
decl %ecx
jnz memcpy_loop
addl $8, %edi /* dst ++ */
movq (%esi), %mm0 /* mm0 = *src (8 bytes) */
addl $8, %esi /* src ++ */
movq %mm0, -8(%edi) /* *dst = mm0 */
decl %ecx /* len -= 1 */
jnz memcpy_loop /* loop until all copied */

LABEL(memcpy_end)
emms
Expand All @@ -74,37 +78,37 @@ FUNC(fb_hMemSetMMX)
pushl %edi

movl ARG1, %edi /* edi = dst */
movl ARG2, %eax /* esi = sc */
movl ARG2, %eax /* eax = fill byte (Aa) */
movl ARG3, %ecx /* ecx = len */
movb %al, %ah
movw %ax, %dx
shll $16, %eax
movw %dx, %ax
shrl $1, %ecx
jnc memset_skip_1
stosb
movb %al, %ah /* eax = | ?? ?? Aa Aa | */
movw %ax, %dx /* edx = | ?? ?? Aa Aa | */
shll $16, %eax /* eax = | Aa Aa ?? ?? | */
movw %dx, %ax /* eax = | Aa Aa Aa Aa | */
shrl $1, %ecx /* ecx = len \ 2 */
jnc memset_skip_1 /* if len mod 2 = 0, no 1 byte copy */
stosb /* store 1 byte */

LABEL(memset_skip_1)
shrl $1, %ecx
jnc memset_skip_2
stosw
shrl $1, %ecx /* ecx = len \ 4 */
jnc memset_skip_2 /* if (len \ 2) mod 2 = 0, no 2 byte store */
stosw /* store 2 bytes */

LABEL(memset_skip_2)
shrl $1, %ecx
jnc memset_skip_4
stosl
shrl $1, %ecx /* ecx = len \ 8 */
jnc memset_skip_4 /* if (len \ 4) mod 2 = 0, no 4 byte store */
stosl /* store 4 bytes */

LABEL(memset_skip_4)
orl %ecx, %ecx
jz memset_end
movd %eax, %mm0
punpckldq %mm0, %mm0
jz memset_end /* if nothing left to fill, skip */
movd %eax, %mm0 /* mm0 = ____ | ____ | AaAa | AaAa */
punpckldq %mm0, %mm0 /* mm0 = AaAa | AaAa | AaAa | AaAa */

LABEL(memset_loop)
movq %mm0, (%edi)
addl $8, %edi
decl %ecx
jnz memset_loop
movq %mm0, (%edi) /* *dst = mm0 */
addl $8, %edi /* dst ++ */
decl %ecx /* len -= 1 */
jnz memset_loop /* loop until all copied */

LABEL(memset_end)
emms
Expand All @@ -120,7 +124,7 @@ FUNC(fb_hPixelSet2MMX)
pushl %edi

movl ARG1, %edi /* edi = dst */
movl ARG2, %eax /* esi = sc */
movl ARG2, %eax /* eax = fill pixel (2 bytes) */
movl ARG3, %ecx /* ecx = len */
movw %ax, %dx
shll $16, %eax
Expand Down Expand Up @@ -167,7 +171,7 @@ FUNC(fb_hPixelSet4MMX)
pushl %edi

movl ARG1, %edi /* edi = dst */
movl ARG2, %eax /* esi = sc */
movl ARG2, %eax /* eax = fill pixel (4 bytes) */
movl ARG3, %ecx /* ecx = len */
shrl $1, %ecx
jnc pixelset4_skip_1
Expand Down Expand Up @@ -217,7 +221,7 @@ FUNC(fb_hPixelSetAlpha4MMX)
pushl %edi
pushl %ebx

movl ARG2, %esi /* esi = sc */
movl ARG2, %esi /* esi = fill color (4 bytes) */
movl ARG1, %edi /* edi = dst */
movd %esi, %mm6 /* mm6 = esi */
movl ARG3, %ecx /* ecx = len */
Expand Down
173 changes: 97 additions & 76 deletions src/gfxlib2/x86/gfx_put_alpha_mmx.s
Expand Up @@ -14,92 +14,113 @@ FUNC(fb_hPutAlpha4MMX)
pushl %esi
pushl %edi
pushl %ebx

/* ARG1: unsigned char *src */
/* ARG2: unsigned char *dest */
/* ARG3: int w */
/* ARG4: int h */
/* ARG5: int src_pitch */
/* ARG6: int dest_pitch */
/* ARG7: int alpha */
/* ARG8: BLENDER *blender */
/* ARG9: void *param */

movl ARG3, %ebx
shll $2, %ebx
movl ARG4, %edx
subl %ebx, ARG5
movl %edx, LOCAL1
movl ARG1, %esi
movl ARG6, %edx
movl ARG2, %edi
subl %ebx, %edx
movl %edx, LOCAL2
movl ARG3, %ebx /* ebx = w */
shll $2, %ebx /* ebx = w * 4 */
movl ARG4, %edx /* edx = h */
subl %ebx, ARG5 /* src_pitch = src_pitch - w * 4 */
movl %edx, LOCAL1 /* LOCAL1 = h */
movl ARG1, %esi /* esi = src */
movl ARG6, %edx /* edx = dest_pitch */
movl ARG2, %edi /* edi = dst */
subl %ebx, %edx /* edx = dest_pitch - w * 4 */
movl %edx, LOCAL2 /* LOCAL2 = dest_pitch - w * 4 */
movq GLOBL(__fb_gfx_rb_32), %mm5

LABEL(alpha4_y_loop)
movl ARG3, %ecx
shrl $1, %ecx
jnc alpha4_skip_1
addl $4, %edi
lodsl
movl %eax, LOCAL3
movl -4(%edi), %ebx
movl %eax, %ecx
movl %ebx, %edx
andl $MASK_RB_32, %eax
andl $MASK_RB_32, %edx
shrl $24, LOCAL3
subl %edx, %eax
imull LOCAL3
xchg %eax, %ecx
movl %ebx, %edx
andl $MASK_GA_32, %eax
andl $MASK_GA_32, %edx
subl %edx, %eax
shrl $8, %eax
imull LOCAL3
shrl $8, %ecx
movl %ebx, %edx
andl $MASK_RB_32, %ebx
andl $MASK_GA_32, %edx
addl %ecx, %ebx
addl %edx, %eax
andl $MASK_RB_32, %ebx
andl $MASK_GA_32, %eax
orl %ebx, %eax
movl %eax, -4(%edi)
movl ARG3, %ecx /* ecx = w */
shrl $1, %ecx /* ecx = w \ 2 */
jnc alpha4_skip_1 /* if w mod 2 = 0, no single odd pixel to blend */
addl $4, %edi /* dst += 4 */
lodsl /* eax = *src */
movl %eax, LOCAL3 /* LOCAL3 = *src */
movl -4(%edi), %ebx /* ebx = *dst */
movl %eax, %ecx /* ecx = *src */
movl %ebx, %edx /* edx = *dst */
andl $MASK_RB_32, %eax /* eax = __sr__sb (srb) */
andl $MASK_RB_32, %edx /* edx = __dr__db (drb) */
shrl $24, LOCAL3 /* LOCAL3 = ______aa (*src alpha) */
subl %edx, %eax /* eax = __sr__sb - __dr__db */
imull LOCAL3 /* eax = (srb - drb) * a */
xchg %eax, %ecx /* ecx = (srb - drb) * a, eax = *src */
movl %ebx, %edx /* edx = *dst */
andl $MASK_GA_32, %eax /* eax = sa__sg__ (sga) */
andl $MASK_GA_32, %edx /* edx = da__dg__ (dga) */
subl %edx, %eax /* eax = sa__sg__ - da__dg__ */
shrl $8, %eax /* eax = (sga - dga) >> 8 */
imull LOCAL3 /* eax = ((sga - dga) >> 8) * a */
shrl $8, %ecx /* ecx = ((srb - drb) * a) >> 8 */
movl %ebx, %edx /* edx = *dst */
andl $MASK_RB_32, %ebx /* ebx = __dr__db */
andl $MASK_GA_32, %edx /* edx = aa__gg__ */
addl %ecx, %ebx /* ebx += ((srb - drb) * a) >> 8 */
addl %edx, %eax /* eax += aa__gg__ */
andl $MASK_RB_32, %ebx /* ebx = __rr__bb */
andl $MASK_GA_32, %eax /* eax = aa__gg__ */
orl %ebx, %eax /* eax = aarrggbb */
movl %eax, -4(%edi) /* *(dst-4) = aarrggbb */

LABEL(alpha4_skip_1)
movl ARG3, %ecx
shrl $1, %ecx
jz alpha4_next_line
movl ARG3, %ecx /* ecx = w */
shrl $1, %ecx /* ecx = w \ 2 */
jz alpha4_next_line /* if w \ 2 = 0 next line */


movq %mm6, %mm0 /* mm0 = ssss ssss | ssss ssss */
movq (%edi), %mm1 /* mm1 = dddd dddd | dddd dddd */
movq %mm0, %mm2 /* mm2 = ssss ssss | ssss ssss */
movq %mm0, %mm3 /* mm3 = ssss ssss | ssss ssss */
movq %mm1, %mm4 /* mm4 = dddd dddd | dddd dddd */
psrld $24, %mm2 /* mm2 = ____ | __aa | ____ | __aa */
psrlw $8, %mm3 /* mm3 = __sa | __sg | __sa | __sg */
psrlw $8, %mm4 /* mm4 = __da | __dg | __da | __dg */
packssdw %mm2, %mm2 /* mm2 = __aa | __aa | __aa | __aa */

LABEL(alpha4_x_loop)
movq (%esi), %mm0
movq (%edi), %mm1
movq %mm0, %mm2
movq %mm0, %mm3
movq %mm1, %mm4
psrld $24, %mm2
psrlw $8, %mm3
psrlw $8, %mm4
packssdw %mm2, %mm2
pand %mm5, %mm0
pand %mm5, %mm1
punpcklwd %mm2, %mm2
psubw %mm1, %mm0
psubw %mm4, %mm3
pmullw %mm2, %mm0
pmullw %mm2, %mm3
psraw $8, %mm0
psraw $8, %mm3
paddw %mm1, %mm0
paddw %mm4, %mm3
pand %mm5, %mm0
psllw $8, %mm3
addl $8, %edi
por %mm3, %mm0
addl $8, %esi
movq %mm0, -8(%edi)
decl %ecx
jnz alpha4_x_loop
movq (%esi), %mm0 /* mm0 = ssss ssss | ssss ssss */
movq (%edi), %mm1 /* mm1 = dddd dddd | dddd dddd */
movq %mm0, %mm2 /* mm2 = ssss ssss | ssss ssss */
movq %mm0, %mm3 /* mm3 = ssss ssss | ssss ssss */
movq %mm1, %mm4 /* mm4 = dddd dddd | dddd dddd */
psrld $24, %mm2 /* mm2 = ____ | __sa | ____ | __sa */
psrlw $8, %mm3 /* mm3 = __sa | __sg | __sa | __sg */
psrlw $8, %mm4 /* mm4 = __da | __dg | __da | __dg */
packssdw %mm2, %mm2 /* mm2 = __sa | __sa | __sa | __sa */
pand %mm5, %mm0 /* mm0 = __sr | __sb | __sr | __sb */
pand %mm5, %mm1 /* mm1 = __dr | __db | __dr | __db */
punpcklwd %mm2, %mm2 /* mm2 = _sa1 | _sa1 | _sa0 | _sa0 (alpha of pixel 1, pixel 0) */
psubw %mm1, %mm0 /* mm0 = sr-dr| sb-db| sr-dr| sb-db */
psubw %mm4, %mm3 /* mm3 = sa-da| sg-dg| sa-da| sg-dg */
pmullw %mm2, %mm0 /* mm0 = rr** | bb** | rr** | bb** */
pmullw %mm2, %mm3 /* mm3 = aa** | gg** | aa** | gg** */
psraw $8, %mm0 /* mm0 = __rr | __bb | __rr | __bb */
psraw $8, %mm3 /* mm3 = __aa | __gg | __aa | __gg */
paddw %mm1, %mm0 /* mm0 = rr++ | bb++ | rr++ | bb++ */
paddw %mm4, %mm3 /* mm3 = aa++ | gg++ | aa++ | gg++ */
pand %mm5, %mm0 /* mm0 = __rr | __bb | __rr | __bb */
psllw $8, %mm3 /* mm3 = aa__ | gg__ | aa__ | gg__ */
addl $8, %edi /* dst += 8 */
por %mm3, %mm0 /* mm0 = aarr ggbb | aarr ggbb */
addl $8, %esi /* src += 8 */
movq %mm0, -8(%edi) /* *(dst-8) = argb, argb */
decl %ecx /* w -= 1 */
jnz alpha4_x_loop /* if w != 0 goto alpha4_x_loop */

LABEL(alpha4_next_line)
addl ARG5, %esi
addl LOCAL2, %edi
decl LOCAL1
jnz alpha4_y_loop
addl ARG5, %esi /* src += src_pitch - w * 4 */
addl LOCAL2, %edi /* dst += dest_pitch - w * 4 */
decl LOCAL1 /* h -= 1 */
jnz alpha4_y_loop /* if h != 0 goto alpha4_y_loop */

emms
popl %ebx
Expand Down

0 comments on commit 79b5fbd

Please sign in to comment.