From 485b2b5b4336ebb94fedc6db183f24375ec3abf8 Mon Sep 17 00:00:00 2001 From: Licai Guo Date: Wed, 30 Apr 2014 15:54:49 +0800 Subject: [PATCH 1/2] Add IntraSad asm code. Enable intraSad ASM code Refine format Add X86_ASM protect for intraSad ASM code UT remove duplicated code. --- codec/common/x86/asm_inc.asm | 3 + codec/common/x86/satd_sad.asm | 810 +++++++++++++++++--------- codec/encoder/core/inc/sample.h | 5 + codec/encoder/core/src/sample.cpp | 8 +- codec/encoder/core/x86/intra_pred.asm | 249 +------- test/encoder/EncUT_Sample.cpp | 129 ++++ 6 files changed, 692 insertions(+), 512 deletions(-) diff --git a/codec/common/x86/asm_inc.asm b/codec/common/x86/asm_inc.asm index ff1ae18cec..fc18ab2281 100644 --- a/codec/common/x86/asm_inc.asm +++ b/codec/common/x86/asm_inc.asm @@ -100,6 +100,7 @@ BITS 64 %define r1w dx %define r2w r8w %define r3w r9w +%define r6w r11w %define r0b cl %define r1b dl @@ -149,6 +150,7 @@ BITS 64 %define r1w si %define r2w dx %define r3w cx +%define r6w r10w %define r0b dil %define r1b sil @@ -198,6 +200,7 @@ BITS 32 %define r1w cx %define r2w dx %define r3w bx +%define r6w bp %define r0b al %define r1b cl diff --git a/codec/common/x86/satd_sad.asm b/codec/common/x86/satd_sad.asm index 3e47e5862f..5ef5a8fe11 100644 --- a/codec/common/x86/satd_sad.asm +++ b/codec/common/x86/satd_sad.asm @@ -353,6 +353,283 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 ; ;*********************************************************************** + +%macro SSE_DB_1_2REG 2 + pxor %1, %1 + pcmpeqw %2, %2 + psubb %1, %2 +%endmacro + +;*********************************************************************** +; +;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, +; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatdThree4x4_sse2 + +%ifdef X86_32 + push r3 + push r4 + push r5 + push r6 + %assign 
push_num 4 +%else + %assign push_num 0 +%endif + PUSH_XMM 8 + + mov r2, arg3 + mov r3, arg4 + SIGN_EXTENSION r3, r3d + + ; load source 4x4 samples and Hadamard transform + movd xmm0, [r2] + movd xmm1, [r2+r3] + lea r2 , [r2+2*r3] + movd xmm2, [r2] + movd xmm3, [r2+r3] + punpckldq xmm0, xmm2 + punpckldq xmm1, xmm3 + + pxor xmm6, xmm6 + punpcklbw xmm0, xmm6 + punpcklbw xmm1, xmm6 + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + SSE2_XSawp qdq, xmm0, xmm2, xmm3 + + movdqa xmm4, xmm0 + paddw xmm0, xmm3 + psubw xmm4, xmm3 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm4 + punpckhwd xmm4, xmm2 + + SSE2_XSawp dq, xmm0, xmm4, xmm3 + SSE2_XSawp qdq, xmm0, xmm3, xmm5 + + movdqa xmm7, xmm0 + paddw xmm0, xmm5 + psubw xmm7, xmm5 + + SSE2_XSawp qdq, xmm0, xmm7, xmm1 + + ; Hadamard transform results are saved in xmm0 and xmm2 + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + + ;load top boundary samples: [a b c d] + mov r0, arg1 + mov r1, arg2 + SIGN_EXTENSION r1, r1d + sub r0, r1 +%ifdef UNIX64 + push r4 + push r5 +%endif + + movzx r2d, byte [r0] + movzx r3d, byte [r0+1] + movzx r4d, byte [r0+2] + movzx r5d, byte [r0+3] + + ; get the transform results of top boundary samples: [a b c d] + add r3d, r2d ; r3d = a + b + add r5d, r4d ; r5d = c + d + add r2d, r2d ; r2d = a + a + add r4d, r4d ; r4d = c + c + sub r2d, r3d ; r2d = a + a - a - b = a - b + sub r4d, r5d ; r4d = c + c - c - d = c - d + add r5d, r3d ; r5d = (a + b) + (c + d) + add r3d, r3d + sub r3d, r5d ; r3d = (a + b) - (c + d) + add r4d, r2d ; r4d = (a - b) + (c - d) + add r2d, r2d + sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d] + + movdqa xmm6, xmm0 + movdqa xmm7, xmm2 + movd xmm5, r5d ; store r5d (sum of the top samples) for DC mode + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pinsrw xmm3, r5d, 0 + pinsrw xmm3, r4d, 4 + psllw xmm3, 2 + pinsrw xmm4, r3d, 0 + pinsrw xmm4, r2d, 4 + psllw xmm4, 2 + + ; get the satd of V + psubw xmm0, xmm3 + psubw xmm2, xmm4 + + WELS_AbsW xmm0, xmm1 + WELS_AbsW xmm2, xmm1 + paddusw xmm0, 
xmm2 + SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0 + + ;load left boundary samples: [a b c d]' + add r0, r1 + + movzx r2d, byte [r0-1] + movzx r3d, byte [r0+r1-1] + lea r0 , [r0+2*r1] + movzx r4d, byte [r0-1] + movzx r5d, byte [r0+r1-1] + + ; get the transform results of left boundary samples: [a b c d]' + add r3d, r2d ; r3d = a + b + add r5d, r4d ; r5d = c + d + add r2d, r2d ; r2d = a + a + add r4d, r4d ; r4d = c + c + sub r2d, r3d ; r2d = a + a - a - b = a - b + sub r4d, r5d ; r4d = c + c - c - d = c - d + add r5d, r3d ; r5d = (a + b) + (c + d) + add r3d, r3d + sub r3d, r5d ; r3d = (a + b) - (c + d) + add r4d, r2d ; r4d = (a - b) + (c - d) + add r2d, r2d + sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d] + + ; store the transform results in xmm3 + movd xmm3, r5d + pinsrw xmm3, r3d, 1 + pinsrw xmm3, r2d, 2 + pinsrw xmm3, r4d, 3 + psllw xmm3, 2 + + ; get the satd of H + movdqa xmm2, xmm6 + movdqa xmm4, xmm7 + psubw xmm2, xmm3 + WELS_AbsW xmm2, xmm1 + WELS_AbsW xmm4, xmm1 + paddusw xmm2, xmm4 + SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2 + + ; DC result is stored in xmm1 + add r5d, 4 + movd xmm1, r5d + paddw xmm1, xmm5 + psrlw xmm1, 3 + movdqa xmm5, xmm1 + psllw xmm1, 4 + + ; get the satd of DC + psubw xmm6, xmm1 + WELS_AbsW xmm6, xmm1 + WELS_AbsW xmm7, xmm1 + paddusw xmm6, xmm7 + SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6 +%ifdef UNIX64 + pop r5 + pop r4 +%endif + ; comparing order: DC H V + + mov r4, arg5 + movd r2d, xmm6 + movd r3d, xmm2 + movd r6d, xmm0 + + and r2d, 0xffff + shr r2d, 1 + and r3d, 0xffff + shr r3d, 1 + and r6d, 0xffff + shr r6d, 1 + add r2d, dword arg7 + add r3d, dword arg8 + add r6d, dword arg9 + cmp r2w, r3w + jg near not_dc + cmp r2w, r6w + jg near not_dc_h + + ; for DC mode + movd r3d, xmm5 + imul r3d, 0x01010101 + movd xmm5, r3d + pshufd xmm5, xmm5, 0 + movdqa [r4], xmm5 + mov r5, arg6 + mov dword [r5], 0x02 + mov retrd, r2d + POP_XMM +%ifdef X86_32 + pop r6 + pop r5 + pop r4 + pop r3 
+%endif + ret + +not_dc: + cmp r3w, r6w + jg near not_dc_h + + ; for H mode + SSE_DB_1_2REG xmm6, xmm7 + sub r0, r1 + sub r0, r1 + movzx r6d, byte [r0-1] + movd xmm0, r6d + pmuludq xmm0, xmm6 + + movzx r6d, byte [r0+r1-1] + movd xmm1, r6d + pmuludq xmm1, xmm6 + punpckldq xmm0, xmm1 + + lea r0, [r0+r1*2] + movzx r6d, byte [r0-1] + movd xmm2, r6d + pmuludq xmm2, xmm6 + + movzx r6d, byte [r0+r1-1] + movd xmm3, r6d + pmuludq xmm3, xmm6 + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + + movdqa [r4],xmm0 + + mov retrd, r3d + mov r5, arg6 + mov dword [r5], 0x01 + POP_XMM +%ifdef X86_32 + pop r6 + pop r5 + pop r4 + pop r3 +%endif + ret +not_dc_h: + sub r0, r1 + sub r0, r1 + sub r0, r1 + movd xmm0, [r0] + pshufd xmm0, xmm0, 0 + movdqa [r4],xmm0 + mov retrd, r6d + mov r5, arg6 + mov dword [r5], 0x00 + POP_XMM +%ifdef X86_32 + pop r6 + pop r5 + pop r4 + pop r3 +%endif + ret + + %macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 pmaddubsw %1, xmm5 movdqa %2, %1 @@ -390,12 +667,12 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 %macro SSE41_GetX38x4SatdDec 0 pxor xmm7, xmm7 - movq xmm0, [eax] - movq xmm1, [eax+ebx] - lea eax, [eax+2*ebx] - movq xmm2, [eax] - movq xmm3, [eax+ebx] - lea eax, [eax+2*ebx] + movq xmm0, [r2] + movq xmm1, [r2+r3] + lea r2, [r2+2*r3] + movq xmm2, [r2] + movq xmm3, [r2+r3] + lea r2, [r2+2*r3] punpcklbw xmm0, xmm7 punpcklbw xmm1, xmm7 punpcklbw xmm2, xmm7 @@ -405,34 +682,35 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2 ;doesn't need another transpose %endmacro + %macro SSE41_GetX38x4SatdV 2 pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2], 0 - pinsrw xmm0, word[esi+%2+8], 4 + pinsrw xmm0, word[r6+%2], 0 + pinsrw xmm0, word[r6+%2+8], 4 psubsw xmm0, xmm7 pabsw xmm0, xmm0 paddw xmm4, xmm0 pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2+2], 0 - pinsrw xmm0, word[esi+%2+10], 4 + pinsrw xmm0, word[r6+%2+2], 0 + pinsrw xmm0, word[r6+%2+10], 4 psubsw xmm0, xmm1 pabsw xmm0, 
xmm0 paddw xmm4, xmm0 pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2+4], 0 - pinsrw xmm0, word[esi+%2+12], 4 + pinsrw xmm0, word[r6+%2+4], 0 + pinsrw xmm0, word[r6+%2+12], 4 psubsw xmm0, xmm3 pabsw xmm0, xmm0 paddw xmm4, xmm0 pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2+6], 0 - pinsrw xmm0, word[esi+%2+14], 4 + pinsrw xmm0, word[r6+%2+6], 0 + pinsrw xmm0, word[r6+%2+14], 4 psubsw xmm0, xmm2 pabsw xmm0, xmm0 paddw xmm4, xmm0 %endmacro %macro SSE41_GetX38x4SatdH 3 - movq xmm0, [esi+%3+8*%1] + movq xmm0, [r6+%3+8*%1] punpcklqdq xmm0, xmm0 psubsw xmm0, xmm7 pabsw xmm0, xmm0 @@ -455,7 +733,7 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 %endmacro %macro SSE41_ChromaGetX38x4SatdDC 1 shl %1, 4 - movdqa xmm0, [esi+32+%1] + movdqa xmm0, [r6+32+%1] psubsw xmm0, xmm7 pabsw xmm0, xmm0 paddw xmm6, xmm0 @@ -481,83 +759,93 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 paddd %1, %3 %endmacro - -%ifdef X86_32 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41 - push ebx - push esi - push edi - mov ecx, [esp+16] - mov edx, [esp+20] - mov eax, [esp+24] - mov ebx, [esp+28] - mov esi, [esp+40] ;temp_satd + %assign push_num 0 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d + +%ifndef X86_32 + push r12 + mov r12, r2 +%endif + pxor xmm4, xmm4 movdqa xmm5, [HSumSubDB1] movdqa xmm6, [HSumSubDW1] movdqa xmm7, [PDW1] - sub ecx, edx - movdqu xmm0, [ecx] + sub r0, r1 + movdqu xmm0, [r0] movhlps xmm1, xmm0 punpcklqdq xmm0, xmm0 punpcklqdq xmm1, xmm1 SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 - movdqa [esi], xmm0 ;V - movdqa [esi+16], xmm1 - add ecx, edx - pinsrb xmm0, byte[ecx-1], 0 - pinsrb xmm0, byte[ecx+edx-1], 1 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 2 - pinsrb xmm0, byte[ecx+edx-1], 3 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 4 - pinsrb xmm0, byte[ecx+edx-1], 5 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 6 - pinsrb xmm0, byte[ecx+edx-1], 7 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 8 - 
pinsrb xmm0, byte[ecx+edx-1], 9 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 10 - pinsrb xmm0, byte[ecx+edx-1], 11 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 12 - pinsrb xmm0, byte[ecx+edx-1], 13 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 14 - pinsrb xmm0, byte[ecx+edx-1], 15 + movdqa [r6], xmm0 ;V + movdqa [r6+16], xmm1 + add r0, r1 + pinsrb xmm0, byte[r0-1], 0 + pinsrb xmm0, byte[r0+r1-1], 1 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 2 + pinsrb xmm0, byte[r0+r1-1], 3 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 4 + pinsrb xmm0, byte[r0+r1-1], 5 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 6 + pinsrb xmm0, byte[r0+r1-1], 7 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 8 + pinsrb xmm0, byte[r0+r1-1], 9 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 10 + pinsrb xmm0, byte[r0+r1-1], 11 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 12 + pinsrb xmm0, byte[r0+r1-1], 13 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 14 + pinsrb xmm0, byte[r0+r1-1], 15 movhlps xmm1, xmm0 punpcklqdq xmm0, xmm0 punpcklqdq xmm1, xmm1 SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 - movdqa [esi+32], xmm0 ;H - movdqa [esi+48], xmm1 - movd ecx, xmm4 ;dc - add ecx, 16 ;(sum+16) - shr ecx, 5 ;((sum+16)>>5) - shl ecx, 4 ; - movd mm4, ecx ; mm4 copy DC + movdqa [r6+32], xmm0 ;H + movdqa [r6+48], xmm1 + movd r0d, xmm4 ;dc + add r0d, 16 ;(sum+16) + shr r0d, 5 ;((sum+16)>>5) + shl r0d, 4 ; + movd mm4, r0d ; mm4 copy DC pxor xmm4, xmm4 ;V pxor xmm5, xmm5 ;H pxor xmm6, xmm6 ;DC - mov ecx, 0 - mov edi, 0 +%ifdef UNIX64 + push r4 +%endif + mov r0, 0 + mov r4, 0 + .loop16x16_get_satd: .loopStart1: - SSE41_I16x16GetX38x4Satd ecx, edi - inc ecx - cmp ecx, 4 + SSE41_I16x16GetX38x4Satd r0, r4 + inc r0 + cmp r0, 4 jl .loopStart1 - cmp edi, 16 + cmp r4, 16 je .loop16x16_get_satd_end - mov eax, [esp+24] - add eax, 8 - mov ecx, 0 - add edi, 16 +%ifdef X86_32 + mov r2, arg3 +%else + mov r2, r12 +%endif + add r2, 8 + mov r0, 0 + add r4, 16 jmp 
.loop16x16_get_satd .loop16x16_get_satd_end: MMX_DW_1_2REG xmm0, xmm1 @@ -568,66 +856,70 @@ WELS_EXTERN WelsIntra16x16Combined3Satd_sse41 SSE41_HSum8W xmm5, xmm0, xmm1 SSE41_HSum8W xmm6, xmm0, xmm1 +%ifdef UNIX64 + pop r4 +%endif ; comparing order: DC H V - movd ebx, xmm6 ;DC - movd edi, xmm5 ;H - movd ecx, xmm4 ;V - mov edx, [esp+36] - shl edx, 1 - add edi, edx - add ebx, edx - mov edx, [esp+32] - cmp ebx, edi + movd r3d, xmm6 ;DC + movd r1d, xmm5 ;H + movd r0d, xmm4 ;V +%ifndef X86_32 + pop r12 +%endif + shl r5d, 1 + add r1d, r5d + add r3d, r5d + mov r4, arg5 + cmp r3d, r1d jge near not_dc_16x16 - cmp ebx, ecx + cmp r3d, r0d jge near not_dc_h_16x16 ; for DC mode - mov dword[edx], 2;I16_PRED_DC - mov eax, ebx + mov dword[r4], 2;I16_PRED_DC + mov retrd, r3d jmp near return_satd_intra_16x16_x3 not_dc_16x16: ; for H mode - cmp edi, ecx + cmp r1d, r0d jge near not_dc_h_16x16 - mov dword[edx], 1;I16_PRED_H - mov eax, edi + mov dword[r4], 1;I16_PRED_H + mov retrd, r1d jmp near return_satd_intra_16x16_x3 not_dc_h_16x16: ; for V mode - mov dword[edx], 0;I16_PRED_V - mov eax, ecx + mov dword[r4], 0;I16_PRED_V + mov retrd, r0d return_satd_intra_16x16_x3: WELSEMMS - pop edi - pop esi - pop ebx + POP_XMM + LOAD_7_PARA_POP ret %macro SSE41_ChromaGetX38x8Satd 0 movdqa xmm5, [HSumSubDB1] movdqa xmm6, [HSumSubDW1] movdqa xmm7, [PDW1] - sub ecx, edx - movq xmm0, [ecx] + sub r0, r1 + movq xmm0, [r0] punpcklqdq xmm0, xmm0 SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4 - movdqa [esi], xmm0 ;V - add ecx, edx - pinsrb xmm0, byte[ecx-1], 0 - pinsrb xmm0, byte[ecx+edx-1], 1 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 2 - pinsrb xmm0, byte[ecx+edx-1], 3 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 4 - pinsrb xmm0, byte[ecx+edx-1], 5 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 6 - pinsrb xmm0, byte[ecx+edx-1], 7 + movdqa [r6], xmm0 ;V + add r0, r1 + pinsrb xmm0, byte[r0-1], 0 + pinsrb xmm0, byte[r0+r1-1], 1 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 2 + pinsrb xmm0, 
byte[r0+r1-1], 3 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 4 + pinsrb xmm0, byte[r0+r1-1], 5 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 6 + pinsrb xmm0, byte[r0+r1-1], 7 punpcklqdq xmm0, xmm0 SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1 - movdqa [esi+16], xmm0 ;H + movdqa [r6+16], xmm0 ;H ;(sum+2)>>2 movdqa xmm6, [PDQ2] movdqa xmm5, xmm4 @@ -647,21 +939,19 @@ ret punpcklqdq xmm4, xmm5 psllq xmm4, 32 psrlq xmm4, 32 - movdqa [esi+32], xmm4 + movdqa [r6+32], xmm4 punpckhqdq xmm5, xmm6 psllq xmm5, 32 psrlq xmm5, 32 - movdqa [esi+48], xmm5 + movdqa [r6+48], xmm5 pxor xmm4, xmm4 ;V pxor xmm5, xmm5 ;H pxor xmm6, xmm6 ;DC - mov ecx, 0 -loop_chroma_satdx3_cb_cr: - SSE41_ChromaGetX38x4Satd ecx, 0 - inc ecx - cmp ecx, 2 - jl loop_chroma_satdx3_cb_cr + mov r0, 0 + SSE41_ChromaGetX38x4Satd r0, 0 + inc r0 + SSE41_ChromaGetX38x4Satd r0, 0 %endmacro %macro SSEReg2MMX 3 @@ -677,27 +967,22 @@ loop_chroma_satdx3_cb_cr: ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41 WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41 - push ebx - push esi - push edi - mov ecx, [esp+16] - mov edx, [esp+20] - mov eax, [esp+24] - mov ebx, [esp+28] - mov esi, [esp+40] ;temp_satd - xor edi, edi + %assign push_num 0 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d loop_chroma_satdx3: SSE41_ChromaGetX38x8Satd - cmp edi, 1 - je loop_chroma_satdx3end - inc edi SSEReg2MMX xmm4, mm0,mm1 SSEReg2MMX xmm5, mm2,mm3 SSEReg2MMX xmm6, mm5,mm6 - mov ecx, [esp+44] - mov eax, [esp+48] - jmp loop_chroma_satdx3 -loop_chroma_satdx3end: + mov r0, arg8 + mov r2, arg9 + + SSE41_ChromaGetX38x8Satd + MMXReg2SSE xmm0, xmm3, mm0, mm1 MMXReg2SSE xmm1, xmm3, mm2, mm3 MMXReg2SSE xmm2, xmm3, mm5, mm6 @@ -714,39 +999,38 @@ loop_chroma_satdx3end: SSE41_HSum8W xmm5, xmm0, xmm1 SSE41_HSum8W xmm6, xmm0, xmm1 ; comparing order: DC H V - movd ebx, xmm6 ;DC - movd edi, xmm5 ;H - movd ecx, xmm4 ;V - mov edx, [esp+36] - shl edx, 1 - add edi, edx - add ecx, edx - 
mov edx, [esp+32] - cmp ebx, edi + movd r3d, xmm6 ;DC + movd r1d, xmm5 ;H + movd r0d, xmm4 ;V + + + shl r5d, 1 + add r1d, r5d + add r0d, r5d + cmp r3d, r1d jge near not_dc_8x8 - cmp ebx, ecx + cmp r3d, r0d jge near not_dc_h_8x8 ; for DC mode - mov dword[edx], 0;I8_PRED_DC - mov eax, ebx + mov dword[r4], 0;I8_PRED_DC + mov retrd, r3d jmp near return_satd_intra_8x8_x3 not_dc_8x8: ; for H mode - cmp edi, ecx + cmp r1d, r0d jge near not_dc_h_8x8 - mov dword[edx], 1;I8_PRED_H - mov eax, edi + mov dword[r4], 1;I8_PRED_H + mov retrd, r1d jmp near return_satd_intra_8x8_x3 not_dc_h_8x8: ; for V mode - mov dword[edx], 2;I8_PRED_V - mov eax, ecx + mov dword[r4], 2;I8_PRED_V + mov retrd, r0d return_satd_intra_8x8_x3: WELSEMMS - pop edi - pop esi - pop ebx + POP_XMM + LOAD_7_PARA_POP ret @@ -769,9 +1053,9 @@ ret paddw xmm3,xmm6 %endmacro %macro WelsAddDCValue 4 - movzx %2, byte %1 - mov %3, %2 - add %4, %2 + movzx %2, byte %1 + mov %3, %2 + add %4, %2 %endmacro ;*********************************************************************** @@ -780,133 +1064,139 @@ ret ; ;*********************************************************************** WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3 - push ebx - push esi - push edi - mov ecx, [esp+16] - mov edx, [esp+20] - mov edi, [esp+40] ;temp_sad - sub ecx, edx - movdqa xmm5,[ecx] - pxor xmm0,xmm0 - psadbw xmm0,xmm5 - movhlps xmm1,xmm0 - paddw xmm0,xmm1 - movd eax,xmm0 - - add ecx,edx - lea ebx, [edx+2*edx] - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - lea ecx, [ecx+4*edx] - add edi, 64 - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - lea ecx, [ecx+4*edx] - add edi, 64 - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, 
[edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - lea ecx, [ecx+4*edx] - add edi, 64 - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - sub edi, 192 - add eax,10h - shr eax,5 - movd xmm7,eax - pxor xmm1,xmm1 - pshufb xmm7,xmm1 - pxor xmm4,xmm4 - pxor xmm3,xmm3 - pxor xmm2,xmm2 -;sad begin - mov eax, [esp+24] - mov ebx, [esp+28] - lea esi, [ebx+2*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - add edi, 64 - lea eax, [eax+4*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - add edi, 64 - lea eax, [eax+4*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - add edi, 64 - lea eax, [eax+4*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - - pslldq xmm3,4 - por xmm3,xmm2 - movhlps xmm1,xmm3 - paddw xmm3,xmm1 - movhlps xmm0,xmm4 - paddw xmm4,xmm0 -; comparing order: DC H V - movd ebx, xmm4 ;DC - movd ecx, xmm3 ;V + %assign push_num 0 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d + + push r5 + push r4 + push r3 + + sub r0, r1 + movdqa xmm5,[r0] + pxor xmm0,xmm0 + psadbw xmm0,xmm5 + movhlps xmm1,xmm0 + paddw xmm0,xmm1 + movd r5d, xmm0 + + add r0,r1 + lea r3,[r1+2*r1] ;ebx r3 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + 
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + lea r0, [r0+4*r1] + add r6, 64 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + lea r0, [r0+4*r1] + add r6, 64 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + lea r0, [r0+4*r1] + add r6, 64 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + sub r6, 192 + add r5d,10h + shr r5d,5 + movd xmm7,r5d + pxor xmm1,xmm1 + pshufb xmm7,xmm1 + pxor xmm4,xmm4 + pxor xmm3,xmm3 + pxor xmm2,xmm2 + ;sad begin + pop r3 + lea r4, [r3+2*r3] ;esi r4 + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + add r6, 64 + lea r2, [r2+4*r3] + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + add r6, 64 + lea r2, [r2+4*r3] + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + add r6, 64 + lea r2, [r2+4*r3] + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + + pop r4 + pop r5 + pslldq xmm3,4 + por xmm3,xmm2 + movhlps xmm1,xmm3 + paddw xmm3,xmm1 + movhlps xmm0,xmm4 + paddw xmm4,xmm0 + ; comparing order: DC H V + movd r1d, xmm4 ;DC ;ebx r1d + movd r0d, xmm3 ;V ;ecx r0d psrldq xmm3, 4 - movd esi, xmm3 ;H - mov eax, [esp+36] ;lamda - shl eax, 1 - add esi, eax - add ebx, eax - mov edx, [esp+32] - cmp ebx, esi + movd r2d, xmm3 ;H ;esi r2d + + ;mov eax, 
[esp+36] ;lamda ;eax r5 + shl r5d, 1 + add r2d, r5d + add r1d, r5d + ;mov edx, [esp+32] ;edx r4 + cmp r1d, r2d jge near not_dc_16x16_sad - cmp ebx, ecx + cmp r1d, r0d jge near not_dc_h_16x16_sad ; for DC mode - mov dword[edx], 2;I16_PRED_DC - mov eax, ebx - sub edi, 192 + mov dword[r4], 2;I16_PRED_DC + mov retrd, r1d + sub r6, 192 %assign x 0 %rep 16 - movdqa [edi+16*x], xmm7 + movdqa [r6+16*x], xmm7 %assign x x+1 %endrep jmp near return_sad_intra_16x16_x3 not_dc_16x16_sad: ; for H mode - cmp esi, ecx + cmp r2d, r0d jge near not_dc_h_16x16_sad - mov dword[edx], 1;I16_PRED_H - mov eax, esi + mov dword[r4], 1;I16_PRED_H + mov retrd, r2d jmp near return_sad_intra_16x16_x3 not_dc_h_16x16_sad: ; for V mode - mov dword[edx], 0;I16_PRED_V - mov eax, ecx - sub edi, 192 + mov dword[r4], 0;I16_PRED_V + mov retrd, r0d + sub r6, 192 %assign x 0 %rep 16 - movdqa [edi+16*x], xmm5 + movdqa [r6+16*x], xmm5 %assign x x+1 %endrep return_sad_intra_16x16_x3: - pop edi - pop esi - pop ebx + POP_XMM + LOAD_7_PARA_POP ret -%endif + ;*********************************************************************** ; ;Pixel_sad_intra_ssse3 END diff --git a/codec/encoder/core/inc/sample.h b/codec/encoder/core/inc/sample.h index b52e73b9a7..e68f5456d8 100644 --- a/codec/encoder/core/inc/sample.h +++ b/codec/encoder/core/inc/sample.h @@ -47,6 +47,11 @@ int32_t WelsSampleSatd8x8_c (uint8_t*, int32_t, uint8_t*, int32_t); //int32_t WelsSampleSatd4x8( uint8_t *, int32_t, uint8_t *, int32_t ); int32_t WelsSampleSatd4x4_c (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSatdIntra4x4Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t); +int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); +int32_t WelsSampleSadIntra16x16Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); +int32_t WelsSampleSatdIntra8x8Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, 
int32_t, uint8_t*, + uint8_t*, uint8_t*); #if defined(__cplusplus) extern "C" { diff --git a/codec/encoder/core/src/sample.cpp b/codec/encoder/core/src/sample.cpp index 4f3bda8b3f..bcc3465c85 100644 --- a/codec/encoder/core/src/sample.cpp +++ b/codec/encoder/core/src/sample.cpp @@ -367,11 +367,11 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2; pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2; pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2; - //pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2; + pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2; } if (uiCpuFlag & WELS_CPU_SSSE3) { - //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3; + pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3; } if (uiCpuFlag & WELS_CPU_SSE41) { @@ -380,8 +380,8 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41; pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41; pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41; - //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41; - //pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41; + pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41; + pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41; } #endif //(X86_ASM) diff --git a/codec/encoder/core/x86/intra_pred.asm b/codec/encoder/core/x86/intra_pred.asm index 2eb9faef49..12647a2b58 100644 --- 
a/codec/encoder/core/x86/intra_pred.asm +++ b/codec/encoder/core/x86/intra_pred.asm @@ -1166,251 +1166,4 @@ WELS_EXTERN WelsI16x16LumaPredDc_sse2 pop r4 pop r3 - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, -; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t); -; -;*********************************************************************** -%ifdef X86_32 -WELS_EXTERN WelsSampleSatdThree4x4_sse2 - push ebx - push esi - push edi - mov eax, [esp+24];p_enc - mov ebx, [esp+28];linesize_enc - - ; load source 4x4 samples and Hadamard transform - movd xmm0, [eax] - movd xmm1, [eax+ebx] - lea eax , [eax+2*ebx] - movd xmm2, [eax] - movd xmm3, [eax+ebx] - punpckldq xmm0, xmm2 - punpckldq xmm1, xmm3 - - pxor xmm6, xmm6 - punpcklbw xmm0, xmm6 - punpcklbw xmm1, xmm6 - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - SSE2_XSawp qdq, xmm0, xmm2, xmm3 - - movdqa xmm4, xmm0 - paddw xmm0, xmm3 - psubw xmm4, xmm3 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm4 - punpckhwd xmm4, xmm2 - - SSE2_XSawp dq, xmm0, xmm4, xmm3 - SSE2_XSawp qdq, xmm0, xmm3, xmm5 - - movdqa xmm7, xmm0 - paddw xmm0, xmm5 - psubw xmm7, xmm5 - - SSE2_XSawp qdq, xmm0, xmm7, xmm1 - - ; Hadamard transform results are saved in xmm0 and xmm2 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - - ; load top boundary samples: [a b c d] - mov eax, [esp+16];p_dec - sub eax, [esp+20];linesize_dec - movzx ecx, byte [eax] - movzx edx, byte [eax+1] - movzx esi, byte [eax+2] - movzx edi, byte [eax+3] - - ; get the transform results of top boundary samples: [a b c d] - add edx, ecx ; edx = a + b - add edi, esi ; edi = c + d - add ecx, ecx ; ecx = a + a - add esi, esi ; esi = c + c - sub ecx, edx ; ecx = a + a - a - b = a - b - sub esi, edi ; esi = c + c - c - d = c - d - add edi, edx ; edi = (a + b) + (c + d) - add edx, edx - sub edx, edi ; edx = (a + b) - (c 
+ d) - add esi, ecx ; esi = (a - b) + (c - d) - add ecx, ecx - sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi] - - movdqa xmm6, xmm0 - movdqa xmm7, xmm2 - movd xmm5, edi ; store the edi for DC mode - pxor xmm3, xmm3 - pxor xmm4, xmm4 - pinsrw xmm3, edi, 0 - pinsrw xmm3, esi, 4 - psllw xmm3, 2 - pinsrw xmm4, edx, 0 - pinsrw xmm4, ecx, 4 - psllw xmm4, 2 - - ; get the satd of H - psubw xmm0, xmm3 - psubw xmm2, xmm4 - - WELS_AbsW xmm0, xmm1 - WELS_AbsW xmm2, xmm1 - paddusw xmm0, xmm2 - SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0 - - ; load left boundary samples: [a b c d]' - mov eax, [esp+16] - mov ebx, [esp+20] - movzx ecx, byte [eax-1] - movzx edx, byte [eax+ebx-1] - lea eax , [eax+2*ebx] - movzx esi, byte [eax-1] - movzx edi, byte [eax+ebx-1] - - ; get the transform results of left boundary samples: [a b c d]' - add edx, ecx ; edx = a + b - add edi, esi ; edi = c + d - add ecx, ecx ; ecx = a + a - add esi, esi ; esi = c + c - sub ecx, edx ; ecx = a + a - a - b = a - b - sub esi, edi ; esi = c + c - c - d = c - d - add edi, edx ; edi = (a + b) + (c + d) - add edx, edx - sub edx, edi ; edx = (a + b) - (c + d) - add esi, ecx ; esi = (a - b) + (c - d) - add ecx, ecx - sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]' - - ; store the transform results in xmm3 - movd xmm3, edi - pinsrw xmm3, edx, 1 - pinsrw xmm3, ecx, 2 - pinsrw xmm3, esi, 3 - psllw xmm3, 2 - - ; get the satd of V - movdqa xmm2, xmm6 - movdqa xmm4, xmm7 - psubw xmm2, xmm3 - WELS_AbsW xmm2, xmm1 - WELS_AbsW xmm4, xmm1 - paddusw xmm2, xmm4 - SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2 - - ; DC result is stored in xmm1 - add edi, 4 - movd xmm1, edi - paddw xmm1, xmm5 - psrlw xmm1, 3 - movdqa xmm5, xmm1 - psllw xmm1, 4 - - ; get the satd of DC - psubw xmm6, xmm1 - WELS_AbsW xmm6, xmm1 - WELS_AbsW xmm7, xmm1 - paddusw xmm6, xmm7 - SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6 - - ; comparing order: DC H V - mov edx, [esp+32] - movd eax, xmm6 - movd edi, xmm2 
- movd esi, xmm0 - and eax, 0xffff - shr eax, 1 - and edi, 0xffff - shr edi, 1 - and esi, 0xffff - shr esi, 1 - add eax, [esp+40] - add edi, [esp+44] - add esi, [esp+48] - cmp ax, di - jg near not_dc - cmp ax, si - jg near not_dc_h - - ; for DC mode - movd ebx, xmm5 - imul ebx, 0x01010101 - movd xmm5, ebx - pshufd xmm5, xmm5, 0 - movdqa [edx], xmm5 - mov ebx, [esp+36] - mov dword [ebx], 0x02 - pop edi - pop esi - pop ebx - ret - -not_dc: - cmp di, si - jg near not_dc_h - - ; for H mode - SSE_DB_1_2REG xmm6, xmm7 - mov eax, [esp+16] - mov ebx, [esp+20] - movzx ecx, byte [eax-1] - movd xmm0, ecx - pmuludq xmm0, xmm6 - - movzx ecx, byte [eax+ebx-1] - movd xmm1, ecx - pmuludq xmm1, xmm6 -%if 1 - punpckldq xmm0, xmm1 -%else - unpcklps xmm0, xmm1 -%endif - lea eax, [eax+ebx*2] - movzx ecx, byte [eax-1] - movd xmm2, ecx - pmuludq xmm2, xmm6 - - movzx ecx, byte [eax+ebx-1] - movd xmm3, ecx - pmuludq xmm3, xmm6 -%if 1 - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 -%else - unpcklps xmm2, xmm3 - unpcklpd xmm0, xmm2 -%endif - movdqa [edx],xmm0 - - mov eax, edi - mov ebx, [esp+36] - mov dword [ebx], 0x01 - - pop edi - pop esi - pop ebx - ret -not_dc_h: - ; for V mode - mov eax, [esp+16] - sub eax, [esp+20] - movd xmm0, [eax] - pshufd xmm0, xmm0, 0 - movdqa [edx],xmm0 - - mov eax, esi - mov ebx, [esp+36] - mov dword [ebx], 0x00 - - pop edi - pop esi - pop ebx - ret -%endif - + ret \ No newline at end of file diff --git a/test/encoder/EncUT_Sample.cpp b/test/encoder/EncUT_Sample.cpp index a7ba56545b..51cbc9ccb6 100644 --- a/test/encoder/EncUT_Sample.cpp +++ b/test/encoder/EncUT_Sample.cpp @@ -9,7 +9,136 @@ #include "sad_common.h" using namespace WelsSVCEnc; +#ifdef X86_ASM +TEST(IntraSadSatdFuncTest, WelsIntra16x16Combined3Sad_ssse3){ + const int32_t iLineSizeDec = 32; + const int32_t iLineSizeEnc = 32; + int32_t tmpa, tmpb; + int32_t iBestMode_c, iBestMode_a, iLambda = 50; + CMemoryAlign cMemoryAlign(0); + int32_t iCpuCores = 0; + uint32_t m_uiCpuFeatureFlag = 
WelsCPUFeatureDetect(&iCpuCores); + if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSSE3)) + return; + uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec"); + uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc"); + uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst"); + srand((uint32_t)time(NULL)); + for(int i=0;i<(iLineSizeDec<<5);i++) + pDec[i]=rand()%256; + for(int i=0;i<(iLineSizeEnc<<5);i++) + pEnc[i]=rand()%256; + + for(int i=0;i<512;i++) + pDst[i]=rand()%256; + tmpa = WelsSampleSadIntra16x16Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_c, iLambda, pDst); + tmpb = WelsIntra16x16Combined3Sad_ssse3(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_a, iLambda, pDst); + + ASSERT_EQ(tmpa, tmpb); + ASSERT_EQ(iBestMode_c, iBestMode_a); + + cMemoryAlign.WelsFree(pDec,"pDec"); + cMemoryAlign.WelsFree(pEnc,"pEnc"); + cMemoryAlign.WelsFree(pDst,"pDst"); +} +TEST(IntraSadSatdFuncTest, WelsIntra16x16Combined3Satd_sse41){ + const int32_t iLineSizeDec = 32; + const int32_t iLineSizeEnc = 32; + int32_t tmpa, tmpb; + int32_t iBestMode_c, iBestMode_a, iLambda = 50; + CMemoryAlign cMemoryAlign(0); + int32_t iCpuCores = 0; + uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores); + if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE41)) + return; + uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec"); + uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc"); + uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst"); + srand((uint32_t)time(NULL)); + for(int i=0;i<(iLineSizeDec<<5);i++) + pDec[i]=rand()%256; + for(int i=0;i<(iLineSizeEnc<<5);i++) + pEnc[i]=rand()%256; + for(int i=0;i<512;i++) + pDst[i]=rand()%256; + tmpa = WelsSampleSatdIntra16x16Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_c, iLambda, pDst); + tmpb = WelsIntra16x16Combined3Satd_sse41(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_a, iLambda, pDst); + ASSERT_EQ(tmpa, 
tmpb); + ASSERT_EQ(iBestMode_c, iBestMode_a); + cMemoryAlign.WelsFree(pDec,"pDec"); + cMemoryAlign.WelsFree(pEnc,"pEnc"); + cMemoryAlign.WelsFree(pDst,"pDst"); +} + +TEST(IntraSadSatdFuncTest, WelsSampleSatdThree4x4_sse2){ + const int32_t iLineSizeDec = 32; + const int32_t iLineSizeEnc = 32; + int32_t tmpa, tmpb; + int32_t iBestMode_c, iBestMode_a, iLambda = 50; + int32_t lambda[2] = {iLambda << 2, iLambda}; + int32_t iPredMode = rand()%3; + CMemoryAlign cMemoryAlign(0); + int32_t iCpuCores = 0; + uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores); + if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE2)) + return; + uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec"); + uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc"); + uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst"); + srand((uint32_t)time(NULL)); + for(int i=0;i<(iLineSizeDec<<5);i++) + pDec[i]=rand()%256; + for(int i=0;i<(iLineSizeEnc<<5);i++) + pEnc[i]=rand()%256; + for(int i=0;i<512;i++) + pDst[i]=rand()%256; + tmpa = WelsSampleSatdIntra4x4Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc, pDst, &iBestMode_c, lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]); + tmpb = WelsSampleSatdThree4x4_sse2(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc, pDst, &iBestMode_a, lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]); + ASSERT_EQ(tmpa, tmpb); + ASSERT_EQ(iBestMode_c, iBestMode_a); + cMemoryAlign.WelsFree(pDec,"pDec"); + cMemoryAlign.WelsFree(pEnc,"pEnc"); + cMemoryAlign.WelsFree(pDst,"pDst"); +} + +TEST(IntraSadSatdFuncTest, WelsIntraChroma8x8Combined3Satd_sse41){ + const int32_t iLineSizeDec = 32; + const int32_t iLineSizeEnc = 32; + int32_t tmpa, tmpb; + int32_t iBestMode_c, iBestMode_a, iLambda = 50; + CMemoryAlign cMemoryAlign(0); + int32_t iCpuCores = 0; + uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores); + if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE41)) + return; + uint8_t* pDecCb = 
(uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDecCb"); + uint8_t* pEncCb = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEncCb"); + uint8_t* pDecCr = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDecCr"); + uint8_t* pEncCr = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEncCr"); + uint8_t* pDstChma = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDstChma"); + srand((uint32_t)time(NULL)); + for(int i=0;i<(iLineSizeDec<<5);i++){ + pDecCb[i]=rand()%256; + pDecCr[i]=rand()%256; + } + for(int i=0;i<(iLineSizeEnc<<5);i++){ + pEncCb[i]=rand()%256; + pEncCr[i]=rand()%256; + } + for(int i=0;i<512;i++) + pDstChma[i]=rand()%256; + tmpa = WelsSampleSatdIntra8x8Combined3_c(pDecCb+128, iLineSizeDec, pEncCb,iLineSizeEnc,&iBestMode_c, iLambda, pDstChma, pDecCr+128, pEncCr); + tmpb = WelsIntraChroma8x8Combined3Satd_sse41(pDecCb+128, iLineSizeDec, pEncCb,iLineSizeEnc,&iBestMode_a, iLambda, pDstChma, pDecCr+128, pEncCr); + ASSERT_EQ(tmpa, tmpb); + ASSERT_EQ(iBestMode_c, iBestMode_a); + cMemoryAlign.WelsFree(pDecCb,"pDecCb"); + cMemoryAlign.WelsFree(pEncCb,"pEncCb"); + cMemoryAlign.WelsFree(pDecCr,"pDecCr"); + cMemoryAlign.WelsFree(pEncCr,"pEncCr"); + cMemoryAlign.WelsFree(pDstChma,"pDstChma"); +} +#endif #define ASSERT_MEMORY_FAIL2X(A, B) \ if (NULL == B) { \ pMemAlign->WelsFree(A, "Sad_SrcA");\ From fe5b8d1a696437f2b29eafd44789e09a43ccff88 Mon Sep 17 00:00:00 2001 From: Licai Guo Date: Sun, 4 May 2014 14:51:05 +0800 Subject: [PATCH 2/2] refine format --- codec/common/x86/satd_sad.asm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codec/common/x86/satd_sad.asm b/codec/common/x86/satd_sad.asm index 5ef5a8fe11..d2f178bad4 100644 --- a/codec/common/x86/satd_sad.asm +++ b/codec/common/x86/satd_sad.asm @@ -418,7 +418,7 @@ WELS_EXTERN WelsSampleSatdThree4x4_sse2 SSE2_XSawp qdq, xmm0, xmm7, xmm1 - ; Hadamard transform results are saved in xmm0 and xmm2 + ; Hadamard transform results are saved in xmm0 and xmm2 movdqa xmm2, xmm0 paddw xmm0, xmm1 
psubw xmm2, xmm1 @@ -611,7 +611,7 @@ not_dc: %endif ret not_dc_h: - sub r0, r1 + sub r0, r1 sub r0, r1 sub r0, r1 movd xmm0, [r0]