Skip to content

Commit

Permalink
Add more optimised SAD functions
Browse files Browse the repository at this point in the history
Inspired by the other SAD functions from x264.
  • Loading branch information
cantabile committed Oct 7, 2014
1 parent 772ffbe commit 868dc44
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 7 deletions.
3 changes: 3 additions & 0 deletions Makefile.am
Expand Up @@ -12,6 +12,7 @@ BUILT_SOURCES = src/asm/const-a.o \
src/asm/Interpolation.o \
src/asm/MVDegrains.o \
src/asm/Overlap.o \
src/asm/SAD.o \
src/asm/Variance.o
CLEANFILES = $(BUILT_SOURCES)

Expand Down Expand Up @@ -59,6 +60,7 @@ libmvtools_la_SOURCES = src/asm-placeholder.cpp \
src/asm/Interpolation.asm \
src/asm/MVDegrains.asm \
src/asm/Overlap.asm \
src/asm/SAD.asm \
src/asm/Variance.asm

libmvtools_la_LDFLAGS = -no-undefined -avoid-version \
Expand All @@ -70,6 +72,7 @@ libmvtools_la_LDFLAGS = -no-undefined -avoid-version \
-Xlinker src/asm/Interpolation.o \
-Xlinker src/asm/MVDegrains.o \
-Xlinker src/asm/Overlap.o \
-Xlinker src/asm/SAD.o \
-Xlinker src/asm/Variance.o

libmvtools_la_DEPENDENCIES = $(BUILT_SOURCES)
14 changes: 7 additions & 7 deletions src/PlaneOfBlocks.cpp
Expand Up @@ -90,7 +90,7 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi

if (yRatioUV == 2) {
BLITCHROMA = mvtools_Copy4x2_sse2;
SADCHROMA = Sad_C<4, 2>;
SADCHROMA = mvtools_sad_4x2_sse2;
} else {
BLITCHROMA = mvtools_Copy4x4_sse2;
SADCHROMA = mvtools_pixel_sad_4x4_mmx2;
Expand All @@ -113,15 +113,15 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi
}
} else if (nBlkSizeX == 16) {
if (nBlkSizeY == 2) {
SAD = Sad_C<16, 2>;
SAD = mvtools_sad_16x2_sse2;
LUMA = mvtools_Luma16x2_sse2;
BLITLUMA = mvtools_Copy16x2_sse2;
if (yRatioUV == 2) {
BLITCHROMA = mvtools_Copy8x1_sse2;
SADCHROMA = Sad_C<8, 1>;
SADCHROMA = mvtools_sad_8x1_sse2;
} else {
BLITCHROMA = mvtools_Copy8x2_sse2;
SADCHROMA = Sad_C<8, 2>;
SADCHROMA = mvtools_sad_8x2_sse2;
}
} else if (nBlkSizeY == 8) {
SAD = mvtools_pixel_sad_16x8_sse2;
Expand Down Expand Up @@ -170,7 +170,7 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi
}
} else if (nBlkSizeX == 32) {
if (nBlkSizeY == 16) {
SAD = Sad_C<32, 16>;
SAD = mvtools_sad_32x16_sse2;
LUMA = mvtools_Luma32x16_sse2;
BLITLUMA = mvtools_Copy32x16_sse2;

Expand All @@ -192,7 +192,7 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi
SADCHROMA = mvtools_pixel_sad_16x16_cache64_ssse3;
}
} else if (nBlkSizeY == 32) {
SAD = Sad_C<32, 32>;
SAD = mvtools_sad_32x32_sse2;
LUMA = mvtools_Luma32x32_sse2;
BLITLUMA = mvtools_Copy32x32_sse2;

Expand All @@ -206,7 +206,7 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi
SADCHROMA = mvtools_pixel_sad_16x16_cache64_ssse3;
} else {
BLITCHROMA = mvtools_Copy16x32_sse2;
SADCHROMA = Sad_C<16, 32>;
SADCHROMA = mvtools_sad_16x32_sse2;
}
}
}
Expand Down
9 changes: 9 additions & 0 deletions src/SADFunctions.h
Expand Up @@ -94,6 +94,15 @@ inline unsigned int Sad2x4_C(const uint8_t *pSrc, const uint8_t *pRef,int nSrcPi
*/
#define MK_CFUNC(functionname) extern "C" unsigned int functionname (const uint8_t *pSrc, intptr_t nSrcPitch, const uint8_t *pRef, intptr_t nRefPitch)

// SSE2 SAD routines implemented in src/asm/SAD.asm.
// Each one has the MK_CFUNC signature (pSrc, nSrcPitch, pRef, nRefPitch) and
// returns the sum of absolute differences as unsigned int. The WxH part of
// the name is the block size the routine handles (width x height).
MK_CFUNC(mvtools_sad_4x2_sse2);
MK_CFUNC(mvtools_sad_8x1_sse2);
MK_CFUNC(mvtools_sad_8x2_sse2);
MK_CFUNC(mvtools_sad_16x2_sse2);
MK_CFUNC(mvtools_sad_16x32_sse2);
MK_CFUNC(mvtools_sad_32x16_sse2);
MK_CFUNC(mvtools_sad_32x32_sse2);

/* included from x264 */
#define SAD_x264(blsizex, blsizey) extern "C" unsigned int mvtools_pixel_sad_##blsizex##x##blsizey##_mmx2(const uint8_t *pSrc, intptr_t nSrcPitch, const uint8_t *pRef, intptr_t nRefPitch)
//mvtools_pixel_sad_16x16_mmx2( x,y can be: 16 8 4
Expand Down
190 changes: 190 additions & 0 deletions src/asm/SAD.asm
@@ -0,0 +1,190 @@
%include "include/x86inc.asm"


SECTION_TEXT


;-----------------------------------------------------------------------
; sad_4x2_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two blocks that are 4 bytes
; wide and 2 rows tall.
; In:    srcp1/srcp2     = top-left byte of each block
;        stride1/stride2 = byte distance between consecutive rows
; Out:   eax = SAD
; Clobb: m0-m3
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_4x2_sse2, 4, 4, 4, srcp1, stride1, srcp2, stride2
    ; Pack both rows of the first block into the low 8 bytes of m0.
    movd        m0, [srcp1q]
    movd        m2, [srcp1q + stride1q]
    punpckldq   m0, m2

    ; Pack both rows of the second block into the low 8 bytes of m1.
    movd        m1, [srcp2q]
    movd        m3, [srcp2q + stride2q]
    punpckldq   m1, m3

    ; The upper 8 bytes of both registers are zero, so the low qword sum
    ; is the whole result.
    psadbw      m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------
; sad_8x1_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two single rows of 8 bytes.
; In:    srcp1/srcp2 = first byte of each row
;        stride1/stride2 are accepted but not read (only one row).
; Out:   eax = SAD
; Clobb: m0, m1
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_8x1_sse2, 4, 4, 2, srcp1, stride1, srcp2, stride2
    movq        m1, [srcp2q]
    movq        m0, [srcp1q]
    psadbw      m0, m1                  ; low qword = sum over the 8 bytes
    movd        eax, m0
    RET


;-----------------------------------------------------------------------
; sad_8x2_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two blocks that are 8 bytes
; wide and 2 rows tall.
; In:    srcp1/srcp2     = top-left byte of each block
;        stride1/stride2 = byte distance between consecutive rows
; Out:   eax = SAD
; Clobb: m0, m1
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_8x2_sse2, 4, 4, 2, srcp1, stride1, srcp2, stride2
    ; Row 0 goes in the low qword, row 1 in the high qword.
    movq        m1, [srcp2q]
    movq        m0, [srcp1q]
    movhps      m1, [srcp2q + stride2q]
    movhps      m0, [srcp1q + stride1q]

    psadbw      m0, m1                  ; one partial sum per qword (= per row)

    ; Fold the high-qword sum onto the low one.
    movhlps     m1, m0
    paddw       m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------
; sad_16x2_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two blocks that are 16 bytes
; wide and 2 rows tall. All loads are unaligned (movdqu).
; In:    srcp1/srcp2     = top-left byte of each block
;        stride1/stride2 = byte distance between consecutive rows
; Out:   eax = SAD
; Clobb: m0-m3
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_16x2_sse2, 4, 4, 4, srcp1, stride1, srcp2, stride2
    ; Row 0
    movdqu      m0, [srcp1q]
    movdqu      m2, [srcp2q]
    psadbw      m0, m2

    ; Row 1
    movdqu      m1, [srcp1q + stride1q]
    movdqu      m3, [srcp2q + stride2q]
    psadbw      m1, m3

    paddw       m0, m1                  ; per-qword sums of both rows

    ; Fold the high-qword sum onto the low one.
    movhlps     m1, m0
    paddw       m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------
; SAD16x2
; Adds the SAD of two 16-byte-wide rows to the accumulator m0 (one dword
; partial sum per qword), then advances both pointers by two rows.
; Uses:  srcp1q, stride1q, srcp2q, stride2q
; Clobb: m1-m4; srcp1q and srcp2q are updated
;-----------------------------------------------------------------------
%macro SAD16x2 0
    ; Row 0
    movdqu      m1, [srcp1q]
    movdqu      m3, [srcp2q]
    psadbw      m1, m3
    paddd       m0, m1

    ; Row 1
    movdqu      m2, [srcp1q + stride1q]
    movdqu      m4, [srcp2q + stride2q]
    psadbw      m2, m4
    paddd       m0, m2

    ; Step both blocks down by two rows.
    lea         srcp1q, [srcp1q + stride1q * 2]
    lea         srcp2q, [srcp2q + stride2q * 2]
%endmacro


;-----------------------------------------------------------------------
; sad_16x32_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two blocks that are 16 bytes
; wide and 32 rows tall.
; In:    srcp1/srcp2     = top-left byte of each block
;        stride1/stride2 = byte distance between consecutive rows
; Out:   eax = SAD
; Clobb: m0-m4, srcp1q, srcp2q
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_16x32_sse2, 4, 4, 5, srcp1, stride1, srcp2, stride2
    pxor        m0, m0                  ; m0 = running per-qword sums

    ; 32 rows, two per SAD16x2 invocation; fully unrolled at assembly time.
%rep 16
    SAD16x2
%endrep

    ; Fold the high-qword sum onto the low one.
    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------
; SAD32x2
; Adds the SAD of two 32-byte-wide rows to the accumulator m0 (one dword
; partial sum per qword), then advances both pointers by two rows.
; Each row is handled as two 16-byte halves.
; Uses:  srcp1q, stride1q, srcp2q, stride2q
; Clobb: m1-m6; srcp1q and srcp2q are updated
;-----------------------------------------------------------------------
%macro SAD32x2 0
    ; Row 0, bytes 0-15
    movdqu      m1, [srcp1q]
    movdqu      m3, [srcp2q]
    psadbw      m1, m3
    ; Row 0, bytes 16-31
    movdqu      m2, [srcp1q + 16]
    movdqu      m4, [srcp2q + 16]
    psadbw      m2, m4

    paddw       m1, m2                  ; row 0 total per qword
    paddd       m0, m1

    ; Row 1, bytes 0-15
    movdqu      m3, [srcp1q + stride1q]
    movdqu      m5, [srcp2q + stride2q]
    psadbw      m3, m5
    ; Row 1, bytes 16-31
    movdqu      m4, [srcp1q + stride1q + 16]
    movdqu      m6, [srcp2q + stride2q + 16]
    psadbw      m4, m6

    paddw       m3, m4                  ; row 1 total per qword
    paddd       m0, m3

    ; Step both blocks down by two rows.
    lea         srcp1q, [srcp1q + stride1q * 2]
    lea         srcp2q, [srcp2q + stride2q * 2]
%endmacro


;-----------------------------------------------------------------------
; sad_32x16_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two blocks that are 32 bytes
; wide and 16 rows tall.
; In:    srcp1/srcp2     = top-left byte of each block
;        stride1/stride2 = byte distance between consecutive rows
; Out:   eax = SAD
; Clobb: m0-m6, srcp1q, srcp2q
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_32x16_sse2, 4, 4, 7, srcp1, stride1, srcp2, stride2
    pxor        m0, m0                  ; m0 = running per-qword sums

    ; 16 rows, two per SAD32x2 invocation; fully unrolled at assembly time.
%rep 8
    SAD32x2
%endrep

    ; Fold the high-qword sum onto the low one.
    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------
; sad_32x32_sse2(srcp1, stride1, srcp2, stride2)
; Sum of absolute byte differences between two blocks that are 32 bytes
; wide and 32 rows tall.
; In:    srcp1/srcp2     = top-left byte of each block
;        stride1/stride2 = byte distance between consecutive rows
; Out:   eax = SAD
; Clobb: m0-m6, srcp1q, srcp2q
;-----------------------------------------------------------------------
INIT_XMM
cglobal sad_32x32_sse2, 4, 4, 7, srcp1, stride1, srcp2, stride2
    pxor        m0, m0                  ; m0 = running per-qword sums

    ; 32 rows, two per SAD32x2 invocation; fully unrolled at assembly time.
%rep 16
    SAD32x2
%endrep

    ; Fold the high-qword sum onto the low one.
    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET

0 comments on commit 868dc44

Please sign in to comment.