Permalink
Browse files

Implement an sse version of scalarproduct_float().

git-svn-id: svn://svn.ffmpeg.org/ffmpeg/trunk/libavcodec@21386 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
  • Loading branch information...
1 parent 967a846 commit d41a6f111fe73bdeb00acdd68fa68f7e0587d9dd alexc committed Jan 22, 2010
Showing with 29 additions and 0 deletions.
  1. +5 −0 x86/dsputil_mmx.c
  2. +24 −0 x86/dsputil_yasm.asm
View
@@ -2510,6 +2510,8 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -2965,6 +2967,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->vector_clipf = vector_clipf_sse;
c->float_to_int16 = float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
+#if HAVE_YASM
+ c->scalarproduct_float = ff_scalarproduct_float_sse;
+#endif
}
if(mm_flags & FF_MM_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
View
@@ -397,3 +397,27 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
.unaligned:
ADD_HFYU_LEFT_LOOP 0
+
+; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
+cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+ neg offsetq
+ shl offsetq, 2
+ sub v1q, offsetq
+ sub v2q, offsetq
+ xorps xmm0, xmm0
+ .loop:
+ movaps xmm1, [v1q+offsetq]
+ mulps xmm1, [v2q+offsetq]
+ addps xmm0, xmm1
+ add offsetq, 16
+ js .loop
+ movhlps xmm1, xmm0
+ addps xmm0, xmm1
+ movss xmm1, xmm0
+ shufps xmm0, xmm0, 1
+ addss xmm0, xmm1
+%ifndef ARCH_X86_64
+ movd r0m, xmm0
+ fld dword r0m
+%endif
+ RET

0 comments on commit d41a6f1

Please sign in to comment.