From ec0ed97b046d46421db72c4911d2bbe28bbe5741 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 22 Mar 2012 17:25:22 -0700 Subject: [PATCH 01/12] utvideo: port header reading to bytestream2. Fixes crash during slice size reading if slice_end goes negative. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind CC: libav-stable@libav.org --- libavcodec/utvideo.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/libavcodec/utvideo.c b/libavcodec/utvideo.c index ebde38de17..95cea5f423 100644 --- a/libavcodec/utvideo.c +++ b/libavcodec/utvideo.c @@ -358,13 +358,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac { const uint8_t *buf = avpkt->data; int buf_size = avpkt->size; - const uint8_t *buf_end = buf + buf_size; UtvideoContext *c = avctx->priv_data; - const uint8_t *ptr; int i, j; const uint8_t *plane_start[5]; int plane_size, max_slice_size = 0, slice_start, slice_end, slice_size; int ret; + GetByteContext gb; if (c->pic.data[0]) ff_thread_release_buffer(avctx, &c->pic); @@ -379,20 +378,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac ff_thread_finish_setup(avctx); /* parse plane structure to retrieve frame flags and validate slice offsets */ - ptr = buf; + bytestream2_init(&gb, buf, buf_size); for (i = 0; i < c->planes; i++) { - plane_start[i] = ptr; - if (buf_end - ptr < 256 + 4 * c->slices) { + plane_start[i] = gb.buffer; + if (bytestream2_get_bytes_left(&gb) < 256 + 4 * c->slices) { av_log(avctx, AV_LOG_ERROR, "Insufficient data for a plane\n"); return AVERROR_INVALIDDATA; } - ptr += 256; + bytestream2_skipu(&gb, 256); slice_start = 0; slice_end = 0; for (j = 0; j < c->slices; j++) { - slice_end = bytestream_get_le32(&ptr); + slice_end = bytestream2_get_le32u(&gb); slice_size = slice_end - slice_start; - if (slice_size < 0) { + if (slice_end <= 0 || slice_size <= 0 || + bytestream2_get_bytes_left(&gb) < slice_end) { av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n"); return AVERROR_INVALIDDATA; } @@ -400,18 +400,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac max_slice_size = FFMAX(max_slice_size, slice_size); } plane_size = slice_end; - if (buf_end - ptr < plane_size) { - av_log(avctx, AV_LOG_ERROR, "Plane size is bigger than available data\n"); - return AVERROR_INVALIDDATA; - } - ptr += plane_size; + bytestream2_skipu(&gb, plane_size); } - plane_start[c->planes] = ptr; - if (buf_end - ptr < c->frame_info_size) { + plane_start[c->planes] = gb.buffer; + if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) { av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n"); return AVERROR_INVALIDDATA; } - c->frame_info = AV_RL32(ptr); + c->frame_info = bytestream2_get_le32u(&gb); av_log(avctx, AV_LOG_DEBUG, "frame information flags %X\n", c->frame_info); c->frame_pred = (c->frame_info >> 8) & 3; From 6ef4063957aa5025c8d2cd757b6a537e4b6874df Mon Sep 17 00:00:00 2001 From: Alexander Strange Date: Sat, 24 Mar 2012 17:32:14 -0400 Subject: [PATCH 02/12] h264: Add check for invalid chroma_format_idc Fixes a crash when FF_DEBUG_PICT_INFO is used. Signed-off-by: Ronald S. 
Bultje --- libavcodec/h264_ps.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index 276eb77d1d..05f04afa57 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -332,8 +332,12 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ if(sps->profile_idc >= 100){ //high profile sps->chroma_format_idc= get_ue_golomb_31(&s->gb); - if(sps->chroma_format_idc == 3) + if(sps->chroma_format_idc > 3) { + av_log(h->s.avctx, AV_LOG_ERROR, "chroma_format_idc (%u) out of range\n", sps->chroma_format_idc); + return -1; + } else if(sps->chroma_format_idc == 3) { sps->residual_color_transform_flag = get_bits1(&s->gb); + } sps->bit_depth_luma = get_ue_golomb(&s->gb) + 8; sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8; sps->transform_bypass = get_bits1(&s->gb); From 147ee4cf065a20bbba10292b496a037e6573cd6e Mon Sep 17 00:00:00 2001 From: Alexander Strange Date: Sat, 24 Mar 2012 18:25:15 -0400 Subject: [PATCH 03/12] pthread: Immediately release all frames in ff_thread_flush() Before this, they were only added to the delayed release queue and not freed until later. This could lead to unnecessary memory use or buffer exhaustion. Signed-off-by: Ronald S. Bultje --- libavcodec/pthread.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c index 2a11195e78..ee7bdb5310 100644 --- a/libavcodec/pthread.c +++ b/libavcodec/pthread.c @@ -880,9 +880,13 @@ void ff_thread_flush(AVCodecContext *avctx) fctx->next_decoding = fctx->next_finished = 0; fctx->delaying = 1; fctx->prev_thread = NULL; - // Make sure decode flush calls with size=0 won't return old frames - for (int i = 0; i < avctx->thread_count; i++) - fctx->threads[i].got_frame = 0; + for (int i = 0; i < avctx->thread_count; i++) { + PerThreadContext *p = &fctx->threads[i]; + // Make sure decode flush calls with size=0 won't return old frames + p->got_frame = 0; + + release_delayed_buffers(p); + } } static int *allocate_progress(PerThreadContext *p) From cb3486778044e580ef38a8d861af4a2e42336866 Mon Sep 17 00:00:00 2001 From: Alexander Strange Date: Sat, 24 Mar 2012 15:34:22 -0700 Subject: [PATCH 04/12] h264: fix memleak in error path. Signed-off-by: Ronald S. Bultje --- libavcodec/h264_ps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index 05f04afa57..354469c9ad 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -334,7 +334,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ sps->chroma_format_idc= get_ue_golomb_31(&s->gb); if(sps->chroma_format_idc > 3) { av_log(h->s.avctx, AV_LOG_ERROR, "chroma_format_idc (%u) out of range\n", sps->chroma_format_idc); - return -1; + goto fail; } else if(sps->chroma_format_idc == 3) { sps->residual_color_transform_flag = get_bits1(&s->gb); } From 68893afe1d4583038d3788b6c3e462e42ce6074d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 22 Mar 2012 13:31:31 +0200 Subject: [PATCH 05/12] movenc: Merge if statements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn't exactly equivalent to the earlier code for codecs other than H264 and VC1, but those are the only two codecs supported by this codepath anyway, and it simplifies it a bit.
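Schematically, the control-flow change looks as follows (a simplified sketch, not the literal movenc.c code: track->enc-> dereferences are shortened to bare names and the H264 extradata filtering is elided):

    /* before: raw CodecPrivateData was written for every codec that did
     * not hit the H264+extradata case, and FourCC was chosen separately */
    if (codec_id == CODEC_ID_H264 && extradata_size) {
        /* write filtered SPS/PPS as CodecPrivateData */
    } else {
        param_write_hex(pb, "CodecPrivateData", extradata, extradata_size);
    }
    if (codec_id == CODEC_ID_H264)
        param_write_string(pb, "FourCC", "H264");
    else if (codec_id == CODEC_ID_VC1)
        param_write_string(pb, "FourCC", "WVC1");

    /* after: one merged chain; raw CodecPrivateData is now written only
     * for VC1, which is the behavior change noted above */
    if (codec_id == CODEC_ID_H264 && extradata_size) {
        /* write filtered SPS/PPS as CodecPrivateData */
        param_write_string(pb, "FourCC", "H264");
    } else if (codec_id == CODEC_ID_VC1) {
        param_write_string(pb, "FourCC", "WVC1");
        param_write_hex(pb, "CodecPrivateData", extradata, extradata_size);
    }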
Signed-off-by: Martin Storsjö --- libavformat/movenc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libavformat/movenc.c b/libavformat/movenc.c index a832c1d102..1d808f128f 100644 --- a/libavformat/movenc.c +++ b/libavformat/movenc.c @@ -2147,14 +2147,11 @@ static int mov_write_isml_manifest(AVIOContext *pb, MOVMuxContext *mov) size); av_free(ptr); } - } else { - param_write_hex(pb, "CodecPrivateData", track->enc->extradata, - track->enc->extradata_size); - } - if (track->enc->codec_id == CODEC_ID_H264) { param_write_string(pb, "FourCC", "H264"); } else if (track->enc->codec_id == CODEC_ID_VC1) { param_write_string(pb, "FourCC", "WVC1"); + param_write_hex(pb, "CodecPrivateData", track->enc->extradata, + track->enc->extradata_size); } param_write_int(pb, "MaxWidth", track->enc->width); param_write_int(pb, "MaxHeight", track->enc->height); From f31a68e78c129a1ed20c73c3a6a774aff8275930 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 24 Mar 2012 17:05:05 -0700 Subject: [PATCH 06/12] interplayvideo: convert to bytestream2. --- libavcodec/interplayvideo.c | 323 ++++++++++++++---------------------- 1 file changed, 124 insertions(+), 199 deletions(-) diff --git a/libavcodec/interplayvideo.c b/libavcodec/interplayvideo.c index 4a250a8f19..256b6c65b1 100644 --- a/libavcodec/interplayvideo.c +++ b/libavcodec/interplayvideo.c @@ -56,14 +56,8 @@ typedef struct IpvideoContext { const unsigned char *decoding_map; int decoding_map_size; - const unsigned char *buf; - int size; - int is_16bpp; - const unsigned char *stream_ptr; - const unsigned char *stream_end; - const uint8_t *mv_ptr; - const uint8_t *mv_end; + GetByteContext stream_ptr, mv_ptr; unsigned char *pixel_ptr; int line_inc; int stride; @@ -72,13 +66,6 @@ typedef struct IpvideoContext { uint32_t pal[256]; } IpvideoContext; -#define CHECK_STREAM_PTR(stream_ptr, stream_end, n) \ - if (stream_end - stream_ptr < n) { \ - av_log(s->avctx, AV_LOG_ERROR, "Interplay video warning: stream_ptr out of bounds (%p >= %p)\n", \ - stream_ptr + n, stream_end); \ - return -1; \ - } - static int copy_from(IpvideoContext *s, AVFrame *src, int delta_x, int delta_y) { int current_offset = s->pixel_ptr - s->current_frame.data[0]; @@ -118,11 +105,9 @@ static int ipvideo_decode_block_opcode_0x2(IpvideoContext *s) /* copy block from 2 frames ago using a motion vector; need 1 more byte */ if (!s->is_16bpp) { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); - B = *s->stream_ptr++; + B = bytestream2_get_byte(&s->stream_ptr); } else { - CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1); - B = *s->mv_ptr++; + B = bytestream2_get_byte(&s->mv_ptr); } if (B < 56) { @@ -146,11 +131,9 @@ static int ipvideo_decode_block_opcode_0x3(IpvideoContext *s) /* need 1 more byte for motion */ if (!s->is_16bpp) { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); - B = *s->stream_ptr++; + B = bytestream2_get_byte(&s->stream_ptr); } else { - CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1); - B = *s->mv_ptr++; + B = bytestream2_get_byte(&s->mv_ptr); } if (B < 56) { @@ -172,11 +155,9 @@ static int ipvideo_decode_block_opcode_0x4(IpvideoContext *s) /* copy a block from the previous frame; need 1 more byte */ if (!s->is_16bpp) { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); - B = *s->stream_ptr++; + B = bytestream2_get_byte(&s->stream_ptr); } else { - CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1); - B = *s->mv_ptr++; + B = bytestream2_get_byte(&s->mv_ptr); } BL = B & 0x0F; @@ -194,10 +175,8 @@ static int ipvideo_decode_block_opcode_0x5(IpvideoContext *s) /* 
copy a block from the previous frame using an expanded range; * need 2 more bytes */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - - x = *s->stream_ptr++; - y = *s->stream_ptr++; + x = bytestream2_get_byte(&s->stream_ptr); + y = bytestream2_get_byte(&s->stream_ptr); av_dlog(NULL, " motion bytes = %d, %d\n", x, y); return copy_from(s, &s->last_frame, x, y); @@ -219,18 +198,14 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s) unsigned int flags; /* 2-color encoding */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - - P[0] = *s->stream_ptr++; - P[1] = *s->stream_ptr++; + P[0] = bytestream2_get_byte(&s->stream_ptr); + P[1] = bytestream2_get_byte(&s->stream_ptr); if (P[0] <= P[1]) { /* need 8 more bytes from the stream */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); - for (y = 0; y < 8; y++) { - flags = *s->stream_ptr++ | 0x100; + flags = bytestream2_get_byte(&s->stream_ptr) | 0x100; for (; flags != 1; flags >>= 1) *s->pixel_ptr++ = P[flags & 1]; s->pixel_ptr += s->line_inc; @@ -239,9 +214,7 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s) } else { /* need 2 more bytes from the stream */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - - flags = bytestream_get_le16(&s->stream_ptr); + flags = bytestream2_get_le16(&s->stream_ptr); for (y = 0; y < 8; y += 2) { for (x = 0; x < 8; x += 2, flags >>= 1) { s->pixel_ptr[x ] = @@ -260,26 +233,23 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s) static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) { int x, y; - unsigned char P[2]; + unsigned char P[4]; unsigned int flags = 0; /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on * either top and bottom or left and right halves */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - - P[0] = *s->stream_ptr++; - P[1] = *s->stream_ptr++; + P[0] = bytestream2_get_byte(&s->stream_ptr); + P[1] = bytestream2_get_byte(&s->stream_ptr); if (P[0] <= P[1]) { - - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 14); - s->stream_ptr -= 2; - for (y = 0; y < 16; y++) { // new values for each 4x4 block if (!(y & 3)) { - P[0] = *s->stream_ptr++; P[1] = *s->stream_ptr++; - flags = bytestream_get_le16(&s->stream_ptr); + if (y) { + P[0] = bytestream2_get_byte(&s->stream_ptr); + P[1] = bytestream2_get_byte(&s->stream_ptr); + } + flags = bytestream2_get_le16(&s->stream_ptr); } for (x = 0; x < 4; x++, flags >>= 1) @@ -290,13 +260,11 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) } } else { + flags = bytestream2_get_le32(&s->stream_ptr); + P[2] = bytestream2_get_byte(&s->stream_ptr); + P[3] = bytestream2_get_byte(&s->stream_ptr); - /* need 10 more bytes */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 10); - - if (s->stream_ptr[4] <= s->stream_ptr[5]) { - - flags = bytestream_get_le32(&s->stream_ptr); + if (P[2] <= P[3]) { /* vertical split; left & right halves are 2-color encoded */ @@ -307,8 +275,9 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) // switch to right half if (y == 7) { s->pixel_ptr -= 8 * s->stride - 4; - P[0] = *s->stream_ptr++; P[1] = *s->stream_ptr++; - flags = bytestream_get_le32(&s->stream_ptr); + P[0] = P[2]; + P[1] = P[3]; + flags = bytestream2_get_le32(&s->stream_ptr); } } @@ -318,12 +287,12 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) for (y = 0; y < 8; y++) { if (y == 4) { - P[0] = *s->stream_ptr++; - P[1] = *s->stream_ptr++; + P[0] = P[2]; + P[1] = P[3]; + flags = bytestream2_get_le32(&s->stream_ptr); } - flags = *s->stream_ptr++ | 0x100; - for (; 
flags != 1; flags >>= 1) + for (x = 0; x < 8; x++, flags >>= 1) *s->pixel_ptr++ = P[flags & 1]; s->pixel_ptr += s->line_inc; } @@ -340,20 +309,15 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) unsigned char P[4]; /* 4-color encoding */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); - - memcpy(P, s->stream_ptr, 4); - s->stream_ptr += 4; + bytestream2_get_buffer(&s->stream_ptr, P, 4); if (P[0] <= P[1]) { if (P[2] <= P[3]) { /* 1 of 4 colors for each pixel, need 16 more bytes */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16); - for (y = 0; y < 8; y++) { /* get the next set of 8 2-bit flags */ - int flags = bytestream_get_le16(&s->stream_ptr); + int flags = bytestream2_get_le16(&s->stream_ptr); for (x = 0; x < 8; x++, flags >>= 2) *s->pixel_ptr++ = P[flags & 0x03]; s->pixel_ptr += s->line_inc; @@ -363,9 +327,7 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) uint32_t flags; /* 1 of 4 colors for each 2x2 block, need 4 more bytes */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); - - flags = bytestream_get_le32(&s->stream_ptr); + flags = bytestream2_get_le32(&s->stream_ptr); for (y = 0; y < 8; y += 2) { for (x = 0; x < 8; x += 2, flags >>= 2) { @@ -382,9 +344,7 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) uint64_t flags; /* 1 of 4 colors for each 2x1 or 1x2 block, need 8 more bytes */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); - - flags = bytestream_get_le64(&s->stream_ptr); + flags = bytestream2_get_le64(&s->stream_ptr); if (P[2] <= P[3]) { for (y = 0; y < 8; y++) { for (x = 0; x < 8; x += 2, flags >>= 2) { @@ -411,24 +371,21 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s) { int x, y; - unsigned char P[4]; + unsigned char P[8]; int flags = 0; + bytestream2_get_buffer(&s->stream_ptr, P, 4); + /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on * either top and bottom or left and right halves */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24); - - if (s->stream_ptr[0] <= s->stream_ptr[1]) { + if (P[0] <= P[1]) { /* 4-color encoding for each quadrant; need 32 bytes */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 32); - for (y = 0; y < 16; y++) { // new values for each 4x4 block if (!(y & 3)) { - memcpy(P, s->stream_ptr, 4); - s->stream_ptr += 4; - flags = bytestream_get_le32(&s->stream_ptr); + if (y) bytestream2_get_buffer(&s->stream_ptr, P, 4); + flags = bytestream2_get_le32(&s->stream_ptr); } for (x = 0; x < 4; x++, flags >>= 2) @@ -441,20 +398,16 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s) } else { // vertical split? 
- int vert = s->stream_ptr[12] <= s->stream_ptr[13]; - uint64_t flags = 0; + int vert; + uint64_t flags = bytestream2_get_le64(&s->stream_ptr); + + bytestream2_get_buffer(&s->stream_ptr, P + 4, 4); + vert = P[4] <= P[5]; /* 4-color encoding for either left and right or top and bottom * halves */ for (y = 0; y < 16; y++) { - // load values for each half - if (!(y & 7)) { - memcpy(P, s->stream_ptr, 4); - s->stream_ptr += 4; - flags = bytestream_get_le64(&s->stream_ptr); - } - for (x = 0; x < 4; x++, flags >>= 2) *s->pixel_ptr++ = P[flags & 0x03]; @@ -463,6 +416,12 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s) // switch to right half if (y == 7) s->pixel_ptr -= 8 * s->stride - 4; } else if (y & 1) s->pixel_ptr += s->line_inc; + + // load values for second half + if (y == 7) { + memcpy(P, P + 4, 4); + flags = bytestream2_get_le64(&s->stream_ptr); + } } } @@ -475,11 +434,8 @@ static int ipvideo_decode_block_opcode_0xB(IpvideoContext *s) int y; /* 64-color encoding (each pixel in block is a different color) */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 64); - for (y = 0; y < 8; y++) { - memcpy(s->pixel_ptr, s->stream_ptr, 8); - s->stream_ptr += 8; + bytestream2_get_buffer(&s->stream_ptr, s->pixel_ptr, 8); s->pixel_ptr += s->stride; } @@ -492,14 +448,12 @@ static int ipvideo_decode_block_opcode_0xC(IpvideoContext *s) int x, y; /* 16-color block encoding: each 2x2 block is a different color */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16); - for (y = 0; y < 8; y += 2) { for (x = 0; x < 8; x += 2) { s->pixel_ptr[x ] = s->pixel_ptr[x + 1 ] = s->pixel_ptr[x + s->stride] = - s->pixel_ptr[x + 1 + s->stride] = *s->stream_ptr++; + s->pixel_ptr[x + 1 + s->stride] = bytestream2_get_byte(&s->stream_ptr); } s->pixel_ptr += s->stride * 2; } @@ -514,12 +468,10 @@ static int ipvideo_decode_block_opcode_0xD(IpvideoContext *s) unsigned char P[2]; /* 4-color block encoding: each 4x4 block is a different color */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); - for (y = 0; y < 8; y++) { if (!(y & 3)) { - P[0] = *s->stream_ptr++; - P[1] = *s->stream_ptr++; + P[0] = bytestream2_get_byte(&s->stream_ptr); + P[1] = bytestream2_get_byte(&s->stream_ptr); } memset(s->pixel_ptr, P[0], 4); memset(s->pixel_ptr + 4, P[1], 4); @@ -536,8 +488,7 @@ static int ipvideo_decode_block_opcode_0xE(IpvideoContext *s) unsigned char pix; /* 1-color encoding: the whole block is 1 solid color */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); - pix = *s->stream_ptr++; + pix = bytestream2_get_byte(&s->stream_ptr); for (y = 0; y < 8; y++) { memset(s->pixel_ptr, pix, 8); @@ -554,9 +505,8 @@ static int ipvideo_decode_block_opcode_0xF(IpvideoContext *s) unsigned char sample[2]; /* dithered encoding */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - sample[0] = *s->stream_ptr++; - sample[1] = *s->stream_ptr++; + sample[0] = bytestream2_get_byte(&s->stream_ptr); + sample[1] = bytestream2_get_byte(&s->stream_ptr); for (y = 0; y < 8; y++) { for (x = 0; x < 8; x += 2) { @@ -575,10 +525,8 @@ static int ipvideo_decode_block_opcode_0x6_16(IpvideoContext *s) signed char x, y; /* copy a block from the second last frame using an expanded range */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - - x = *s->stream_ptr++; - y = *s->stream_ptr++; + x = bytestream2_get_byte(&s->stream_ptr); + y = bytestream2_get_byte(&s->stream_ptr); av_dlog(NULL, " motion bytes = %d, %d\n", x, y); return copy_from(s, &s->second_last_frame, x, y); @@ -592,17 +540,13 @@ static int 
ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s) uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; /* 2-color encoding */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); - - P[0] = bytestream_get_le16(&s->stream_ptr); - P[1] = bytestream_get_le16(&s->stream_ptr); + P[0] = bytestream2_get_le16(&s->stream_ptr); + P[1] = bytestream2_get_le16(&s->stream_ptr); if (!(P[0] & 0x8000)) { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); - for (y = 0; y < 8; y++) { - flags = *s->stream_ptr++ | 0x100; + flags = bytestream2_get_byte(&s->stream_ptr) | 0x100; for (; flags != 1; flags >>= 1) *pixel_ptr++ = P[flags & 1]; pixel_ptr += s->line_inc; @@ -610,9 +554,7 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s) } else { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - - flags = bytestream_get_le16(&s->stream_ptr); + flags = bytestream2_get_le16(&s->stream_ptr); for (y = 0; y < 8; y += 2) { for (x = 0; x < 8; x += 2, flags >>= 1) { pixel_ptr[x ] = @@ -630,28 +572,25 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s) static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) { int x, y; - uint16_t P[2]; + uint16_t P[4]; unsigned int flags = 0; uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on * either top and bottom or left and right halves */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); - - P[0] = bytestream_get_le16(&s->stream_ptr); - P[1] = bytestream_get_le16(&s->stream_ptr); + P[0] = bytestream2_get_le16(&s->stream_ptr); + P[1] = bytestream2_get_le16(&s->stream_ptr); if (!(P[0] & 0x8000)) { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24); - s->stream_ptr -= 4; - for (y = 0; y < 16; y++) { // new values for each 4x4 block if (!(y & 3)) { - P[0] = bytestream_get_le16(&s->stream_ptr); - P[1] = bytestream_get_le16(&s->stream_ptr); - flags = bytestream_get_le16(&s->stream_ptr); + if (y) { + P[0] = bytestream2_get_le16(&s->stream_ptr); + P[1] = bytestream2_get_le16(&s->stream_ptr); + } + flags = bytestream2_get_le16(&s->stream_ptr); } for (x = 0; x < 4; x++, flags >>= 1) @@ -663,11 +602,11 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) } else { - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 12); - - if (!(AV_RL16(s->stream_ptr + 4) & 0x8000)) { + flags = bytestream2_get_le32(&s->stream_ptr); + P[2] = bytestream2_get_le16(&s->stream_ptr); + P[3] = bytestream2_get_le16(&s->stream_ptr); - flags = bytestream_get_le32(&s->stream_ptr); + if (!(P[2] & 0x8000)) { /* vertical split; left & right halves are 2-color encoded */ @@ -678,9 +617,9 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) // switch to right half if (y == 7) { pixel_ptr -= 8 * s->stride - 4; - P[0] = bytestream_get_le16(&s->stream_ptr); - P[1] = bytestream_get_le16(&s->stream_ptr); - flags = bytestream_get_le32(&s->stream_ptr); + P[0] = P[2]; + P[1] = P[3]; + flags = bytestream2_get_le32(&s->stream_ptr); } } @@ -690,12 +629,12 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) for (y = 0; y < 8; y++) { if (y == 4) { - P[0] = bytestream_get_le16(&s->stream_ptr); - P[1] = bytestream_get_le16(&s->stream_ptr); + P[0] = P[2]; + P[1] = P[3]; + flags = bytestream2_get_le32(&s->stream_ptr); } - flags = *s->stream_ptr++ | 0x100; - for (; flags != 1; flags >>= 1) + for (x = 0; x < 8; x++, flags >>= 1) *pixel_ptr++ = P[flags & 1]; pixel_ptr += s->line_inc; } @@ -713,20 +652,16 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) uint16_t *pixel_ptr = 
(uint16_t*)s->pixel_ptr; /* 4-color encoding */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); - for (x = 0; x < 4; x++) - P[x] = bytestream_get_le16(&s->stream_ptr); + P[x] = bytestream2_get_le16(&s->stream_ptr); if (!(P[0] & 0x8000)) { if (!(P[2] & 0x8000)) { /* 1 of 4 colors for each pixel */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16); - for (y = 0; y < 8; y++) { /* get the next set of 8 2-bit flags */ - int flags = bytestream_get_le16(&s->stream_ptr); + int flags = bytestream2_get_le16(&s->stream_ptr); for (x = 0; x < 8; x++, flags >>= 2) *pixel_ptr++ = P[flags & 0x03]; pixel_ptr += s->line_inc; @@ -736,9 +671,7 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) uint32_t flags; /* 1 of 4 colors for each 2x2 block */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); - - flags = bytestream_get_le32(&s->stream_ptr); + flags = bytestream2_get_le32(&s->stream_ptr); for (y = 0; y < 8; y += 2) { for (x = 0; x < 8; x += 2, flags >>= 2) { @@ -755,9 +688,7 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) uint64_t flags; /* 1 of 4 colors for each 2x1 or 1x2 block */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); - - flags = bytestream_get_le64(&s->stream_ptr); + flags = bytestream2_get_le64(&s->stream_ptr); if (!(P[2] & 0x8000)) { for (y = 0; y < 8; y++) { for (x = 0; x < 8; x += 2, flags >>= 2) { @@ -784,25 +715,25 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s) { int x, y; - uint16_t P[4]; + uint16_t P[8]; int flags = 0; uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; + for (x = 0; x < 4; x++) + P[x] = bytestream2_get_le16(&s->stream_ptr); + /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on * either top and bottom or left and right halves */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24); - - if (!(AV_RL16(s->stream_ptr) & 0x8000)) { + if (!(P[0] & 0x8000)) { /* 4-color encoding for each quadrant */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 48); - for (y = 0; y < 16; y++) { // new values for each 4x4 block if (!(y & 3)) { - for (x = 0; x < 4; x++) - P[x] = bytestream_get_le16(&s->stream_ptr); - flags = bytestream_get_le32(&s->stream_ptr); + if (y) + for (x = 0; x < 4; x++) + P[x] = bytestream2_get_le16(&s->stream_ptr); + flags = bytestream2_get_le32(&s->stream_ptr); } for (x = 0; x < 4; x++, flags >>= 2) @@ -815,20 +746,17 @@ static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s) } else { // vertical split? 
- int vert = !(AV_RL16(s->stream_ptr + 16) & 0x8000); - uint64_t flags = 0; + int vert; + uint64_t flags = bytestream2_get_le64(&s->stream_ptr); + + for (x = 4; x < 8; x++) + P[x] = bytestream2_get_le16(&s->stream_ptr); + vert = !(P[4] & 0x8000); /* 4-color encoding for either left and right or top and bottom * halves */ for (y = 0; y < 16; y++) { - // load values for each half - if (!(y & 7)) { - for (x = 0; x < 4; x++) - P[x] = bytestream_get_le16(&s->stream_ptr); - flags = bytestream_get_le64(&s->stream_ptr); - } - for (x = 0; x < 4; x++, flags >>= 2) *pixel_ptr++ = P[flags & 0x03]; @@ -837,6 +765,12 @@ static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s) // switch to right half if (y == 7) pixel_ptr -= 8 * s->stride - 4; } else if (y & 1) pixel_ptr += s->line_inc; + + // load values for second half + if (y == 7) { + memcpy(P, P + 4, 8); + flags = bytestream2_get_le64(&s->stream_ptr); + } } } @@ -850,11 +784,9 @@ static int ipvideo_decode_block_opcode_0xB_16(IpvideoContext *s) uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; /* 64-color encoding (each pixel in block is a different color) */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 128); - for (y = 0; y < 8; y++) { for (x = 0; x < 8; x++) - pixel_ptr[x] = bytestream_get_le16(&s->stream_ptr); + pixel_ptr[x] = bytestream2_get_le16(&s->stream_ptr); pixel_ptr += s->stride; } @@ -868,14 +800,12 @@ static int ipvideo_decode_block_opcode_0xC_16(IpvideoContext *s) uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; /* 16-color block encoding: each 2x2 block is a different color */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 32); - for (y = 0; y < 8; y += 2) { for (x = 0; x < 8; x += 2) { pixel_ptr[x ] = pixel_ptr[x + 1 ] = pixel_ptr[x + s->stride] = - pixel_ptr[x + 1 + s->stride] = bytestream_get_le16(&s->stream_ptr); + pixel_ptr[x + 1 + s->stride] = bytestream2_get_le16(&s->stream_ptr); } pixel_ptr += s->stride * 2; } @@ -891,12 +821,10 @@ static int ipvideo_decode_block_opcode_0xD_16(IpvideoContext *s) uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; /* 4-color block encoding: each 4x4 block is a different color */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); - for (y = 0; y < 8; y++) { if (!(y & 3)) { - P[0] = bytestream_get_le16(&s->stream_ptr); - P[1] = bytestream_get_le16(&s->stream_ptr); + P[0] = bytestream2_get_le16(&s->stream_ptr); + P[1] = bytestream2_get_le16(&s->stream_ptr); } for (x = 0; x < 8; x++) pixel_ptr[x] = P[x >> 2]; @@ -914,8 +842,7 @@ static int ipvideo_decode_block_opcode_0xE_16(IpvideoContext *s) uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; /* 1-color encoding: the whole block is 1 solid color */ - CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); - pix = bytestream_get_le16(&s->stream_ptr); + pix = bytestream2_get_le16(&s->stream_ptr); for (y = 0; y < 8; y++) { for (x = 0; x < 8; x++) @@ -960,19 +887,16 @@ static void ipvideo_decode_opcodes(IpvideoContext *s) av_dlog(NULL, "------------------ frame %d\n", frame); frame++; + bytestream2_skip(&s->stream_ptr, 14); /* data starts 14 bytes in */ if (!s->is_16bpp) { /* this is PAL8, so make the palette available */ memcpy(s->current_frame.data[1], s->pal, AVPALETTE_SIZE); s->stride = s->current_frame.linesize[0]; - s->stream_ptr = s->buf + 14; /* data starts 14 bytes in */ - s->stream_end = s->buf + s->size; } else { s->stride = s->current_frame.linesize[0] >> 1; - s->stream_ptr = s->buf + 16; - s->stream_end = - s->mv_ptr = s->buf + 14 + AV_RL16(s->buf+14); - s->mv_end = s->buf + s->size; + s->mv_ptr = s->stream_ptr; + bytestream2_skip(&s->mv_ptr, 
bytestream2_get_le16(&s->stream_ptr)); } s->line_inc = s->stride - 8; s->upper_motion_limit_offset = (s->avctx->height - 8) * s->current_frame.linesize[0] @@ -1002,9 +926,10 @@ static void ipvideo_decode_opcodes(IpvideoContext *s) } } } - if (s->stream_end - s->stream_ptr > 1) { - av_log(s->avctx, AV_LOG_ERROR, " Interplay video: decode finished with %td bytes left over\n", - s->stream_end - s->stream_ptr); + if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) { + av_log(s->avctx, AV_LOG_ERROR, + "Interplay video: decode finished with %d bytes left over\n", + bytestream2_get_bytes_left(&s->stream_ptr)); } } @@ -1042,8 +967,8 @@ static int ipvideo_decode_frame(AVCodecContext *avctx, return buf_size; s->decoding_map = buf; - s->buf = buf + s->decoding_map_size; - s->size = buf_size - s->decoding_map_size; + bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size, + buf_size - s->decoding_map_size); s->current_frame.reference = 3; if (avctx->get_buffer(avctx, &s->current_frame)) { From 3a3f06b05eee7c3e9da781864a510eda1ff587b8 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 18 Mar 2012 19:33:04 -0700 Subject: [PATCH 07/12] dpcm: convert to bytestream2. --- libavcodec/dpcm.c | 58 +++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/libavcodec/dpcm.c b/libavcodec/dpcm.c index 7f5dbfe3b9..2bd7978317 100644 --- a/libavcodec/dpcm.c +++ b/libavcodec/dpcm.c @@ -40,6 +40,7 @@ #include "libavutil/intreadwrite.h" #include "avcodec.h" #include "bytestream.h" +#include "mathops.h" typedef struct DPCMContext { AVFrame frame; @@ -173,20 +174,18 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx) static int dpcm_decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr, AVPacket *avpkt) { - const uint8_t *buf = avpkt->data; int buf_size = avpkt->size; - const uint8_t *buf_end = buf + buf_size; DPCMContext *s = avctx->priv_data; int out = 0, ret; int predictor[2]; int ch = 0; int stereo = s->channels - 1; - int16_t *output_samples; + int16_t *output_samples, *samples_end; + GetByteContext gb; - if (stereo && (buf_size & 1)) { + if (stereo && (buf_size & 1)) buf_size--; - buf_end--; - } + bytestream2_init(&gb, avpkt->data, buf_size); /* calculate output size */ switch(avctx->codec->id) { @@ -218,22 +217,23 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, return ret; } output_samples = (int16_t *)s->frame.data[0]; + samples_end = output_samples + out; switch(avctx->codec->id) { case CODEC_ID_ROQ_DPCM: - buf += 6; + bytestream2_skipu(&gb, 6); if (stereo) { - predictor[1] = (int16_t)(bytestream_get_byte(&buf) << 8); - predictor[0] = (int16_t)(bytestream_get_byte(&buf) << 8); + predictor[1] = sign_extend(bytestream2_get_byteu(&gb) << 8, 16); + predictor[0] = sign_extend(bytestream2_get_byteu(&gb) << 8, 16); } else { - predictor[0] = (int16_t)bytestream_get_le16(&buf); + predictor[0] = sign_extend(bytestream2_get_le16u(&gb), 16); } /* decode the samples */ - while (buf < buf_end) { - predictor[ch] += s->roq_square_array[*buf++]; + while (output_samples < samples_end) { + predictor[ch] += s->roq_square_array[bytestream2_get_byteu(&gb)]; predictor[ch] = av_clip_int16(predictor[ch]); *output_samples++ = predictor[ch]; @@ -243,16 +243,16 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, break; case CODEC_ID_INTERPLAY_DPCM: - buf += 6; /* skip over the stream mask and stream length */ + bytestream2_skipu(&gb, 6); /* skip over the stream mask and stream length */ for (ch = 0; ch < s->channels; 
ch++) { - predictor[ch] = (int16_t)bytestream_get_le16(&buf); + predictor[ch] = sign_extend(bytestream2_get_le16u(&gb), 16); *output_samples++ = predictor[ch]; } ch = 0; - while (buf < buf_end) { - predictor[ch] += interplay_delta_table[*buf++]; + while (output_samples < samples_end) { + predictor[ch] += interplay_delta_table[bytestream2_get_byteu(&gb)]; predictor[ch] = av_clip_int16(predictor[ch]); *output_samples++ = predictor[ch]; @@ -266,16 +266,19 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, int shift[2] = { 4, 4 }; for (ch = 0; ch < s->channels; ch++) - predictor[ch] = (int16_t)bytestream_get_le16(&buf); + predictor[ch] = sign_extend(bytestream2_get_le16u(&gb), 16); ch = 0; - while (buf < buf_end) { - uint8_t n = *buf++; - int16_t diff = (n & 0xFC) << 8; - if ((n & 0x03) == 3) + while (output_samples < samples_end) { + int diff = bytestream2_get_byteu(&gb); + int n = diff & 3; + + if (n == 3) shift[ch]++; else - shift[ch] -= (2 * (n & 3)); + shift[ch] -= (2 * n); + diff = sign_extend((diff &~ 3) << 8, 16); + /* saturate the shifter to a lower limit of 0 */ if (shift[ch] < 0) shift[ch] = 0; @@ -293,9 +296,10 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, } case CODEC_ID_SOL_DPCM: if (avctx->codec_tag != 3) { - uint8_t *output_samples_u8 = s->frame.data[0]; - while (buf < buf_end) { - uint8_t n = *buf++; + uint8_t *output_samples_u8 = s->frame.data[0], + *samples_end_u8 = output_samples_u8 + out; + while (output_samples_u8 < samples_end_u8) { + int n = bytestream2_get_byteu(&gb); s->sample[0] += s->sol_table[n >> 4]; s->sample[0] = av_clip_uint8(s->sample[0]); @@ -306,8 +310,8 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, *output_samples_u8++ = s->sample[stereo]; } } else { - while (buf < buf_end) { - uint8_t n = *buf++; + while (output_samples < samples_end) { + int n = bytestream2_get_byteu(&gb); if (n & 0x80) s->sample[ch] -= sol_table_16[n & 0x7F]; else s->sample[ch] += sol_table_16[n & 0x7F]; s->sample[ch] = av_clip_int16(s->sample[ch]); From 5d115c1da76a8ab935a74ff76e8189ca05a9f048 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Fri, 23 Mar 2012 12:20:54 +0100 Subject: [PATCH 08/12] Ignore generated files below doc/. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index e08648ba03..dfc1355d9e 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,9 @@ avconv avplay avprobe avserver +doc/avoptions_codec.texi +doc/avoptions_format.texi +doc/print_options libavcodec/*_tablegen libavcodec/*_tables.c libavcodec/*_tables.h From 3816642eabe11e78d81a9ef90e9d0ad53a6819d0 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Fri, 9 Mar 2012 00:59:06 +0100 Subject: [PATCH 09/12] dsputil_mmx: Surround QPEL macros by "do { } while (0);" blocks. This makes them safe to use in non-fully braced if-blocks and similar. 
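To illustrate the problem (a minimal sketch; the names ctx, have_mmx, put_c, avg_c and set_c_funcs are hypothetical, not from this patch):

    /* BAD: expands to two statements, so under an unbraced if only the
     * first assignment is guarded, and a following else fails to parse */
    #define SET_FUNCS_BAD(c)  \
        (c)->put = put_c;     \
        (c)->avg = avg_c

    /* GOOD: the do { } while (0) wrapper makes the whole expansion one
     * statement, completed by the semicolon at the call site */
    #define SET_FUNCS_GOOD(c) \
        do {                  \
            (c)->put = put_c; \
            (c)->avg = avg_c; \
        } while (0)

    if (have_mmx)
        SET_FUNCS_GOOD(ctx); /* SET_FUNCS_BAD(ctx) here would not compile */
    else
        set_c_funcs(ctx);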
--- libavcodec/x86/dsputil_mmx.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 3cc0f6b8cc..7d9bb99611 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2337,6 +2337,7 @@ extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0, const float *src1, int len); #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ + do { \ c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ @@ -2352,25 +2353,32 @@ extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0, c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU + c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ + } while (0) #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ + do { \ c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU + c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;\ + } while (0) #define H264_QPEL_FUNCS(x, y, CPU) \ + do { \ c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU; \ c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU; \ c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU; \ - c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU + c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; \ + } while (0) #define H264_QPEL_FUNCS_10(x, y, CPU) \ + do { \ c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU; \ c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU; \ c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU; \ - c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU; + c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU; \ + } while (0) static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) { From 915a2a0a656518ab50fe28754f9016772c835c8c Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sun, 18 Dec 2011 14:10:33 +0100 Subject: [PATCH 10/12] x86: conditionally compile H.264 QPEL optimizations --- configure | 15 ++++++++------- libavcodec/x86/Makefile | 2 +- libavcodec/x86/dsputil_mmx.c | 18 +++++++++++++----- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/configure b/configure index 8c227aae92..89a4fcea4a 100755 --- a/configure +++ b/configure @@ -1165,6 +1165,7 @@ CONFIG_EXTRA=" h264chroma h264dsp h264pred + h264qpel huffman lgplv3 lpc @@ -1311,7 +1312,7 @@ h263_encoder_select="aandct" h263_vaapi_hwaccel_select="vaapi h263_decoder" h263i_decoder_select="h263_decoder" h263p_encoder_select="h263_encoder" -h264_decoder_select="golomb h264chroma h264dsp h264pred" +h264_decoder_select="golomb h264chroma h264dsp h264pred h264qpel" h264_dxva2_hwaccel_deps="dxva2api_h" 
h264_dxva2_hwaccel_select="dxva2 h264_decoder" h264_vaapi_hwaccel_select="vaapi h264_decoder" @@ -1366,14 +1367,14 @@ rv10_decoder_select="h263_decoder" rv10_encoder_select="h263_encoder" rv20_decoder_select="h263_decoder" rv20_encoder_select="h263_encoder" -rv30_decoder_select="golomb h264chroma h264pred" -rv40_decoder_select="golomb h264chroma h264pred" +rv30_decoder_select="golomb h264chroma h264pred h264qpel" +rv40_decoder_select="golomb h264chroma h264pred h264qpel" shorten_decoder_select="golomb" sipr_decoder_select="lsp" snow_decoder_select="dwt" snow_encoder_select="aandct dwt" svq1_encoder_select="aandct" -svq3_decoder_select="golomb h264chroma h264dsp h264pred" +svq3_decoder_select="golomb h264chroma h264dsp h264pred h264qpel" svq3_decoder_suggest="zlib" theora_decoder_select="vp3_decoder" tiff_decoder_suggest="zlib" @@ -1381,7 +1382,7 @@ tiff_encoder_suggest="zlib" truehd_decoder_select="mlp_decoder" tscc_decoder_select="zlib" twinvq_decoder_select="mdct lsp sinewin" -vc1_decoder_select="h263_decoder h264chroma" +vc1_decoder_select="h263_decoder h264chroma h264qpel" vc1_dxva2_hwaccel_deps="dxva2api_h" vc1_dxva2_hwaccel_select="dxva2 vc1_decoder" vc1_vaapi_hwaccel_select="vaapi vc1_decoder" @@ -1392,7 +1393,7 @@ vorbis_encoder_select="mdct" vp6_decoder_select="huffman" vp6a_decoder_select="vp6_decoder" vp6f_decoder_select="vp6_decoder" -vp8_decoder_select="h264pred" +vp8_decoder_select="h264pred h264qpel" wmapro_decoder_select="mdct sinewin" wmav1_decoder_select="mdct sinewin" wmav1_encoder_select="mdct sinewin" @@ -1419,7 +1420,7 @@ vda_deps="VideoDecodeAcceleration_VDADecoder_h pthreads" vdpau_deps="vdpau_vdpau_h vdpau_vdpau_x11_h" # parsers -h264_parser_select="golomb h264chroma h264dsp h264pred" +h264_parser_select="golomb h264chroma h264dsp h264pred h264qpel" # external libraries libdirac_decoder_deps="libdirac !libschroedinger" diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index e64697aa2b..7944799f1c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -23,6 +23,7 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ x86/h264_intrapred_10bit.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o +YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_10bit.o MMX-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o @@ -62,7 +63,6 @@ MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ x86/deinterlace.o \ x86/fmtconvert.o \ - x86/h264_qpel_10bit.o \ $(YASM-OBJS-yes) MMX-OBJS-$(CONFIG_FFT) += x86/fft.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 7d9bb99611..665eec96d1 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2479,6 +2479,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; } + if (CONFIG_H264QPEL) { SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ); SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); @@ -2510,6 +2511,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, ); SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, ); SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, ); + } #if HAVE_YASM if (!high_bit_depth && CONFIG_H264CHROMA) { @@ -2577,6 +2579,7 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, c->put_no_rnd_pixels_tab[1][2] = 
put_no_rnd_pixels8_y2_exact_3dnow; } + if (CONFIG_H264QPEL) { SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, ); SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, ); @@ -2597,6 +2600,7 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, ); + } #if HAVE_YASM if (!high_bit_depth && CONFIG_H264CHROMA) { @@ -2671,11 +2675,12 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, c->put_pixels_tab[0][0] = put_pixels16_sse2; c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2; c->avg_pixels_tab[0][0] = avg_pixels16_sse2; - H264_QPEL_FUNCS(0, 0, sse2); + if (CONFIG_H264QPEL) + H264_QPEL_FUNCS(0, 0, sse2); } } - if (!high_bit_depth) { + if (!high_bit_depth && CONFIG_H264QPEL) { H264_QPEL_FUNCS(0, 1, sse2); H264_QPEL_FUNCS(0, 2, sse2); H264_QPEL_FUNCS(0, 3, sse2); @@ -2692,6 +2697,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, #if HAVE_YASM if (bit_depth == 10) { + if (CONFIG_H264QPEL) { SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); @@ -2699,7 +2705,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, H264_QPEL_FUNCS_10(1, 0, sse2_cache64); H264_QPEL_FUNCS_10(2, 0, sse2_cache64); H264_QPEL_FUNCS_10(3, 0, sse2_cache64); - + } if (CONFIG_H264CHROMA) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2; c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2; @@ -2729,7 +2735,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, const int high_bit_depth = avctx->bits_per_raw_sample > 8; const int bit_depth = avctx->bits_per_raw_sample; - if (!high_bit_depth) { + if (!high_bit_depth && CONFIG_H264QPEL) { H264_QPEL_FUNCS(1, 0, ssse3); H264_QPEL_FUNCS(1, 1, ssse3); H264_QPEL_FUNCS(1, 2, ssse3); @@ -2744,7 +2750,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, H264_QPEL_FUNCS(3, 3, ssse3); } #if HAVE_YASM - else if (bit_depth == 10) { + else if (bit_depth == 10 && CONFIG_H264QPEL) { H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); @@ -2788,9 +2794,11 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags) if (bit_depth == 10) { // AVX implies !cache64. // TODO: Port cache(32|64) detection from x264. + if (CONFIG_H264QPEL) { H264_QPEL_FUNCS_10(1, 0, sse2); H264_QPEL_FUNCS_10(2, 0, sse2); H264_QPEL_FUNCS_10(3, 0, sse2); + } if (CONFIG_H264CHROMA) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx; From 3b54912113f8b3a5d8c70368b2b759be773b4b3f Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 6 Mar 2012 13:00:42 +0100 Subject: [PATCH 11/12] x86: K&R prettyprinting cosmetics for dsputil_mmx.c --- libavcodec/x86/dsputil_mmx.c | 1822 +++++++++++++++++++--------------- 1 file changed, 1049 insertions(+), 773 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 665eec96d1..bb9ad7854e 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -3,6 +3,8 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer * + * MMX optimization by Nick Kurshev + * * This file is part of Libav. 
* * Libav is free software; you can redistribute it and/or @@ -18,8 +20,6 @@ * You should have received a copy of the GNU Lesser General Public * License along with Libav; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev */ #include "libavutil/cpu.h" @@ -40,46 +40,46 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = -{0x8000000080000000ULL, 0x8000000080000000ULL}; - -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1 ) = {0x0001000100010001ULL, 0x0001000100010001ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL}; - -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; 
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL}; + { 0x8000000080000000ULL, 0x8000000080000000ULL }; + +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; + +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL }; 
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; @@ -162,7 +162,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; /***********************************/ /* MMX no rounding */ -#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx +#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx #define SET_RND MOVQ_WONE #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) @@ -177,7 +177,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; /***********************************/ /* MMX rounding */ -#define DEF(x, y) x ## _ ## y ##_mmx +#define DEF(x, y) x ## _ ## y ## _mmx #define SET_RND MOVQ_WTWO #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) @@ -234,13 +234,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; /***********************************/ /* standard MMX */ -void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, + int line_size) { const DCTELEM *p; uint8_t *pix; /* read the pixels */ - p = block; + p = block; pix = pixels; /* unrolled loop */ __asm__ volatile( @@ -262,8 +263,8 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_s "movq %%mm6, (%0, %2) \n\t" ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) :"memory"); - pix += line_size*4; - p += 32; + pix += line_size * 4; + p += 32; // if here would be an exact copy of the code above // compiler would generate some very strange code @@ -307,7 +308,8 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_s "movq %%mm3, (%0, %3, 2) \n\t"\ "movq %%mm4, (%0, %1) \n\t" -void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, + int line_size) { x86_reg line_skip = line_size; x86_reg line_skip3; @@ -323,14 +325,15 @@ void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int :"memory"); } -void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, + int line_size) { const DCTELEM *p; uint8_t *pix; int i; /* read the pixels */ - p = block; + p = block; pix = pixels; MOVQ_ZERO(mm7); i = 4; @@ -359,12 +362,13 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_s :"+m"(*pix), "+m"(*(pix+line_size)) :"r"(p) :"memory"); - pix += line_size*2; - p += 16; + pix += line_size * 2; + p += 16; } while (--i); } -static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { __asm__ volatile( "lea (%3, %3), %%"REG_a" \n\t" @@ -390,7 +394,8 @@ static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size ); } -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { __asm__ volatile( "lea (%3, %3), %%"REG_a" \n\t" @@ -416,7 +421,8 @@ static void put_pixels8_mmx(uint8_t 
*block, const uint8_t *pixels, int line_size ); } -static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { __asm__ volatile( "lea (%3, %3), %%"REG_a" \n\t" @@ -450,7 +456,8 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz ); } -static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { __asm__ volatile( "1: \n\t" @@ -472,7 +479,8 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si ); } -static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { __asm__ volatile( "1: \n\t" @@ -537,7 +545,7 @@ static void clear_block_sse(DCTELEM *block) } static void clear_blocks_sse(DCTELEM *blocks) -{\ +{ __asm__ volatile( "xorps %%xmm0, %%xmm0 \n" "mov %1, %%"REG_a" \n" @@ -558,8 +566,9 @@ static void clear_blocks_sse(DCTELEM *blocks) ); } -static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ - x86_reg i=0; +static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) +{ + x86_reg i = 0; __asm__ volatile( "jmp 2f \n\t" "1: \n\t" @@ -578,15 +587,18 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ : "+r" (i) : "r"(src), "r"(dst), "r"((x86_reg)w-15) ); - for(; i= h){ - src_y_add = h-1-src_y; - src_y=h-1; - }else if(src_y<=-block_h){ - src_y_add = 1-block_h-src_y; - src_y=1-block_h; + int start_y, start_x, end_y, end_x, src_y_add = 0; + + if (src_y >= h) { + src_y_add = h - 1 - src_y; + src_y = h - 1; + } else if (src_y <= -block_h) { + src_y_add = 1 - block_h - src_y; + src_y = 1 - block_h; } - if(src_x>= w){ - src+= (w-1-src_x); - src_x=w-1; - }else if(src_x<=-block_w){ - src+= (1-block_w-src_x); - src_x=1-block_w; + if (src_x >= w) { + src += w - 1 - src_x; + src_x = w - 1; + } else if (src_x <= -block_w) { + src += 1 - block_w - src_x; + src_x = 1 - block_w; } - start_y= FFMAX(0, -src_y); - start_x= FFMAX(0, -src_x); - end_y= FFMIN(block_h, h-src_y); - end_x= FFMIN(block_w, w-src_x); + start_y = FFMAX(0, -src_y); + start_x = FFMAX(0, -src_x); + end_y = FFMIN(block_h, h-src_y); + end_x = FFMIN(block_w, w-src_x); assert(start_x < end_x && block_w > 0); assert(start_y < end_y && block_h > 0); // fill in the to-be-copied part plus all above/below - src += (src_y_add+start_y)*linesize + start_x; + src += (src_y_add + start_y) * linesize + start_x; buf += start_x; - core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w); + core_fn(buf, src, linesize, start_y, end_y, + block_h, start_x, end_x, block_w); } #if ARCH_X86_32 -static av_noinline -void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize, - int block_w, int block_h, - int src_x, int src_y, int w, int h) +static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, + int linesize, + int block_w, int block_h, + int src_x, int src_y, int w, int h) { emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, w, h, &ff_emu_edge_core_mmx); } #endif -static av_noinline -void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize, - int block_w, int block_h, - int src_x, int src_y, int w, int h) + +static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, + int linesize, + int block_w, int 
block_h, + int src_x, int src_y, int w, int h) { emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, w, h, &ff_emu_edge_core_sse); } #endif /* HAVE_YASM */ -typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src, - int linesize, int block_w, int block_h, - int src_x, int src_y, int w, int h); +typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, + int linesize, int block_w, int block_h, + int src_x, int src_y, int w, int h); -static av_always_inline -void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height, - emulated_edge_mc_func *emu_edge_fn) +static av_always_inline void gmc(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height, + emulated_edge_mc_func *emu_edge_fn) { - const int w = 8; - const int ix = ox>>(16+shift); - const int iy = oy>>(16+shift); - const int oxs = ox>>4; - const int oys = oy>>4; - const int dxxs = dxx>>4; - const int dxys = dxy>>4; - const int dyxs = dyx>>4; - const int dyys = dyy>>4; - const uint16_t r4[4] = {r,r,r,r}; - const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; - const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; - const uint64_t shift2 = 2*shift; - uint8_t edge_buf[(h+1)*stride]; + const int w = 8; + const int ix = ox >> (16 + shift); + const int iy = oy >> (16 + shift); + const int oxs = ox >> 4; + const int oys = oy >> 4; + const int dxxs = dxx >> 4; + const int dxys = dxy >> 4; + const int dyxs = dyx >> 4; + const int dyys = dyy >> 4; + const uint16_t r4[4] = { r, r, r, r }; + const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; + const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; + const uint64_t shift2 = 2 * shift; + uint8_t edge_buf[(h + 1) * stride]; int x, y; - const int dxw = (dxx-(1<<(16+shift)))*(w-1); - const int dyh = (dyy-(1<<(16+shift)))*(h-1); - const int dxh = dxy*(h-1); - const int dyw = dyx*(w-1); - if( // non-constant fullpel offset (3% of blocks) - ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | - (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) + const int dxw = (dxx - (1 << (16 + shift))) * (w - 1); + const int dyh = (dyy - (1 << (16 + shift))) * (h - 1); + const int dxh = dxy * (h - 1); + const int dyw = dyx * (w - 1); + if ( // non-constant fullpel offset (3% of blocks) + ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) | + (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) // uses more than 16 bits of subpel mv (only at huge resolution) - || (dxx|dxy|dyx|dyy)&15 ) - { + || (dxx | dxy | dyx | dyy) & 15) { //FIXME could still use mmx for some of the rows - ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); + ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, + shift, r, width, height); return; } - src += ix + iy*stride; - if( (unsigned)ix >= width-w || - (unsigned)iy >= height-h ) - { - emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); + src += ix + iy * stride; + if ((unsigned)ix >= width - w || + (unsigned)iy >= height - h) { + emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height); src = edge_buf; } @@ -1692,17 +1902,17 @@ void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, :: "r"(1<PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ @@ -2356,28 
+2634,28 @@ extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0, c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ } while (0) -#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ - do { \ - c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ - c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;\ +#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ + do { \ + c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ + c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ } while (0) -#define H264_QPEL_FUNCS(x, y, CPU) \ - do { \ - c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU; \ - c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU; \ - c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU; \ - c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; \ +#define H264_QPEL_FUNCS(x, y, CPU) \ + do { \ + c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \ + c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \ + c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \ + c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \ } while (0) -#define H264_QPEL_FUNCS_10(x, y, CPU) \ - do { \ - c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU; \ - c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU; \ - c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU; \ - c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU; \ +#define H264_QPEL_FUNCS_10(x, y, CPU) \ + do { \ + c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ + c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ + c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ + c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ } while (0) static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) @@ -2393,18 +2671,18 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) c->clear_blocks = clear_blocks_mmx; c->draw_edges = draw_edges_mmx; - SET_HPEL_FUNCS(put, 0, 16, mmx); + SET_HPEL_FUNCS(put, 0, 16, mmx); SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); - SET_HPEL_FUNCS(avg, 0, 16, mmx); + SET_HPEL_FUNCS(avg, 0, 16, mmx); SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); - SET_HPEL_FUNCS(put, 1, 8, mmx); - SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); - SET_HPEL_FUNCS(avg, 1, 8, mmx); - SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); + SET_HPEL_FUNCS(put, 1, 8, mmx); + SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); + SET_HPEL_FUNCS(avg, 1, 8, mmx); + SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); } #if ARCH_X86_32 || !HAVE_YASM - c->gmc= gmc_mmx; + c->gmc = gmc_mmx; #endif #if ARCH_X86_32 && HAVE_YASM if (!high_bit_depth) @@ -2469,48 +2747,47 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2; } } - if 
(CONFIG_VP3_DECODER && HAVE_YASM) { + if (CONFIG_VP3_DECODER && HAVE_YASM) c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2; - } - if (CONFIG_VP3_DECODER - && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) { + if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 || + avctx->codec_id == CODEC_ID_THEORA)) { c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; } if (CONFIG_H264QPEL) { - SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, ); - SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, ); - if (!high_bit_depth) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, ); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, ); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, ); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, ); - } else if (bit_depth == 10) { + if (!high_bit_depth) { + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, ); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, ); + } else if (bit_depth == 10) { #if HAVE_YASM #if !ARCH_X86_64 - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); #endif - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); #endif - } + } - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, ); - SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, ); - SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, ); } #if HAVE_YASM @@ -2527,7 +2804,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; } - c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; + c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; @@ -2573,33 +2850,33 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, } } - if (CONFIG_VP3_DECODER - && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) { + if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 || + avctx->codec_id == 
CODEC_ID_THEORA)) { c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow; } if (CONFIG_H264QPEL) { - SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, ); - SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, ); - if (!high_bit_depth) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, ); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, ); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, ); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, ); - } + if (!high_bit_depth) { + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, ); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, ); + } - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); - SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); - SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, ); } #if HAVE_YASM @@ -2632,7 +2909,7 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags) const int high_bit_depth = avctx->bits_per_raw_sample > 8; if (!high_bit_depth) { - if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){ + if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) { /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ c->clear_block = clear_block_sse; c->clear_blocks = clear_blocks_sse; @@ -2698,13 +2975,13 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, #if HAVE_YASM if (bit_depth == 10) { if (CONFIG_H264QPEL) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); - H264_QPEL_FUNCS_10(1, 0, sse2_cache64); - H264_QPEL_FUNCS_10(2, 0, sse2_cache64); - H264_QPEL_FUNCS_10(3, 0, sse2_cache64); + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); + H264_QPEL_FUNCS_10(1, 0, sse2_cache64); + H264_QPEL_FUNCS_10(2, 0, sse2_cache64); + H264_QPEL_FUNCS_10(3, 0, sse2_cache64); } if (CONFIG_H264CHROMA) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2; @@ -2721,7 +2998,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, } if (avctx->flags & CODEC_FLAG_BITEXACT) { c->apply_window_int16 = ff_apply_window_int16_sse2_ba; - } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { + } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { c->apply_window_int16 = ff_apply_window_int16_sse2; } c->bswap_buf = ff_bswap32_buf_sse2; @@ -2765,14 
+3042,12 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; - if (mm_flags & AV_CPU_FLAG_ATOM) { + if (mm_flags & AV_CPU_FLAG_ATOM) c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; - } else { + else c->apply_window_int16 = ff_apply_window_int16_ssse3; - } - if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit + if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; - } c->bswap_buf = ff_bswap32_buf_ssse3; #endif #endif @@ -2795,9 +3070,9 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags) // AVX implies !cache64. // TODO: Port cache(32|64) detection from x264. if (CONFIG_H264QPEL) { - H264_QPEL_FUNCS_10(1, 0, sse2); - H264_QPEL_FUNCS_10(2, 0, sse2); - H264_QPEL_FUNCS_10(3, 0, sse2); + H264_QPEL_FUNCS_10(1, 0, sse2); + H264_QPEL_FUNCS_10(2, 0, sse2); + H264_QPEL_FUNCS_10(3, 0, sse2); } if (CONFIG_H264CHROMA) { @@ -2809,13 +3084,13 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags) #endif } -void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) +void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); if (avctx->dsp_mask) { if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) - mm_flags |= (avctx->dsp_mask & 0xffff); + mm_flags |= avctx->dsp_mask & 0xffff; else mm_flags &= ~(avctx->dsp_mask & 0xffff); } @@ -2836,56 +3111,57 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #endif if (mm_flags & AV_CPU_FLAG_MMX) { - const int idct_algo= avctx->idct_algo; + const int idct_algo = avctx->idct_algo; if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { - if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ - c->idct_put= ff_simple_idct_put_mmx; - c->idct_add= ff_simple_idct_add_mmx; - c->idct = ff_simple_idct_mmx; - c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; + if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { + c->idct_put = ff_simple_idct_put_mmx; + c->idct_add = ff_simple_idct_add_mmx; + c->idct = ff_simple_idct_mmx; + c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; #if CONFIG_GPL - }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ - if(mm_flags & AV_CPU_FLAG_MMX2){ - c->idct_put= ff_libmpeg2mmx2_idct_put; - c->idct_add= ff_libmpeg2mmx2_idct_add; - c->idct = ff_mmxext_idct; - }else{ - c->idct_put= ff_libmpeg2mmx_idct_put; - c->idct_add= ff_libmpeg2mmx_idct_add; - c->idct = ff_mmx_idct; + } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) { + if (mm_flags & AV_CPU_FLAG_MMX2) { + c->idct_put = ff_libmpeg2mmx2_idct_put; + c->idct_add = ff_libmpeg2mmx2_idct_add; + c->idct = ff_mmxext_idct; + } else { + c->idct_put = ff_libmpeg2mmx_idct_put; + c->idct_add = ff_libmpeg2mmx_idct_add; + c->idct = ff_mmx_idct; } - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; #endif - }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) && - idct_algo==FF_IDCT_VP3 && HAVE_YASM){ - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->idct_put= ff_vp3_idct_put_sse2; - c->idct_add= ff_vp3_idct_add_sse2; - c->idct = ff_vp3_idct_sse2; - c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; - }else{ - c->idct_put= ff_vp3_idct_put_mmx; - c->idct_add= ff_vp3_idct_add_mmx; - c->idct = ff_vp3_idct_mmx; - c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; + } else 
if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || + CONFIG_VP6_DECODER) && + idct_algo == FF_IDCT_VP3 && HAVE_YASM) { + if (mm_flags & AV_CPU_FLAG_SSE2) { + c->idct_put = ff_vp3_idct_put_sse2; + c->idct_add = ff_vp3_idct_add_sse2; + c->idct = ff_vp3_idct_sse2; + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } else { + c->idct_put = ff_vp3_idct_put_mmx; + c->idct_add = ff_vp3_idct_add_mmx; + c->idct = ff_vp3_idct_mmx; + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; } - }else if(idct_algo==FF_IDCT_CAVS){ - c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; - }else if(idct_algo==FF_IDCT_XVIDMMX){ - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->idct_put= ff_idct_xvid_sse2_put; - c->idct_add= ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - c->idct_permutation_type= FF_SSE2_IDCT_PERM; - }else if(mm_flags & AV_CPU_FLAG_MMX2){ - c->idct_put= ff_idct_xvid_mmx2_put; - c->idct_add= ff_idct_xvid_mmx2_add; - c->idct = ff_idct_xvid_mmx2; - }else{ - c->idct_put= ff_idct_xvid_mmx_put; - c->idct_add= ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; + } else if (idct_algo == FF_IDCT_CAVS) { + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } else if (idct_algo == FF_IDCT_XVIDMMX) { + if (mm_flags & AV_CPU_FLAG_SSE2) { + c->idct_put = ff_idct_xvid_sse2_put; + c->idct_add = ff_idct_xvid_sse2_add; + c->idct = ff_idct_xvid_sse2; + c->idct_permutation_type = FF_SSE2_IDCT_PERM; + } else if (mm_flags & AV_CPU_FLAG_MMX2) { + c->idct_put = ff_idct_xvid_mmx2_put; + c->idct_add = ff_idct_xvid_mmx2_add; + c->idct = ff_idct_xvid_mmx2; + } else { + c->idct_put = ff_idct_xvid_mmx_put; + c->idct_add = ff_idct_xvid_mmx_add; + c->idct = ff_idct_xvid_mmx; } } } @@ -2896,13 +3172,13 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) if (mm_flags & AV_CPU_FLAG_MMX2) dsputil_init_mmx2(c, avctx, mm_flags); - if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) + if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) dsputil_init_3dnow(c, avctx, mm_flags); - if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) + if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) dsputil_init_3dnow2(c, avctx, mm_flags); - if (HAVE_SSE && (mm_flags & AV_CPU_FLAG_SSE)) + if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) dsputil_init_sse(c, avctx, mm_flags); if (mm_flags & AV_CPU_FLAG_SSE2) From 62ce9defb81d0b6bd179131d1502858c8778f411 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Fri, 16 Mar 2012 18:42:01 +0100 Subject: [PATCH 12/12] x86: dsputil: prettyprint gcc inline asm --- libavcodec/x86/dsputil_mmx.c | 2605 +++++++++++++++++----------------- 1 file changed, 1310 insertions(+), 1295 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index bb9ad7854e..040e37b38d 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -84,81 +84,81 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; -#define JUMPALIGN() __asm__ volatile (".p2align 3"::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) +#define JUMPALIGN() __asm__ volatile (".p2align 3"::) +#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ - "paddb %%" #regd ", %%" #regd " \n\t" ::) +#define MOVQ_BFE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%"#regd", %%"#regd" \n\t" \ + "paddb %%"#regd", %%"#regd" \n\t" ::) 
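Context for the PAVGB macros below: they compute per-byte averages of two MMX registers without widening to 16 bits, via the identities floor((a + b) / 2) = (a & b) + ((a ^ b) >> 1) and ceil((a + b) / 2) = (a | b) - ((a ^ b) >> 1). The ff_pb_FE constant masks each byte's low bit first, so the 64-bit psrlq cannot leak a bit from one byte into its neighbour. A minimal scalar C sketch of the same arithmetic (the avg_rnd/avg_no_rnd names are illustrative, not part of libavcodec):

    #include <stdint.h>
    #include <stdio.h>

    /* ceil((a + b) / 2), as in PAVGB_MMX:
     * por, pxor, pand ff_pb_FE, psrlq $1, psubb */
    static uint8_t avg_rnd(uint8_t a, uint8_t b)
    {
        return (a | b) - (((a ^ b) & 0xFE) >> 1);
    }

    /* floor((a + b) / 2), as in PAVGB_MMX_NO_RND:
     * pand, pxor, pand ff_pb_FE, psrlq $1, paddb */
    static uint8_t avg_no_rnd(uint8_t a, uint8_t b)
    {
        return (a & b) + (((a ^ b) & 0xFE) >> 1);
    }

    int main(void)
    {
        printf("%u %u\n", avg_rnd(3, 4), avg_no_rnd(3, 4)); /* prints "4 3" */
        return 0;
    }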
#ifndef PIC -#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) +#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) +#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) #else // for shared library it's better to use this way for accessing constants // pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "packuswb %%" #regd ", %%" #regd " \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "psllw $1, %%" #regd " \n\t"::) +#define MOVQ_BONE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%"#regd", %%"#regd" \n\t" \ + "psrlw $15, %%"#regd" \n\t" \ + "packuswb %%"#regd", %%"#regd" \n\t" ::) + +#define MOVQ_WTWO(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%"#regd", %%"#regd" \n\t" \ + "psrlw $15, %%"#regd" \n\t" \ + "psllw $1, %%"#regd" \n\t"::) #endif // using regr as temporary and for the output result // first argument is unmodifed and second is trashed // regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ - "movq " #rega ", " #regr " \n\t"\ - "pand " #regb ", " #regr " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pand " #regfe "," #regb " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "paddb " #regb ", " #regr " \n\t" - -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq " #rega ", " #regr " \n\t"\ - "por " #regb ", " #regr " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pand " #regfe "," #regb " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psubb " #regb ", " #regr " \n\t" +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ + "movq "#rega", "#regr" \n\t" \ + "pand "#regb", "#regr" \n\t" \ + "pxor "#rega", "#regb" \n\t" \ + "pand "#regfe", "#regb" \n\t" \ + "psrlq $1, "#regb" \n\t" \ + "paddb "#regb", "#regr" \n\t" + +#define PAVGB_MMX(rega, regb, regr, regfe) \ + "movq "#rega", "#regr" \n\t" \ + "por "#regb", "#regr" \n\t" \ + "pxor "#rega", "#regb" \n\t" \ + "pand "#regfe", "#regb" \n\t" \ + "psrlq $1, "#regb" \n\t" \ + "psubb "#regb", "#regr" \n\t" // mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq " #rega ", " #regr " \n\t"\ - "movq " #regc ", " #regp " \n\t"\ - "pand " #regb ", " #regr " \n\t"\ - "pand " #regd ", " #regp " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pxor " #regc ", " #regd " \n\t"\ - "pand %%mm6, " #regb " \n\t"\ - "pand %%mm6, " #regd " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psrlq $1, " #regd " \n\t"\ - "paddb " #regb ", " #regr " \n\t"\ - "paddb " #regd ", " #regp " \n\t" - -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq " #rega ", " #regr " \n\t"\ - "movq " #regc ", " #regp " \n\t"\ - "por " #regb ", " #regr " \n\t"\ - "por " #regd ", " #regp " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pxor " #regc ", " #regd " \n\t"\ - "pand %%mm6, " #regb " \n\t"\ - "pand %%mm6, " #regd " \n\t"\ - "psrlq $1, " #regd " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psubb " #regb ", " #regr " \n\t"\ - "psubb " #regd ", " #regp " \n\t" +#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ + "movq "#rega", "#regr" \n\t" \ + "movq "#regc", "#regp" \n\t" \ + "pand "#regb", "#regr" \n\t" \ + "pand "#regd", "#regp" \n\t" \ + "pxor "#rega", "#regb" \n\t" \ + "pxor "#regc", "#regd" \n\t" \ + 
"pand %%mm6, "#regb" \n\t" \ + "pand %%mm6, "#regd" \n\t" \ + "psrlq $1, "#regb" \n\t" \ + "psrlq $1, "#regd" \n\t" \ + "paddb "#regb", "#regr" \n\t" \ + "paddb "#regd", "#regp" \n\t" + +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ + "movq "#rega", "#regr" \n\t" \ + "movq "#regc", "#regp" \n\t" \ + "por "#regb", "#regr" \n\t" \ + "por "#regd", "#regp" \n\t" \ + "pxor "#rega", "#regb" \n\t" \ + "pxor "#regc", "#regd" \n\t" \ + "pand %%mm6, "#regb" \n\t" \ + "pand %%mm6, "#regd" \n\t" \ + "psrlq $1, "#regd" \n\t" \ + "psrlq $1, "#regb" \n\t" \ + "psubb "#regb", "#regr" \n\t" \ + "psubb "#regd", "#regp" \n\t" /***********************************/ /* MMX no rounding */ @@ -244,69 +244,70 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, p = block; pix = pixels; /* unrolled loop */ - __asm__ volatile( - "movq %3, %%mm0 \n\t" - "movq 8%3, %%mm1 \n\t" - "movq 16%3, %%mm2 \n\t" - "movq 24%3, %%mm3 \n\t" - "movq 32%3, %%mm4 \n\t" - "movq 40%3, %%mm5 \n\t" - "movq 48%3, %%mm6 \n\t" - "movq 56%3, %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) - :"memory"); + __asm__ volatile ( + "movq %3, %%mm0 \n\t" + "movq 8%3, %%mm1 \n\t" + "movq 16%3, %%mm2 \n\t" + "movq 24%3, %%mm3 \n\t" + "movq 32%3, %%mm4 \n\t" + "movq 40%3, %%mm5 \n\t" + "movq 48%3, %%mm6 \n\t" + "movq 56%3, %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), + "m"(*p) + : "memory"); pix += line_size * 4; p += 32; // if here would be an exact copy of the code above // compiler would generate some very strange code // thus using "r" - __asm__ volatile( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) - :"memory"); -} - -#define put_signed_pixels_clamped_mmx_half(off) \ - "movq "#off"(%2), %%mm1 \n\t"\ - "movq 16+"#off"(%2), %%mm2 \n\t"\ - "movq 32+"#off"(%2), %%mm3 \n\t"\ - "movq 48+"#off"(%2), %%mm4 \n\t"\ - "packsswb 8+"#off"(%2), %%mm1 \n\t"\ - "packsswb 24+"#off"(%2), %%mm2 \n\t"\ - "packsswb 40+"#off"(%2), %%mm3 \n\t"\ - "packsswb 56+"#off"(%2), %%mm4 \n\t"\ - "paddb %%mm0, %%mm1 \n\t"\ - "paddb %%mm0, %%mm2 \n\t"\ - "paddb %%mm0, %%mm3 \n\t"\ - "paddb %%mm0, %%mm4 \n\t"\ - "movq %%mm1, (%0) \n\t"\ - "movq %%mm2, (%0, %3) \n\t"\ - "movq %%mm3, (%0, %3, 2) \n\t"\ - "movq %%mm4, (%0, %1) \n\t" + __asm__ volatile ( + "movq (%3), %%mm0 \n\t" + "movq 8(%3), %%mm1 \n\t" + "movq 16(%3), %%mm2 \n\t" + "movq 24(%3), %%mm3 \n\t" + "movq 32(%3), %%mm4 \n\t" + "movq 40(%3), %%mm5 \n\t" + "movq 48(%3), %%mm6 \n\t" + "movq 56(%3), %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb 
%%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p) + : "memory"); +} + +#define put_signed_pixels_clamped_mmx_half(off) \ + "movq "#off"(%2), %%mm1 \n\t" \ + "movq 16 + "#off"(%2), %%mm2 \n\t" \ + "movq 32 + "#off"(%2), %%mm3 \n\t" \ + "movq 48 + "#off"(%2), %%mm4 \n\t" \ + "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ + "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ + "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ + "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ + "paddb %%mm0, %%mm1 \n\t" \ + "paddb %%mm0, %%mm2 \n\t" \ + "paddb %%mm0, %%mm3 \n\t" \ + "paddb %%mm0, %%mm4 \n\t" \ + "movq %%mm1, (%0) \n\t" \ + "movq %%mm2, (%0, %3) \n\t" \ + "movq %%mm3, (%0, %3, 2) \n\t" \ + "movq %%mm4, (%0, %1) \n\t" void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) @@ -315,14 +316,14 @@ void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, x86_reg line_skip3; __asm__ volatile ( - "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" - "lea (%3, %3, 2), %1 \n\t" - put_signed_pixels_clamped_mmx_half(0) - "lea (%0, %3, 4), %0 \n\t" - put_signed_pixels_clamped_mmx_half(64) - :"+&r" (pixels), "=&r" (line_skip3) - :"r" (block), "r"(line_skip) - :"memory"); + "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" + "lea (%3, %3, 2), %1 \n\t" + put_signed_pixels_clamped_mmx_half(0) + "lea (%0, %3, 4), %0 \n\t" + put_signed_pixels_clamped_mmx_half(64) + : "+&r"(pixels), "=&r"(line_skip3) + : "r"(block), "r"(line_skip) + : "memory"); } void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, @@ -338,30 +339,30 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, MOVQ_ZERO(mm7); i = 4; do { - __asm__ volatile( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm2, %1 \n\t" - :"+m"(*pix), "+m"(*(pix+line_size)) - :"r"(p) - :"memory"); + __asm__ volatile ( + "movq (%2), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "movq %0, %%mm4 \n\t" + "movq %1, %%mm6 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm4, %%mm0 \n\t" + "paddsw %%mm5, %%mm1 \n\t" + "movq %%mm6, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm6, %%mm2 \n\t" + "paddsw %%mm5, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %0 \n\t" + "movq %%mm2, %1 \n\t" + : "+m"(*pix), "+m"(*(pix + line_size)) + : "r"(p) + : "memory"); pix += line_size * 2; p += 16; } while (--i); @@ -370,175 +371,175 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%1, %3), %%mm1 \n\t" - "movd %%mm0, (%2) 
\n\t" - "movd %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%1, %3), %%mm1 \n\t" - "movd %%mm0, (%2) \n\t" - "movd %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" + __asm__ volatile ( + "lea (%3, %3), %%"REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movd (%1 ), %%mm0 \n\t" + "movd (%1, %3), %%mm1 \n\t" + "movd %%mm0, (%2) \n\t" + "movd %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movd (%1 ), %%mm0 \n\t" + "movd (%1, %3), %%mm1 \n\t" + "movd %%mm0, (%2) \n\t" + "movd %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r"(pixels), "+r"(block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" ); } static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" + __asm__ volatile ( + "lea (%3, %3), %%"REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1 ), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1 ), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r"(pixels), "+r"(block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" ); } static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" + __asm__ volatile ( + "lea (%3, %3), %%"REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1 ), %%mm0 \n\t" + "movq 8(%1 ), %%mm4 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm4, 8(%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1 ), %%mm0 \n\t" + "movq 8(%1 ), %%mm4 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm4, 8(%2) \n\t" + "movq %%mm1, (%2, 
%3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r"(pixels), "+r"(block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" ); } static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) { - __asm__ volatile( - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1,%3), %%xmm1 \n\t" - "movdqu (%1,%3,2), %%xmm2 \n\t" - "movdqu (%1,%4), %%xmm3 \n\t" - "lea (%1,%3,4), %1 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2,%3) \n\t" - "movdqa %%xmm2, (%2,%3,2) \n\t" - "movdqa %%xmm3, (%2,%4) \n\t" - "subl $4, %0 \n\t" - "lea (%2,%3,4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) - : "memory" + __asm__ volatile ( + "1: \n\t" + "movdqu (%1 ), %%xmm0 \n\t" + "movdqu (%1, %3 ), %%xmm1 \n\t" + "movdqu (%1, %3, 2), %%xmm2 \n\t" + "movdqu (%1, %4 ), %%xmm3 \n\t" + "lea (%1, %3, 4), %1 \n\t" + "movdqa %%xmm0, (%2) \n\t" + "movdqa %%xmm1, (%2, %3) \n\t" + "movdqa %%xmm2, (%2, %3, 2) \n\t" + "movdqa %%xmm3, (%2, %4) \n\t" + "subl $4, %0 \n\t" + "lea (%2, %3, 4), %2 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r"(pixels), "+r"(block) + : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size) + : "memory" ); } static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) { - __asm__ volatile( - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1,%3), %%xmm1 \n\t" - "movdqu (%1,%3,2), %%xmm2 \n\t" - "movdqu (%1,%4), %%xmm3 \n\t" - "lea (%1,%3,4), %1 \n\t" - "pavgb (%2), %%xmm0 \n\t" - "pavgb (%2,%3), %%xmm1 \n\t" - "pavgb (%2,%3,2), %%xmm2 \n\t" - "pavgb (%2,%4), %%xmm3 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2,%3) \n\t" - "movdqa %%xmm2, (%2,%3,2) \n\t" - "movdqa %%xmm3, (%2,%4) \n\t" - "subl $4, %0 \n\t" - "lea (%2,%3,4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) - : "memory" + __asm__ volatile ( + "1: \n\t" + "movdqu (%1 ), %%xmm0 \n\t" + "movdqu (%1, %3 ), %%xmm1 \n\t" + "movdqu (%1, %3, 2), %%xmm2 \n\t" + "movdqu (%1, %4 ), %%xmm3 \n\t" + "lea (%1, %3, 4), %1 \n\t" + "pavgb (%2 ), %%xmm0 \n\t" + "pavgb (%2, %3 ), %%xmm1 \n\t" + "pavgb (%2, %3, 2), %%xmm2 \n\t" + "pavgb (%2, %4), %%xmm3 \n\t" + "movdqa %%xmm0, (%2) \n\t" + "movdqa %%xmm1, (%2, %3) \n\t" + "movdqa %%xmm2, (%2, %3, 2) \n\t" + "movdqa %%xmm3, (%2, %4) \n\t" + "subl $4, %0 \n\t" + "lea (%2, %3, 4), %2 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r"(pixels), "+r"(block) + : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size) + : "memory" ); } -#define CLEAR_BLOCKS(name,n) \ -static void name(DCTELEM *blocks)\ -{\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "mov %1, %%"REG_a" \n\t"\ - "1: \n\t"\ - "movq %%mm7, (%0, %%"REG_a") \n\t"\ - "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ - "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ - "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ - "add $32, %%"REG_a" \n\t"\ - " js 1b \n\t"\ - : : "r" (((uint8_t *)blocks)+128*n),\ - "i" (-128*n)\ - : "%"REG_a\ - );\ +#define CLEAR_BLOCKS(name, n) \ +static void name(DCTELEM *blocks) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "mov %1, %%"REG_a" \n\t" \ + "1: \n\t" \ + "movq %%mm7, (%0, %%"REG_a") \n\t" \ + "movq %%mm7, 8(%0, %%"REG_a") \n\t" \ + "movq %%mm7, 16(%0, %%"REG_a") \n\t" \ + "movq %%mm7, 24(%0, %%"REG_a") \n\t" \ + "add $32, %%"REG_a" \n\t" \ + "js 1b \n\t" \ + :: "r"(((uint8_t *)blocks) + 128 * n), \ + "i"(-128 * n) \ + : "%"REG_a \ + ); 
\ } CLEAR_BLOCKS(clear_blocks_mmx, 6) CLEAR_BLOCKS(clear_block_mmx, 1) static void clear_block_sse(DCTELEM *block) { - __asm__ volatile( - "xorps %%xmm0, %%xmm0 \n" - "movaps %%xmm0, (%0) \n" - "movaps %%xmm0, 16(%0) \n" - "movaps %%xmm0, 32(%0) \n" - "movaps %%xmm0, 48(%0) \n" - "movaps %%xmm0, 64(%0) \n" - "movaps %%xmm0, 80(%0) \n" - "movaps %%xmm0, 96(%0) \n" - "movaps %%xmm0, 112(%0) \n" + __asm__ volatile ( + "xorps %%xmm0, %%xmm0 \n" + "movaps %%xmm0, (%0) \n" + "movaps %%xmm0, 16(%0) \n" + "movaps %%xmm0, 32(%0) \n" + "movaps %%xmm0, 48(%0) \n" + "movaps %%xmm0, 64(%0) \n" + "movaps %%xmm0, 80(%0) \n" + "movaps %%xmm0, 96(%0) \n" + "movaps %%xmm0, 112(%0) \n" :: "r"(block) : "memory" ); @@ -546,22 +547,22 @@ static void clear_block_sse(DCTELEM *block) static void clear_blocks_sse(DCTELEM *blocks) { - __asm__ volatile( - "xorps %%xmm0, %%xmm0 \n" - "mov %1, %%"REG_a" \n" - "1: \n" - "movaps %%xmm0, (%0, %%"REG_a") \n" - "movaps %%xmm0, 16(%0, %%"REG_a") \n" - "movaps %%xmm0, 32(%0, %%"REG_a") \n" - "movaps %%xmm0, 48(%0, %%"REG_a") \n" - "movaps %%xmm0, 64(%0, %%"REG_a") \n" - "movaps %%xmm0, 80(%0, %%"REG_a") \n" - "movaps %%xmm0, 96(%0, %%"REG_a") \n" - "movaps %%xmm0, 112(%0, %%"REG_a") \n" - "add $128, %%"REG_a" \n" - " js 1b \n" - : : "r" (((uint8_t *)blocks)+128*6), - "i" (-128*6) + __asm__ volatile ( + "xorps %%xmm0, %%xmm0 \n" + "mov %1, %%"REG_a" \n" + "1: \n" + "movaps %%xmm0, (%0, %%"REG_a") \n" + "movaps %%xmm0, 16(%0, %%"REG_a") \n" + "movaps %%xmm0, 32(%0, %%"REG_a") \n" + "movaps %%xmm0, 48(%0, %%"REG_a") \n" + "movaps %%xmm0, 64(%0, %%"REG_a") \n" + "movaps %%xmm0, 80(%0, %%"REG_a") \n" + "movaps %%xmm0, 96(%0, %%"REG_a") \n" + "movaps %%xmm0, 112(%0, %%"REG_a") \n" + "add $128, %%"REG_a" \n" + "js 1b \n" + :: "r"(((uint8_t *)blocks) + 128 * 6), + "i"(-128 * 6) : "%"REG_a ); } @@ -569,23 +570,23 @@ static void clear_blocks_sse(DCTELEM *blocks) static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) { x86_reg i = 0; - __asm__ volatile( - "jmp 2f \n\t" + __asm__ volatile ( + "jmp 2f \n\t" "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq (%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq (%2, %0), %%mm1 \n\t" + "paddb %%mm0, %%mm1 \n\t" + "movq %%mm1, (%2, %0) \n\t" + "movq 8(%1, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "paddb %%mm0, %%mm1 \n\t" + "movq %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" "2: \n\t" - "cmp %3, %0 \n\t" - " js 1b \n\t" - : "+r" (i) - : "r"(src), "r"(dst), "r"((x86_reg)w-15) + "cmp %3, %0 \n\t" + "js 1b \n\t" + : "+r"(i) + : "r"(src), "r"(dst), "r"((x86_reg)w - 15) ); for ( ; i < w; i++) dst[i + 0] += src[i + 0]; @@ -601,124 +602,123 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, int l = *left & 0xff; int tl = *left_top & 0xff; int t; - __asm__ volatile( - "mov %7, %3 \n" - "1: \n" - "movzbl (%3,%4), %2 \n" - "mov %2, %k3 \n" - "sub %b1, %b3 \n" - "add %b0, %b3 \n" - "mov %2, %1 \n" - "cmp %0, %2 \n" - "cmovg %0, %2 \n" - "cmovg %1, %0 \n" - "cmp %k3, %0 \n" - "cmovg %k3, %0 \n" - "mov %7, %3 \n" - "cmp %2, %0 \n" - "cmovl %2, %0 \n" - "add (%6,%4), %b0 \n" - "mov %b0, (%5,%4) \n" - "inc %4 \n" - "jl 1b \n" - :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) - :"r"(dst+w), "r"(diff+w), "rm"(top+w) + __asm__ volatile ( + "mov %7, %3 \n" + "1: \n" + "movzbl (%3, %4), %2 \n" + "mov %2, %k3 \n" + 
"sub %b1, %b3 \n" + "add %b0, %b3 \n" + "mov %2, %1 \n" + "cmp %0, %2 \n" + "cmovg %0, %2 \n" + "cmovg %1, %0 \n" + "cmp %k3, %0 \n" + "cmovg %k3, %0 \n" + "mov %7, %3 \n" + "cmp %2, %0 \n" + "cmovl %2, %0 \n" + "add (%6, %4), %b0 \n" + "mov %b0, (%5, %4) \n" + "inc %4 \n" + "jl 1b \n" + : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) + : "r"(dst + w), "r"(diff + w), "rm"(top + w) ); *left = l; *left_top = tl; } #endif -#define H263_LOOP_FILTER \ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm0 \n\t"\ - "movq %0, %%mm1 \n\t"\ - "movq %3, %%mm2 \n\t"\ - "movq %3, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm3, %%mm1 \n\t"\ - "movq %1, %%mm2 \n\t"\ - "movq %1, %%mm3 \n\t"\ - "movq %2, %%mm4 \n\t"\ - "movq %2, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "psubw %%mm2, %%mm4 \n\t"\ - "psubw %%mm3, %%mm5 \n\t"\ - "psllw $2, %%mm4 \n\t"\ - "psllw $2, %%mm5 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm1, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pcmpgtw %%mm4, %%mm6 \n\t"\ - "pcmpgtw %%mm5, %%mm7 \n\t"\ - "pxor %%mm6, %%mm4 \n\t"\ - "pxor %%mm7, %%mm5 \n\t"\ - "psubw %%mm6, %%mm4 \n\t"\ - "psubw %%mm7, %%mm5 \n\t"\ - "psrlw $3, %%mm4 \n\t"\ - "psrlw $3, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm4 \n\t"\ - "packsswb %%mm7, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd %4, %%mm2 \n\t"\ - "punpcklbw %%mm2, %%mm2 \n\t"\ - "punpcklbw %%mm2, %%mm2 \n\t"\ - "punpcklbw %%mm2, %%mm2 \n\t"\ - "psubusb %%mm4, %%mm2 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "psubusb %%mm4, %%mm3 \n\t"\ - "psubb %%mm3, %%mm2 \n\t"\ - "movq %1, %%mm3 \n\t"\ - "movq %2, %%mm4 \n\t"\ - "pxor %%mm6, %%mm3 \n\t"\ - "pxor %%mm6, %%mm4 \n\t"\ - "paddusb %%mm2, %%mm3 \n\t"\ - "psubusb %%mm2, %%mm4 \n\t"\ - "pxor %%mm6, %%mm3 \n\t"\ - "pxor %%mm6, %%mm4 \n\t"\ - "paddusb %%mm2, %%mm2 \n\t"\ - "packsswb %%mm1, %%mm0 \n\t"\ - "pcmpgtb %%mm0, %%mm7 \n\t"\ - "pxor %%mm7, %%mm0 \n\t"\ - "psubb %%mm7, %%mm0 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "psubusb %%mm2, %%mm0 \n\t"\ - "psubb %%mm0, %%mm1 \n\t"\ - "pand %5, %%mm1 \n\t"\ - "psrlw $2, %%mm1 \n\t"\ - "pxor %%mm7, %%mm1 \n\t"\ - "psubb %%mm7, %%mm1 \n\t"\ - "movq %0, %%mm5 \n\t"\ - "movq %3, %%mm6 \n\t"\ - "psubb %%mm1, %%mm5 \n\t"\ - "paddb %%mm1, %%mm6 \n\t" +#define H263_LOOP_FILTER \ + "pxor %%mm7, %%mm7 \n\t" \ + "movq %0, %%mm0 \n\t" \ + "movq %0, %%mm1 \n\t" \ + "movq %3, %%mm2 \n\t" \ + "movq %3, %%mm3 \n\t" \ + "punpcklbw %%mm7, %%mm0 \n\t" \ + "punpckhbw %%mm7, %%mm1 \n\t" \ + "punpcklbw %%mm7, %%mm2 \n\t" \ + "punpckhbw %%mm7, %%mm3 \n\t" \ + "psubw %%mm2, %%mm0 \n\t" \ + "psubw %%mm3, %%mm1 \n\t" \ + "movq %1, %%mm2 \n\t" \ + "movq %1, %%mm3 \n\t" \ + "movq %2, %%mm4 \n\t" \ + "movq %2, %%mm5 \n\t" \ + "punpcklbw %%mm7, %%mm2 \n\t" \ + "punpckhbw %%mm7, %%mm3 \n\t" \ + "punpcklbw %%mm7, %%mm4 \n\t" \ + "punpckhbw %%mm7, %%mm5 \n\t" \ + "psubw %%mm2, %%mm4 \n\t" \ + "psubw %%mm3, %%mm5 \n\t" \ + "psllw $2, %%mm4 \n\t" \ + "psllw $2, %%mm5 \n\t" \ + "paddw %%mm0, %%mm4 \n\t" \ + "paddw %%mm1, %%mm5 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + "pcmpgtw %%mm4, %%mm6 \n\t" \ + "pcmpgtw %%mm5, %%mm7 \n\t" \ + "pxor %%mm6, %%mm4 \n\t" \ + "pxor %%mm7, %%mm5 \n\t" \ + "psubw %%mm6, %%mm4 \n\t" \ + "psubw %%mm7, %%mm5 \n\t" \ + "psrlw $3, %%mm4 \n\t" \ + "psrlw $3, %%mm5 \n\t" \ + "packuswb %%mm5, %%mm4 \n\t" \ + "packsswb %%mm7, %%mm6 \n\t" \ + "pxor 
%%mm7, %%mm7 \n\t" \ + "movd %4, %%mm2 \n\t" \ + "punpcklbw %%mm2, %%mm2 \n\t" \ + "punpcklbw %%mm2, %%mm2 \n\t" \ + "punpcklbw %%mm2, %%mm2 \n\t" \ + "psubusb %%mm4, %%mm2 \n\t" \ + "movq %%mm2, %%mm3 \n\t" \ + "psubusb %%mm4, %%mm3 \n\t" \ + "psubb %%mm3, %%mm2 \n\t" \ + "movq %1, %%mm3 \n\t" \ + "movq %2, %%mm4 \n\t" \ + "pxor %%mm6, %%mm3 \n\t" \ + "pxor %%mm6, %%mm4 \n\t" \ + "paddusb %%mm2, %%mm3 \n\t" \ + "psubusb %%mm2, %%mm4 \n\t" \ + "pxor %%mm6, %%mm3 \n\t" \ + "pxor %%mm6, %%mm4 \n\t" \ + "paddusb %%mm2, %%mm2 \n\t" \ + "packsswb %%mm1, %%mm0 \n\t" \ + "pcmpgtb %%mm0, %%mm7 \n\t" \ + "pxor %%mm7, %%mm0 \n\t" \ + "psubb %%mm7, %%mm0 \n\t" \ + "movq %%mm0, %%mm1 \n\t" \ + "psubusb %%mm2, %%mm0 \n\t" \ + "psubb %%mm0, %%mm1 \n\t" \ + "pand %5, %%mm1 \n\t" \ + "psrlw $2, %%mm1 \n\t" \ + "pxor %%mm7, %%mm1 \n\t" \ + "psubb %%mm7, %%mm1 \n\t" \ + "movq %0, %%mm5 \n\t" \ + "movq %3, %%mm6 \n\t" \ + "psubb %%mm1, %%mm5 \n\t" \ + "paddb %%mm1, %%mm6 \n\t" static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale) { if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { const int strength = ff_h263_loop_filter_strength[qscale]; - __asm__ volatile( - - H263_LOOP_FILTER - - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %0 \n\t" - "movq %%mm6, %3 \n\t" - : "+m" (*(uint64_t*)(src - 2*stride)), - "+m" (*(uint64_t*)(src - 1*stride)), - "+m" (*(uint64_t*)(src + 0*stride)), - "+m" (*(uint64_t*)(src + 1*stride)) - : "g" (2*strength), "m"(ff_pb_FC) - ); + __asm__ volatile ( + H263_LOOP_FILTER + + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %0 \n\t" + "movq %%mm6, %3 \n\t" + : "+m"(*(uint64_t*)(src - 2 * stride)), + "+m"(*(uint64_t*)(src - 1 * stride)), + "+m"(*(uint64_t*)(src + 0 * stride)), + "+m"(*(uint64_t*)(src + 1 * stride)) + : "g"(2 * strength), "m"(ff_pb_FC) + ); } } @@ -733,46 +733,46 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale) transpose4x4(btemp, src, 8, stride); transpose4x4(btemp + 4, src + 4 * stride, 8, stride); - __asm__ volatile( - H263_LOOP_FILTER // 5 3 4 6 - - : "+m" (temp[0]), - "+m" (temp[1]), - "+m" (temp[2]), - "+m" (temp[3]) - : "g" (2*strength), "m"(ff_pb_FC) - ); + __asm__ volatile ( + H263_LOOP_FILTER // 5 3 4 6 + + : "+m"(temp[0]), + "+m"(temp[1]), + "+m"(temp[2]), + "+m"(temp[3]) + : "g"(2 * strength), "m"(ff_pb_FC) + ); - __asm__ volatile( - "movq %%mm5, %%mm1 \n\t" - "movq %%mm4, %%mm0 \n\t" - "punpcklbw %%mm3, %%mm5 \n\t" - "punpcklbw %%mm6, %%mm4 \n\t" - "punpckhbw %%mm3, %%mm1 \n\t" - "punpckhbw %%mm6, %%mm0 \n\t" - "movq %%mm5, %%mm3 \n\t" - "movq %%mm1, %%mm6 \n\t" - "punpcklwd %%mm4, %%mm5 \n\t" - "punpcklwd %%mm0, %%mm1 \n\t" - "punpckhwd %%mm4, %%mm3 \n\t" - "punpckhwd %%mm0, %%mm6 \n\t" - "movd %%mm5, (%0) \n\t" - "punpckhdq %%mm5, %%mm5 \n\t" - "movd %%mm5, (%0,%2) \n\t" - "movd %%mm3, (%0,%2,2) \n\t" - "punpckhdq %%mm3, %%mm3 \n\t" - "movd %%mm3, (%0,%3) \n\t" - "movd %%mm1, (%1) \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, (%1,%2) \n\t" - "movd %%mm6, (%1,%2,2) \n\t" - "punpckhdq %%mm6, %%mm6 \n\t" - "movd %%mm6, (%1,%3) \n\t" - :: "r" (src), - "r" (src + 4*stride), - "r" ((x86_reg) stride ), - "r" ((x86_reg)(3*stride)) - ); + __asm__ volatile ( + "movq %%mm5, %%mm1 \n\t" + "movq %%mm4, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm4 \n\t" + "punpckhbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm6, %%mm0 \n\t" + "movq %%mm5, %%mm3 \n\t" + "movq %%mm1, %%mm6 \n\t" + "punpcklwd %%mm4, %%mm5 \n\t" + "punpcklwd %%mm0, %%mm1 \n\t" + "punpckhwd %%mm4, %%mm3 \n\t" + 
"punpckhwd %%mm0, %%mm6 \n\t" + "movd %%mm5, (%0) \n\t" + "punpckhdq %%mm5, %%mm5 \n\t" + "movd %%mm5, (%0, %2) \n\t" + "movd %%mm3, (%0, %2, 2) \n\t" + "punpckhdq %%mm3, %%mm3 \n\t" + "movd %%mm3, (%0, %3) \n\t" + "movd %%mm1, (%1) \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, (%1, %2) \n\t" + "movd %%mm6, (%1, %2, 2) \n\t" + "punpckhdq %%mm6, %%mm6 \n\t" + "movd %%mm6, (%1, %3) \n\t" + :: "r"(src), + "r"(src + 4 * stride), + "r"((x86_reg)stride), + "r"((x86_reg)(3 * stride)) + ); } } @@ -788,411 +788,418 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, /* left and right */ ptr = buf; if (w == 8) { - __asm__ volatile( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) - ); + __asm__ volatile ( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "punpckldq %%mm0, %%mm0 \n\t" + "movq %%mm0, -8(%0) \n\t" + "movq -8(%0, %2), %%mm1 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm1, (%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) + ); } else { - __asm__ volatile( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq %%mm0, -16(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "movq %%mm1, 8(%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) - ); + __asm__ volatile ( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "punpckldq %%mm0, %%mm0 \n\t" + "movq %%mm0, -8(%0) \n\t" + "movq %%mm0, -16(%0) \n\t" + "movq -8(%0, %2), %%mm1 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm1, (%0, %2) \n\t" + "movq %%mm1, 8(%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) + ); } /* top and bottom (and hopefully also the corners) */ if (sides & EDGE_TOP) { for (i = 0; i < h; i += 4) { ptr = buf - (i + 1) * wrap - w; - __asm__ volatile( - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm0, (%0, %2) \n\t" - "movq %%mm0, (%0, %2, 2) \n\t" - "movq %%mm0, (%0, %3) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) - ); + __asm__ volatile ( + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm0, (%0, %2) \n\t" + "movq %%mm0, (%0, %2, 2) \n\t" + "movq %%mm0, (%0, %3) \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" + "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap), + "r"((x86_reg) 
-wrap * 3), "r"(ptr + width + 2 * w) + ); } } if (sides & EDGE_BOTTOM) { for (i = 0; i < h; i += 4) { ptr = last_line + (i + 1) * wrap - w; - __asm__ volatile( - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm0, (%0, %2) \n\t" - "movq %%mm0, (%0, %2, 2) \n\t" - "movq %%mm0, (%0, %3) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) - ); + __asm__ volatile ( + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm0, (%0, %2) \n\t" + "movq %%mm0, (%0, %2, 2) \n\t" + "movq %%mm0, (%0, %3) \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" + "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)last_line - (x86_reg)ptr - w), + "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3), + "r"(ptr + width + 2 * w) + ); } } } -#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ - "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ - "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ - "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ - "movq "#in7", " #m3 " \n\t" /* d */\ - "movq "#in0", %%mm5 \n\t" /* D */\ - "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ - "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ - "movq "#in1", %%mm5 \n\t" /* C */\ - "movq "#in2", %%mm6 \n\t" /* B */\ - "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ - "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ - "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ - "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ - "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ - "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ - "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ - "psraw $5, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm5 \n\t"\ - OP(%%mm5, out, %%mm7, d) - -#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \ -static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ - uint64_t temp; \ - \ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ - "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ - "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ - "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ - "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ - "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ - "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ - "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ - "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ - "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ - "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ - "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ - "paddw %%mm3, %%mm5 \n\t" /* b */\ - "paddw %%mm2, %%mm6 \n\t" /* c */\ - "paddw %%mm5, %%mm5 \n\t" /* 2b */\ - "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ - "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ - "paddw %%mm4, %%mm0 \n\t" /* a */\ - "paddw %%mm1, %%mm5 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ - "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ - "paddw %6, %%mm6 \n\t"\ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm0 \n\t"\ - "movq %%mm0, %5 \n\t"\ - /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ - \ - "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ - "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ - "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ - "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ - "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ 
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ - "paddw %%mm0, %%mm2 \n\t" /* b */\ - "paddw %%mm5, %%mm3 \n\t" /* c */\ - "paddw %%mm2, %%mm2 \n\t" /* 2b */\ - "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ - "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ - "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ - "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ - "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ - "paddw %%mm2, %%mm1 \n\t" /* a */\ - "paddw %%mm6, %%mm4 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ - "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ - "paddw %6, %%mm1 \n\t"\ - "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ - "psraw $5, %%mm3 \n\t"\ - "movq %5, %%mm1 \n\t"\ - "packuswb %%mm3, %%mm1 \n\t"\ - OP_MMX2(%%mm1, (%1),%%mm4, q)\ - /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ - \ - "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ - "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ - "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ - "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ - "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ - "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ - "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ - "paddw %%mm1, %%mm5 \n\t" /* b */\ - "paddw %%mm4, %%mm0 \n\t" /* c */\ - "paddw %%mm5, %%mm5 \n\t" /* 2b */\ - "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ - "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ - "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ - "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ - "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ - "paddw %%mm3, %%mm2 \n\t" /* d */\ - "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ - "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ - "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ - "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ - "paddw %%mm2, %%mm6 \n\t" /* a */\ - "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ - "paddw %6, %%mm0 \n\t"\ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm0 \n\t"\ - /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ - \ - "paddw %%mm5, %%mm3 \n\t" /* a */\ - "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ - "paddw %%mm4, %%mm6 \n\t" /* b */\ - "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ - "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ - "paddw %%mm1, %%mm4 \n\t" /* c */\ - "paddw %%mm2, %%mm5 \n\t" /* d */\ - "paddw %%mm6, %%mm6 \n\t" /* 2b */\ - "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ - "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ - "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ - "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ - "paddw %6, %%mm4 \n\t"\ - "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm4 \n\t"\ - "packuswb %%mm4, %%mm0 \n\t"\ - OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ - \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+D"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ - : "memory"\ - );\ -} \ - \ -static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ - int i; \ - int16_t temp[16]; \ - /* quick HACK, XXX FIXME MUST be optimized */ \ - for (i = 0; i < h; i++) { \ - temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \ - (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \ - temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \ - (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \ - temp[ 2] = (src[ 2] + src[ 3]) * 20 
- (src[ 1] + src[ 4]) * 6 + \ - (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \ - temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \ - (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \ - temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \ - (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \ - temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \ - (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \ - temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \ - (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \ - temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \ - (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \ - temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \ - (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \ - temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \ - (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \ - temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \ - (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \ - temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \ - (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \ - temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \ - (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \ - temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \ - (src[11] + src[16]) * 3 - (src[10] + src[16]); \ - temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \ - (src[12] + src[16]) * 3 - (src[11] + src[15]); \ - temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \ - (src[13] + src[15]) * 3 - (src[12] + src[14]); \ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "paddw %2, %%mm0 \n\t"\ - "paddw %2, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP_3DNOW(%%mm0, (%1), %%mm1, q)\ - "movq 16(%0), %%mm0 \n\t"\ - "movq 24(%0), %%mm1 \n\t"\ - "paddw %2, %%mm0 \n\t"\ - "paddw %2, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ - :: "r"(temp), "r"(dst), "m"(ROUNDER)\ - : "memory"\ - );\ - dst += dstStride; \ - src += srcStride; \ - } \ -} \ - \ -static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ - "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ - "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ - "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ - "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ - "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ - "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ - "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ - "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ - "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ - "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ - "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ - "paddw %%mm3, %%mm5 \n\t" /* b */\ - "paddw %%mm2, %%mm6 \n\t" /* c */\ - "paddw %%mm5, %%mm5 \n\t" /* 2b */\ - "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ - "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ - "paddw %%mm4, %%mm0 \n\t" /* a */\ - "paddw %%mm1, %%mm5 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ - "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ - "paddw %5, 
%%mm6 \n\t"\ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm0 \n\t"\ - /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ - \ - "movd 5(%0), %%mm5 \n\t" /* FGHI */\ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ - "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ - "paddw %%mm5, %%mm1 \n\t" /* a */\ - "paddw %%mm6, %%mm2 \n\t" /* b */\ - "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ - "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ - "paddw %%mm6, %%mm3 \n\t" /* c */\ - "paddw %%mm5, %%mm4 \n\t" /* d */\ - "paddw %%mm2, %%mm2 \n\t" /* 2b */\ - "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ - "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ - "paddw %5, %%mm1 \n\t"\ - "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP_MMX2(%%mm0, (%1), %%mm4, q)\ - \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(h)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ - : "memory"\ - );\ -} \ - \ -static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ - int i; \ - int16_t temp[8]; \ - /* quick HACK, XXX FIXME MUST be optimized */ \ - for (i = 0; i < h; i++) { \ - temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + \ - (src[1] + src[3]) * 3 - (src[2] + src[4]); \ - temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \ - (src[0] + src[4]) * 3 - (src[1] + src[5]); \ - temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \ - (src[0] + src[5]) * 3 - (src[0] + src[6]); \ - temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \ - (src[1] + src[6]) * 3 - (src[0] + src[7]); \ - temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \ - (src[2] + src[7]) * 3 - (src[1] + src[8]); \ - temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + \ - (src[3] + src[8]) * 3 - (src[2] + src[8]); \ - temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \ - (src[4] + src[8]) * 3 - (src[3] + src[7]); \ - temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \ - (src[5] + src[7]) * 3 - (src[4] + src[6]); \ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "paddw %2, %%mm0 \n\t"\ - "paddw %2, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP_3DNOW(%%mm0, (%1), %%mm1, q)\ - :: "r"(temp), "r"(dst), "m"(ROUNDER)\ - :"memory"\ - );\ - dst += dstStride; \ - src += srcStride; \ - } \ +#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \ + in0, in1, in2, in7, out, OP) \ + "paddw "#m4", "#m3" \n\t" /* x1 */ \ + "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \ + "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \ + "movq "#in7", "#m3" \n\t" /* d */ \ + "movq "#in0", %%mm5 \n\t" /* D */ \ + "paddw "#m3", %%mm5 \n\t" /* x4 */ \ + "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \ + "movq "#in1", %%mm5 \n\t" /* C */ \ + "movq "#in2", %%mm6 \n\t" /* B */ \ + "paddw "#m6", %%mm5 \n\t" /* x3 */ \ + "paddw "#m5", %%mm6 \n\t" /* x2 */ \ + "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \ + "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \ + "paddw "#rnd", %%mm4 \n\t" /* x2 */ \ + "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \ + "psraw $5, %%mm5 \n\t" \ + "packuswb %%mm5, %%mm5 \n\t" \ + OP(%%mm5, out, %%mm7, d) + 
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \ +static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \ + uint8_t *src, \ + int dstStride, \ + int srcStride, \ + int h) \ +{ \ + uint64_t temp; \ + \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "1: \n\t" \ + "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \ + "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \ + "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \ + "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \ + "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \ + "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \ + "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \ + "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \ + "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \ + "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \ + "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \ + "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \ + "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \ + "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \ + "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \ + "paddw %%mm3, %%mm5 \n\t" /* b */ \ + "paddw %%mm2, %%mm6 \n\t" /* c */ \ + "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ + "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \ + "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \ + "paddw %%mm4, %%mm0 \n\t" /* a */ \ + "paddw %%mm1, %%mm5 \n\t" /* d */ \ + "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \ + "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \ + "paddw %6, %%mm6 \n\t" \ + "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ + "psraw $5, %%mm0 \n\t" \ + "movq %%mm0, %5 \n\t" \ + /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \ + \ + "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \ + "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \ + "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \ + "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \ + "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \ + "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \ + "paddw %%mm0, %%mm2 \n\t" /* b */ \ + "paddw %%mm5, %%mm3 \n\t" /* c */ \ + "paddw %%mm2, %%mm2 \n\t" /* 2b */ \ + "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \ + "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \ + "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \ + "paddw %%mm2, %%mm1 \n\t" /* a */ \ + "paddw %%mm6, %%mm4 \n\t" /* d */ \ + "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \ + "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \ + "paddw %6, %%mm1 \n\t" \ + "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \ + "psraw $5, %%mm3 \n\t" \ + "movq %5, %%mm1 \n\t" \ + "packuswb %%mm3, %%mm1 \n\t" \ + OP_MMX2(%%mm1, (%1), %%mm4, q) \ + /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \ + \ + "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \ + "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \ + "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \ + "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \ + "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \ + "paddw %%mm1, %%mm5 \n\t" /* b */ \ + "paddw %%mm4, %%mm0 \n\t" /* c */ \ + "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ + "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \ + "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \ + "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \ + "paddw %%mm3, %%mm2 \n\t" /* d */ \ + "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \ + 
"movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \ + "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \ + "paddw %%mm2, %%mm6 \n\t" /* a */ \ + "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \ + "paddw %6, %%mm0 \n\t" \ + "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ + "psraw $5, %%mm0 \n\t" \ + /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \ + /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \ + \ + "paddw %%mm5, %%mm3 \n\t" /* a */ \ + "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \ + "paddw %%mm4, %%mm6 \n\t" /* b */ \ + "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \ + "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \ + "paddw %%mm1, %%mm4 \n\t" /* c */ \ + "paddw %%mm2, %%mm5 \n\t" /* d */ \ + "paddw %%mm6, %%mm6 \n\t" /* 2b */ \ + "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \ + "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \ + "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \ + "paddw %6, %%mm4 \n\t" \ + "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \ + "psraw $5, %%mm4 \n\t" \ + "packuswb %%mm4, %%mm0 \n\t" \ + OP_MMX2(%%mm0, 8(%1), %%mm4, q) \ + \ + "add %3, %0 \n\t" \ + "add %4, %1 \n\t" \ + "decl %2 \n\t" \ + "jnz 1b \n\t" \ + : "+a"(src), "+c"(dst), "+D"(h) \ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \ + /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \ + : "memory" \ + ); \ +} \ + \ +static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \ + uint8_t *src, \ + int dstStride, \ + int srcStride, \ + int h) \ +{ \ + int i; \ + int16_t temp[16]; \ + /* quick HACK, XXX FIXME MUST be optimized */ \ + for (i = 0; i < h; i++) { \ + temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \ + (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \ + temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \ + (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \ + temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \ + (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \ + temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \ + (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \ + temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \ + (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \ + temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \ + (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \ + temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \ + (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \ + temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \ + (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \ + temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \ + (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \ + temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \ + (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \ + temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \ + (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \ + temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \ + (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \ + temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \ + (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \ + temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \ + (src[11] + src[16]) * 3 - (src[10] + src[16]); \ + temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \ + (src[12] + src[16]) * 3 - (src[11] + src[15]); \ + temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \ + (src[13] + 
src[15]) * 3 - (src[12] + src[14]); \ + __asm__ volatile ( \ + "movq (%0), %%mm0 \n\t" \ + "movq 8(%0), %%mm1 \n\t" \ + "paddw %2, %%mm0 \n\t" \ + "paddw %2, %%mm1 \n\t" \ + "psraw $5, %%mm0 \n\t" \ + "psraw $5, %%mm1 \n\t" \ + "packuswb %%mm1, %%mm0 \n\t" \ + OP_3DNOW(%%mm0, (%1), %%mm1, q) \ + "movq 16(%0), %%mm0 \n\t" \ + "movq 24(%0), %%mm1 \n\t" \ + "paddw %2, %%mm0 \n\t" \ + "paddw %2, %%mm1 \n\t" \ + "psraw $5, %%mm0 \n\t" \ + "psraw $5, %%mm1 \n\t" \ + "packuswb %%mm1, %%mm0 \n\t" \ + OP_3DNOW(%%mm0, 8(%1), %%mm1, q) \ + :: "r"(temp), "r"(dst), "m"(ROUNDER) \ + : "memory" \ + ); \ + dst += dstStride; \ + src += srcStride; \ + } \ +} \ + \ +static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \ + uint8_t *src, \ + int dstStride, \ + int srcStride, \ + int h) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "1: \n\t" \ + "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \ + "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \ + "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \ + "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \ + "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \ + "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \ + "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \ + "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \ + "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \ + "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \ + "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \ + "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \ + "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \ + "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \ + "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \ + "paddw %%mm3, %%mm5 \n\t" /* b */ \ + "paddw %%mm2, %%mm6 \n\t" /* c */ \ + "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ + "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \ + "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \ + "paddw %%mm4, %%mm0 \n\t" /* a */ \ + "paddw %%mm1, %%mm5 \n\t" /* d */ \ + "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \ + "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \ + "paddw %5, %%mm6 \n\t" \ + "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ + "psraw $5, %%mm0 \n\t" \ + /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \ + \ + "movd 5(%0), %%mm5 \n\t" /* FGHI */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \ + "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \ + "paddw %%mm5, %%mm1 \n\t" /* a */ \ + "paddw %%mm6, %%mm2 \n\t" /* b */ \ + "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \ + "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \ + "paddw %%mm6, %%mm3 \n\t" /* c */ \ + "paddw %%mm5, %%mm4 \n\t" /* d */ \ + "paddw %%mm2, %%mm2 \n\t" /* 2b */ \ + "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \ + "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \ + "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \ + "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \ + "paddw %5, %%mm1 \n\t" \ + "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \ + "psraw $5, %%mm3 \n\t" \ + "packuswb %%mm3, %%mm0 \n\t" \ + OP_MMX2(%%mm0, (%1), %%mm4, q) \ + \ + "add %3, %0 \n\t" \ + "add %4, %1 \n\t" \ + "decl %2 \n\t" \ + "jnz 1b \n\t" \ + : "+a"(src), "+c"(dst), "+d"(h) \ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \ + /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \ + : "memory" \ + ); \ +} \ + \ +static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, \ + uint8_t *src, \ + int dstStride, \ + int srcStride, \ + int h) \ +{ \ + int i; \ + int16_t temp[8]; \ + /* quick HACK, XXX FIXME MUST be optimized */ \ + for (i = 0; i < h; i++) { \ + temp[0] = (src[0] + src[1]) * 20 - (src[0] + 
src[2]) * 6 + \ + (src[1] + src[3]) * 3 - (src[2] + src[4]); \ + temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \ + (src[0] + src[4]) * 3 - (src[1] + src[5]); \ + temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \ + (src[0] + src[5]) * 3 - (src[0] + src[6]); \ + temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \ + (src[1] + src[6]) * 3 - (src[0] + src[7]); \ + temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \ + (src[2] + src[7]) * 3 - (src[1] + src[8]); \ + temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + \ + (src[3] + src[8]) * 3 - (src[2] + src[8]); \ + temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \ + (src[4] + src[8]) * 3 - (src[3] + src[7]); \ + temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \ + (src[5] + src[7]) * 3 - (src[4] + src[6]); \ + __asm__ volatile ( \ + "movq (%0), %%mm0 \n\t" \ + "movq 8(%0), %%mm1 \n\t" \ + "paddw %2, %%mm0 \n\t" \ + "paddw %2, %%mm1 \n\t" \ + "psraw $5, %%mm0 \n\t" \ + "psraw $5, %%mm1 \n\t" \ + "packuswb %%mm1, %%mm0 \n\t" \ + OP_3DNOW(%%mm0, (%1), %%mm1, q) \ + :: "r"(temp), "r"(dst), "m"(ROUNDER) \ + : "memory" \ + ); \ + dst += dstStride; \ + src += srcStride; \ + } \ } #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \ @@ -1205,77 +1212,79 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \ uint64_t *temp_ptr = temp; \ int count = 17; \ \ - /*FIXME unroll */\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq (%0), %%mm1 \n\t"\ - "movq 8(%0), %%mm2 \n\t"\ - "movq 8(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "movq %%mm0, (%1) \n\t"\ - "movq %%mm1, 17*8(%1) \n\t"\ - "movq %%mm2, 2*17*8(%1) \n\t"\ - "movq %%mm3, 3*17*8(%1) \n\t"\ - "add $8, %1 \n\t"\ - "add %3, %0 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+r" (src), "+r" (temp_ptr), "+r"(count)\ - : "r" ((x86_reg)srcStride)\ - : "memory"\ - );\ + /* FIXME unroll */ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "1: \n\t" \ + "movq (%0), %%mm0 \n\t" \ + "movq (%0), %%mm1 \n\t" \ + "movq 8(%0), %%mm2 \n\t" \ + "movq 8(%0), %%mm3 \n\t" \ + "punpcklbw %%mm7, %%mm0 \n\t" \ + "punpckhbw %%mm7, %%mm1 \n\t" \ + "punpcklbw %%mm7, %%mm2 \n\t" \ + "punpckhbw %%mm7, %%mm3 \n\t" \ + "movq %%mm0, (%1) \n\t" \ + "movq %%mm1, 17 * 8(%1) \n\t" \ + "movq %%mm2, 2 * 17 * 8(%1) \n\t" \ + "movq %%mm3, 3 * 17 * 8(%1) \n\t" \ + "add $8, %1 \n\t" \ + "add %3, %0 \n\t" \ + "decl %2 \n\t" \ + "jnz 1b \n\t" \ + : "+r"(src), "+r"(temp_ptr), "+r"(count) \ + : "r"((x86_reg)srcStride) \ + : "memory" \ + ); \ \ temp_ptr = temp; \ count = 4; \ \ -/*FIXME reorder for speed */\ - __asm__ volatile(\ - /*"pxor %%mm7, %%mm7 \n\t"*/\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "movq 16(%0), %%mm2 \n\t"\ - "movq 24(%0), %%mm3 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ - "add %4, %1 
\n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ - \ - "add $136, %0 \n\t"\ - "add %6, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ - : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ - :"memory"\ - );\ + /* FIXME reorder for speed */ \ + __asm__ volatile ( \ + /* "pxor %%mm7, %%mm7 \n\t" */ \ + "1: \n\t" \ + "movq (%0), %%mm0 \n\t" \ + "movq 8(%0), %%mm1 \n\t" \ + "movq 16(%0), %%mm2 \n\t" \ + "movq 24(%0), %%mm3 \n\t" \ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \ + \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \ + \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \ + \ + "add $136, %0 \n\t" \ + "add %6, %1 \n\t" \ + "decl %2 \n\t" \ + "jnz 1b \n\t" \ + \ + : "+r"(temp_ptr), "+r"(dst), "+g"(count) \ + : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \ + /* "m"(ff_pw_20), "m"(ff_pw_3), */ 
"m"(ROUNDER), \ + "g"(4 - 14 * (x86_reg)dstStride) \ + : "memory" \ + ); \ } \ \ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \ @@ -1287,59 +1296,61 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \ uint64_t *temp_ptr = temp; \ int count = 9; \ \ - /*FIXME unroll */\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq (%0), %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "movq %%mm0, (%1) \n\t"\ - "movq %%mm1, 9*8(%1) \n\t"\ - "add $8, %1 \n\t"\ - "add %3, %0 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+r" (src), "+r" (temp_ptr), "+r"(count)\ - : "r" ((x86_reg)srcStride)\ - : "memory"\ - );\ + /* FIXME unroll */ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "1: \n\t" \ + "movq (%0), %%mm0 \n\t" \ + "movq (%0), %%mm1 \n\t" \ + "punpcklbw %%mm7, %%mm0 \n\t" \ + "punpckhbw %%mm7, %%mm1 \n\t" \ + "movq %%mm0, (%1) \n\t" \ + "movq %%mm1, 9*8(%1) \n\t" \ + "add $8, %1 \n\t" \ + "add %3, %0 \n\t" \ + "decl %2 \n\t" \ + "jnz 1b \n\t" \ + : "+r"(src), "+r"(temp_ptr), "+r"(count) \ + : "r"((x86_reg)srcStride) \ + : "memory" \ + ); \ \ temp_ptr = temp; \ count = 2; \ \ -/*FIXME reorder for speed */\ - __asm__ volatile(\ - /*"pxor %%mm7, %%mm7 \n\t"*/\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "movq 16(%0), %%mm2 \n\t"\ - "movq 24(%0), %%mm3 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ - \ - "add $72, %0 \n\t"\ - "add %6, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ - : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ - : "memory"\ - );\ + /* FIXME reorder for speed */ \ + __asm__ volatile ( \ + /* "pxor %%mm7, %%mm7 \n\t" */ \ + "1: \n\t" \ + "movq (%0), %%mm0 \n\t" \ + "movq 8(%0), %%mm1 \n\t" \ + "movq 16(%0), %%mm2 \n\t" \ + "movq 24(%0), %%mm3 \n\t" \ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \ + \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \ + \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, 
%5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \ + \ + "add $72, %0 \n\t" \ + "add %6, %1 \n\t" \ + "decl %2 \n\t" \ + "jnz 1b \n\t" \ + \ + : "+r"(temp_ptr), "+r"(dst), "+g"(count) \ + : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \ + /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \ + "g"(4 - 6 * (x86_reg)dstStride) \ + : "memory" \ + ); \ } \ \ static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ @@ -1696,25 +1707,28 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ } -#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) -QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) -QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) -QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) -QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) -QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) -QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) -QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) -QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) +#define PUT_OP(a, b, temp, size) \ + "mov"#size" "#a", "#b" \n\t" + +#define AVG_3DNOW_OP(a, b, temp, size) \ + "mov"#size" "#b", "#temp" \n\t" \ + "pavgusb "#temp", "#a" \n\t" \ + "mov"#size" "#a", "#b" \n\t" + +#define AVG_MMX2_OP(a, b, temp, size) \ + "mov"#size" "#b", "#temp" \n\t" \ + "pavgb "#temp", "#a" \n\t" \ + "mov"#size" "#a", "#b" \n\t" + +QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP) +QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP) +QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) +QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow) +QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow) +QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) +QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2) +QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2) +QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) /***********************************/ /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */ @@ -1881,7 +1895,7 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src, (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) // uses more than 16 bits of subpel mv (only at huge resolution) || (dxx | dxy | dyx | dyy) & 15) { - //FIXME could still use mmx for some of the rows + // FIXME could still use mmx for some of the rows ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); return; @@ -1894,11 +1908,11 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src, src = edge_buf; } - __asm__ volatile( - "movd %0, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" + __asm__ volatile ( + "movd %0, %%mm6 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" :: "r"(1<<shift) ); [...] - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 - "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 - "pslld $31, %%mm2 \n\t" // keep only the sign bit - "pxor %%mm2, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm1, %%mm3 \n\t" - "pandn %%mm1, %%mm4 \n\t" - "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) - "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) - "movq %%mm3, %1 \n\t" - "movq %%mm0, %0 \n\t" - :"+m"(mag[i]), "+m"(ang[i]) - ::"memory" + __asm__ volatile ( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 + "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 + "pslld $31, %%mm2 \n\t"
// keep only the sign bit + "pxor %%mm2, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm1, %%mm3 \n\t" + "pandn %%mm1, %%mm4 \n\t" + "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m))) + "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m))) + "movq %%mm3, %1 \n\t" + "movq %%mm0, %0 \n\t" + : "+m"(mag[i]), "+m"(ang[i]) + :: "memory" ); } - __asm__ volatile("femms"); + __asm__ volatile ("femms"); } static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) { int i; - __asm__ volatile( - "movaps %0, %%xmm5 \n\t" - ::"m"(ff_pdw_80000000[0]) + __asm__ volatile ( + "movaps %0, %%xmm5 \n\t" + :: "m"(ff_pdw_80000000[0]) ); for (i = 0; i < blocksize; i += 4) { - __asm__ volatile( - "movaps %0, %%xmm0 \n\t" - "movaps %1, %%xmm1 \n\t" + __asm__ volatile ( + "movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" "xorps %%xmm2, %%xmm2 \n\t" "xorps %%xmm3, %%xmm3 \n\t" "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 @@ -2199,12 +2213,12 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) "movaps %%xmm3, %%xmm4 \n\t" "andps %%xmm1, %%xmm3 \n\t" "andnps %%xmm1, %%xmm4 \n\t" - "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) - "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) + "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m))) + "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m))) "movaps %%xmm3, %1 \n\t" "movaps %%xmm0, %0 \n\t" - :"+m"(mag[i]), "+m"(ang[i]) - ::"memory" + : "+m"(mag[i]), "+m"(ang[i]) + :: "memory" ); } } @@ -2212,67 +2226,68 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) #define IF1(x) x #define IF0(x) -#define MIX5(mono,stereo)\ - __asm__ volatile(\ - "movss 0(%2), %%xmm5 \n"\ - "movss 8(%2), %%xmm6 \n"\ - "movss 24(%2), %%xmm7 \n"\ - "shufps $0, %%xmm5, %%xmm5 \n"\ - "shufps $0, %%xmm6, %%xmm6 \n"\ - "shufps $0, %%xmm7, %%xmm7 \n"\ - "1: \n"\ - "movaps (%0,%1), %%xmm0 \n"\ - "movaps 0x400(%0,%1), %%xmm1 \n"\ - "movaps 0x800(%0,%1), %%xmm2 \n"\ - "movaps 0xc00(%0,%1), %%xmm3 \n"\ - "movaps 0x1000(%0,%1), %%xmm4 \n"\ - "mulps %%xmm5, %%xmm0 \n"\ - "mulps %%xmm6, %%xmm1 \n"\ - "mulps %%xmm5, %%xmm2 \n"\ - "mulps %%xmm7, %%xmm3 \n"\ - "mulps %%xmm7, %%xmm4 \n"\ - stereo("addps %%xmm1, %%xmm0 \n")\ - "addps %%xmm1, %%xmm2 \n"\ - "addps %%xmm3, %%xmm0 \n"\ - "addps %%xmm4, %%xmm2 \n"\ - mono("addps %%xmm2, %%xmm0 \n")\ - "movaps %%xmm0, (%0,%1) \n"\ - stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ - "add $16, %0 \n"\ - "jl 1b \n"\ - :"+&r"(i)\ - :"r"(samples[0]+len), "r"(matrix)\ - :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ +#define MIX5(mono, stereo) \ + __asm__ volatile ( \ + "movss 0(%2), %%xmm5 \n" \ + "movss 8(%2), %%xmm6 \n" \ + "movss 24(%2), %%xmm7 \n" \ + "shufps $0, %%xmm5, %%xmm5 \n" \ + "shufps $0, %%xmm6, %%xmm6 \n" \ + "shufps $0, %%xmm7, %%xmm7 \n" \ + "1: \n" \ + "movaps (%0, %1), %%xmm0 \n" \ + "movaps 0x400(%0, %1), %%xmm1 \n" \ + "movaps 0x800(%0, %1), %%xmm2 \n" \ + "movaps 0xc00(%0, %1), %%xmm3 \n" \ + "movaps 0x1000(%0, %1), %%xmm4 \n" \ + "mulps %%xmm5, %%xmm0 \n" \ + "mulps %%xmm6, %%xmm1 \n" \ + "mulps %%xmm5, %%xmm2 \n" \ + "mulps %%xmm7, %%xmm3 \n" \ + "mulps %%xmm7, %%xmm4 \n" \ + stereo("addps %%xmm1, %%xmm0 \n") \ + "addps %%xmm1, %%xmm2 \n" \ + "addps %%xmm3, %%xmm0 \n" \ + "addps %%xmm4, %%xmm2 \n" \ + mono("addps %%xmm2, %%xmm0 \n") \ + "movaps %%xmm0, (%0, %1) \n" \ + stereo("movaps %%xmm2, 0x400(%0, %1) \n") \ + "add $16, %0 \n" \ + "jl 1b \n" \ + : 
"+&r"(i) \ + : "r"(samples[0] + len), "r"(matrix) \ + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ + "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \ + "memory" \ ); -#define MIX_MISC(stereo)\ - __asm__ volatile(\ - "1: \n"\ - "movaps (%3,%0), %%xmm0 \n"\ - stereo("movaps %%xmm0, %%xmm1 \n")\ - "mulps %%xmm4, %%xmm0 \n"\ - stereo("mulps %%xmm5, %%xmm1 \n")\ - "lea 1024(%3,%0), %1 \n"\ - "mov %5, %2 \n"\ - "2: \n"\ - "movaps (%1), %%xmm2 \n"\ - stereo("movaps %%xmm2, %%xmm3 \n")\ - "mulps (%4,%2), %%xmm2 \n"\ - stereo("mulps 16(%4,%2), %%xmm3 \n")\ - "addps %%xmm2, %%xmm0 \n"\ - stereo("addps %%xmm3, %%xmm1 \n")\ - "add $1024, %1 \n"\ - "add $32, %2 \n"\ - "jl 2b \n"\ - "movaps %%xmm0, (%3,%0) \n"\ - stereo("movaps %%xmm1, 1024(%3,%0) \n")\ - "add $16, %0 \n"\ - "jl 1b \n"\ - :"+&r"(i), "=&r"(j), "=&r"(k)\ - :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ - :"memory"\ +#define MIX_MISC(stereo) \ + __asm__ volatile ( \ + "1: \n" \ + "movaps (%3, %0), %%xmm0 \n" \ + stereo("movaps %%xmm0, %%xmm1 \n") \ + "mulps %%xmm4, %%xmm0 \n" \ + stereo("mulps %%xmm5, %%xmm1 \n") \ + "lea 1024(%3, %0), %1 \n" \ + "mov %5, %2 \n" \ + "2: \n" \ + "movaps (%1), %%xmm2 \n" \ + stereo("movaps %%xmm2, %%xmm3 \n") \ + "mulps (%4, %2), %%xmm2 \n" \ + stereo("mulps 16(%4, %2), %%xmm3 \n") \ + "addps %%xmm2, %%xmm0 \n" \ + stereo("addps %%xmm3, %%xmm1 \n") \ + "add $1024, %1 \n" \ + "add $32, %2 \n" \ + "jl 2b \n" \ + "movaps %%xmm0, (%3, %0) \n" \ + stereo("movaps %%xmm1, 1024(%3, %0) \n") \ + "add $16, %0 \n" \ + "jl 1b \n" \ + : "+&r"(i), "=&r"(j), "=&r"(k) \ + : "r"(samples[0] + len), "r"(matrix_simd + in_ch), \ + "g"((intptr_t) - 32 * (in_ch - 1)) \ + : "memory" \ ); static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], @@ -2295,19 +2310,19 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], } else { DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; j = 2 * in_ch * sizeof(float); - __asm__ volatile( - "1: \n" - "sub $8, %0 \n" - "movss (%2,%0), %%xmm4 \n" - "movss 4(%2,%0), %%xmm5 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "shufps $0, %%xmm5, %%xmm5 \n" - "movaps %%xmm4, (%1,%0,4) \n" - "movaps %%xmm5, 16(%1,%0,4) \n" - "jg 1b \n" - :"+&r"(j) - :"r"(matrix_simd), "r"(matrix) - :"memory" + __asm__ volatile ( + "1: \n" + "sub $8, %0 \n" + "movss (%2, %0), %%xmm4 \n" + "movss 4(%2, %0), %%xmm5 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "shufps $0, %%xmm5, %%xmm5 \n" + "movaps %%xmm4, (%1, %0, 4) \n" + "movaps %%xmm5, 16(%1, %0, 4) \n" + "jg 1b \n" + : "+&r"(j) + : "r"(matrix_simd), "r"(matrix) + : "memory" ); if (out_ch == 2) { MIX_MISC(IF1); @@ -2321,20 +2336,20 @@ static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len) { x86_reg i = (len - 4) * 4; - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "movq %%mm0, (%1,%0) \n\t" - "movq %%mm1, 8(%1,%0) \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - "femms \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1) - :"memory" + __asm__ volatile ( + "1: \n\t" + "movq (%2, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "pfmul (%3, %0), %%mm0 \n\t" + "pfmul 8(%3, %0), %%mm1 \n\t" + "movq %%mm0, (%1, %0) \n\t" + "movq %%mm1, 8(%1, %0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + "femms \n\t" + : "+r"(i) + : "r"(dst), "r"(src0), "r"(src1) + : "memory" ); } @@ -2342,19 +2357,19 @@ static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len) { x86_reg 
i = (len - 8) * 4; - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "movaps %%xmm0, (%1,%0) \n\t" - "movaps %%xmm1, 16(%1,%0) \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1) - :"memory" + __asm__ volatile ( + "1: \n\t" + "movaps (%2, %0), %%xmm0 \n\t" + "movaps 16(%2, %0), %%xmm1 \n\t" + "mulps (%3, %0), %%xmm0 \n\t" + "mulps 16(%3, %0), %%xmm1 \n\t" + "movaps %%xmm0, (%1, %0) \n\t" + "movaps %%xmm1, 16(%1, %0) \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + : "+r"(i) + : "r"(dst), "r"(src0), "r"(src1) + : "memory" ); } @@ -2362,42 +2377,42 @@ static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len) { x86_reg i = len * 4 - 16; - __asm__ volatile( - "1: \n\t" - "pswapd 8(%1), %%mm0 \n\t" - "pswapd (%1), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "movq %%mm0, (%2,%0) \n\t" - "movq %%mm1, 8(%2,%0) \n\t" - "add $16, %1 \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(src1) - :"r"(dst), "r"(src0) + __asm__ volatile ( + "1: \n\t" + "pswapd 8(%1), %%mm0 \n\t" + "pswapd (%1), %%mm1 \n\t" + "pfmul (%3, %0), %%mm0 \n\t" + "pfmul 8(%3, %0), %%mm1 \n\t" + "movq %%mm0, (%2, %0) \n\t" + "movq %%mm1, 8(%2, %0) \n\t" + "add $16, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + : "+r"(i), "+r"(src1) + : "r"(dst), "r"(src0) ); - __asm__ volatile("femms"); + __asm__ volatile ("femms"); } static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len) { x86_reg i = len * 4 - 32; - __asm__ volatile( - "1: \n\t" - "movaps 16(%1), %%xmm0 \n\t" - "movaps (%1), %%xmm1 \n\t" - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" - "shufps $0x1b, %%xmm1, %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "movaps %%xmm0, (%2,%0) \n\t" - "movaps %%xmm1, 16(%2,%0) \n\t" - "add $32, %1 \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(src1) - :"r"(dst), "r"(src0) + __asm__ volatile ( + "1: \n\t" + "movaps 16(%1), %%xmm0 \n\t" + "movaps (%1), %%xmm1 \n\t" + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" + "shufps $0x1b, %%xmm1, %%xmm1 \n\t" + "mulps (%3, %0), %%xmm0 \n\t" + "mulps 16(%3, %0), %%xmm1 \n\t" + "movaps %%xmm0, (%2, %0) \n\t" + "movaps %%xmm1, 16(%2, %0) \n\t" + "add $32, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + : "+r"(i), "+r"(src1) + : "r"(dst), "r"(src0) ); } @@ -2405,44 +2420,44 @@ static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1, const float *src2, int len) { x86_reg i = (len - 4) * 4; - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "pfadd (%4,%0), %%mm0 \n\t" - "pfadd 8(%4,%0), %%mm1 \n\t" - "movq %%mm0, (%1,%0) \n\t" - "movq %%mm1, 8(%1,%0) \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1), "r"(src2) - :"memory" + __asm__ volatile ( + "1: \n\t" + "movq (%2, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "pfmul (%3, %0), %%mm0 \n\t" + "pfmul 8(%3, %0), %%mm1 \n\t" + "pfadd (%4, %0), %%mm0 \n\t" + "pfadd 8(%4, %0), %%mm1 \n\t" + "movq %%mm0, (%1, %0) \n\t" + "movq %%mm1, 8(%1, %0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + : "+r"(i) + : "r"(dst), "r"(src0), "r"(src1), "r"(src2) + : "memory" ); - __asm__ volatile("femms"); + __asm__ volatile ("femms"); } static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1, const float 
*src2, int len) { x86_reg i = (len - 8) * 4; - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "addps (%4,%0), %%xmm0 \n\t" - "addps 16(%4,%0), %%xmm1 \n\t" - "movaps %%xmm0, (%1,%0) \n\t" - "movaps %%xmm1, 16(%1,%0) \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1), "r"(src2) - :"memory" + __asm__ volatile ( + "1: \n\t" + "movaps (%2, %0), %%xmm0 \n\t" + "movaps 16(%2, %0), %%xmm1 \n\t" + "mulps (%3, %0), %%xmm0 \n\t" + "mulps 16(%3, %0), %%xmm1 \n\t" + "addps (%4, %0), %%xmm0 \n\t" + "addps 16(%4, %0), %%xmm1 \n\t" + "movaps %%xmm0, (%1, %0) \n\t" + "movaps %%xmm1, 16(%1, %0) \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + : "+r"(i) + : "r"(dst), "r"(src0), "r"(src1), "r"(src2) + : "memory" ); } @@ -2453,29 +2468,29 @@ static void vector_fmul_window_3dnow2(float *dst, const float *src0, { x86_reg i = -len * 4; x86_reg j = len * 4 - 8; - __asm__ volatile( - "1: \n" - "pswapd (%5,%1), %%mm1 \n" - "movq (%5,%0), %%mm0 \n" - "pswapd (%4,%1), %%mm5 \n" - "movq (%3,%0), %%mm4 \n" - "movq %%mm0, %%mm2 \n" - "movq %%mm1, %%mm3 \n" - "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] - "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] - "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] - "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] - "pfadd %%mm3, %%mm2 \n" - "pfsub %%mm0, %%mm1 \n" - "pswapd %%mm2, %%mm2 \n" - "movq %%mm1, (%2,%0) \n" - "movq %%mm2, (%2,%1) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - "femms \n" - :"+r"(i), "+r"(j) - :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) + __asm__ volatile ( + "1: \n" + "pswapd (%5, %1), %%mm1 \n" + "movq (%5, %0), %%mm0 \n" + "pswapd (%4, %1), %%mm5 \n" + "movq (%3, %0), %%mm4 \n" + "movq %%mm0, %%mm2 \n" + "movq %%mm1, %%mm3 \n" + "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i] + "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j] + "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j] + "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i] + "pfadd %%mm3, %%mm2 \n" + "pfsub %%mm0, %%mm1 \n" + "pswapd %%mm2, %%mm2 \n" + "movq %%mm1, (%2, %0) \n" + "movq %%mm2, (%2, %1) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + "femms \n" + : "+r"(i), "+r"(j) + : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) ); } @@ -2484,30 +2499,30 @@ static void vector_fmul_window_sse(float *dst, const float *src0, { x86_reg i = -len * 4; x86_reg j = len * 4 - 16; - __asm__ volatile( - "1: \n" - "movaps (%5,%1), %%xmm1 \n" - "movaps (%5,%0), %%xmm0 \n" - "movaps (%4,%1), %%xmm5 \n" - "movaps (%3,%0), %%xmm4 \n" - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "shufps $0x1b, %%xmm5, %%xmm5 \n" - "movaps %%xmm0, %%xmm2 \n" - "movaps %%xmm1, %%xmm3 \n" - "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] - "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] - "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] - "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] - "addps %%xmm3, %%xmm2 \n" - "subps %%xmm0, %%xmm1 \n" - "shufps $0x1b, %%xmm2, %%xmm2 \n" - "movaps %%xmm1, (%2,%0) \n" - "movaps %%xmm2, (%2,%1) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - :"+r"(i), "+r"(j) - :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) + __asm__ volatile ( + "1: \n" + "movaps (%5, %1), %%xmm1 \n" + "movaps (%5, %0), %%xmm0 \n" + "movaps (%4, %1), %%xmm5 \n" + "movaps (%3, %0), %%xmm4 \n" + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "shufps $0x1b, %%xmm5, %%xmm5 \n" + "movaps %%xmm0, %%xmm2 \n" + "movaps %%xmm1, 
%%xmm3 \n" + "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i] + "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j] + "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j] + "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i] + "addps %%xmm3, %%xmm2 \n" + "subps %%xmm0, %%xmm1 \n" + "shufps $0x1b, %%xmm2, %%xmm2 \n" + "movaps %%xmm1, (%2, %0) \n" + "movaps %%xmm2, (%2, %1) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + : "+r"(i), "+r"(j) + : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) ); } #endif /* HAVE_6REGS */ @@ -2516,33 +2531,33 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max, int len) { x86_reg i = (len - 16) * 4; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "movss %4, %%xmm5 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "shufps $0, %%xmm5, %%xmm5 \n" - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel - "movaps 16(%2,%0), %%xmm1 \n\t" - "movaps 32(%2,%0), %%xmm2 \n\t" - "movaps 48(%2,%0), %%xmm3 \n\t" - "maxps %%xmm4, %%xmm0 \n\t" - "maxps %%xmm4, %%xmm1 \n\t" - "maxps %%xmm4, %%xmm2 \n\t" - "maxps %%xmm4, %%xmm3 \n\t" - "minps %%xmm5, %%xmm0 \n\t" - "minps %%xmm5, %%xmm1 \n\t" - "minps %%xmm5, %%xmm2 \n\t" - "minps %%xmm5, %%xmm3 \n\t" - "movaps %%xmm0, (%1,%0) \n\t" - "movaps %%xmm1, 16(%1,%0) \n\t" - "movaps %%xmm2, 32(%1,%0) \n\t" - "movaps %%xmm3, 48(%1,%0) \n\t" - "sub $64, %0 \n\t" - "jge 1b \n\t" - :"+&r"(i) - :"r"(dst), "r"(src), "m"(min), "m"(max) - :"memory" + __asm__ volatile ( + "movss %3, %%xmm4 \n\t" + "movss %4, %%xmm5 \n\t" + "shufps $0, %%xmm4, %%xmm4 \n\t" + "shufps $0, %%xmm5, %%xmm5 \n\t" + "1: \n\t" + "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel + "movaps 16(%2, %0), %%xmm1 \n\t" + "movaps 32(%2, %0), %%xmm2 \n\t" + "movaps 48(%2, %0), %%xmm3 \n\t" + "maxps %%xmm4, %%xmm0 \n\t" + "maxps %%xmm4, %%xmm1 \n\t" + "maxps %%xmm4, %%xmm2 \n\t" + "maxps %%xmm4, %%xmm3 \n\t" + "minps %%xmm5, %%xmm0 \n\t" + "minps %%xmm5, %%xmm1 \n\t" + "minps %%xmm5, %%xmm2 \n\t" + "minps %%xmm5, %%xmm3 \n\t" + "movaps %%xmm0, (%1, %0) \n\t" + "movaps %%xmm1, 16(%1, %0) \n\t" + "movaps %%xmm2, 32(%1, %0) \n\t" + "movaps %%xmm3, 48(%1, %0) \n\t" + "sub $64, %0 \n\t" + "jge 1b \n\t" + : "+&r"(i) + : "r"(dst), "r"(src), "m"(min), "m"(max) + : "memory" ); }