Complete SIMD support for 10 and 12 bit.

Commit acb2fd3d6fd572db537bd8e7401538a94ebc9301 (1 parent: 1e8ec51). Arild Fuldseth (arilfuld) committed with Thomas Davies, Nov 7, 2016.
@@ -30,8 +30,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdint.h>
void TEMPLATE(block_avg_simd)(SAMPLE *p,SAMPLE *r0, SAMPLE *r1, int sp, int s0, int s1, int width, int height);
int TEMPLATE(sad_calc_simd_unaligned)(SAMPLE *a, SAMPLE *b, int astride, int bstride, int width, int height);
-void TEMPLATE(get_inter_prediction_luma_simd)(int width, int height, int xoff, int yoff, SAMPLE *restrict qp, int qstride, const SAMPLE *restrict ip, int istride, int bipred);
-void TEMPLATE(get_inter_prediction_chroma_simd)(int width, int height, int xoff, int yoff, SAMPLE *restrict qp, int qstride, const SAMPLE *restrict ip, int istride);
+void TEMPLATE(get_inter_prediction_luma_simd)(int width, int height, int xoff, int yoff, SAMPLE *restrict qp, int qstride, const SAMPLE *restrict ip, int istride, int bipred, int bitdepth);
+void TEMPLATE(get_inter_prediction_chroma_simd)(int width, int height, int xoff, int yoff, SAMPLE *restrict qp, int qstride, const SAMPLE *restrict ip, int istride, int bitdepth);
void transform_simd(const int16_t *block, int16_t *coeff, int size, int fast, int bitdepth);
void inverse_transform_simd(const int16_t *coeff, int16_t *block, int size, int bitdepth);
void TEMPLATE(clpf_block4)(const SAMPLE *src, SAMPLE *dst, int stride, int x0, int y0, int width, int height, unsigned int strength);
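Both inter-prediction entry points gain a trailing bitdepth argument and keep being declared through the TEMPLATE() naming macro, so one prototype serves the 8-bit and the 10/12-bit builds. The macro itself is outside this diff; a minimal sketch of the suffixing convention it implies (the _lbd names are visible in calls removed further down, while the _hbd suffix and the SAMPLE widths are assumptions):

/* Sketch only, not part of this commit: the templated sources are
   assumed to be compiled twice, once per sample width, with the
   macro appending the suffix used by the call sites in this diff. */
#ifdef HBD
typedef uint16_t SAMPLE;            /* 10/12-bit samples (assumed) */
#define TEMPLATE(name) name##_hbd   /* assumed high-bit-depth suffix */
#else
typedef uint8_t SAMPLE;             /* 8-bit samples (assumed) */
#define TEMPLATE(name) name##_lbd   /* matches the _lbd calls removed below */
#endif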
@@ -110,12 +110,8 @@ static void get_inter_prediction_chroma(SAMPLE *pblock, SAMPLE *ref, int width,
return;
}
-#ifndef HBD
if (use_simd && width > 2)
- get_inter_prediction_chroma_simd_lbd(width, height, hor_frac, ver_frac, pblock, pstride, ref + ver_int*stride + hor_int, stride);
-#else
- if (0) {} // TODO: HBD SIMD
-#endif
+ TEMPLATE(get_inter_prediction_chroma_simd)(width, height, hor_frac, ver_frac, pblock, pstride, ref + ver_int*stride + hor_int, stride, bitdepth);
else {
/* Horizontal filtering */
for(i=-1;i<height+2;i++){
@@ -165,13 +161,9 @@ void TEMPLATE(get_inter_prediction_luma)(SAMPLE *pblock, SAMPLE *ref, int width,
return;
}
-#ifndef HBD
if (use_simd)
- get_inter_prediction_luma_simd_lbd(width, height, hor_frac, ver_frac, pblock, pstride, ref + ver_int*stride + hor_int, stride, bipred);
+ TEMPLATE(get_inter_prediction_luma_simd)(width, height, hor_frac, ver_frac, pblock, pstride, ref + ver_int*stride + hor_int, stride, bipred, bitdepth);
/* Special lowpass filter at center position */
-#else
- if (0) {} // TODO: HBD SIMD
-#endif
else if (ver_frac == 2 && hor_frac == 2 && bipred < 2) {
for(i=0;i<height;i++){
for (j=0;j<width;j++){
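Both dispatch sites now pass the frame's bitdepth straight into the shared SIMD kernels instead of hitting the removed "if (0) {} // TODO: HBD SIMD" stubs. The kernels need the value chiefly to clamp interpolated samples to the legal range for the current depth; a scalar model of that last step (an assumption about the kernel internals, not code from this commit):

/* Scalar sketch (assumed): after filtering and rounding, each sample
   is clipped to [0, (1 << bitdepth) - 1], i.e. 255, 1023 or 4095. */
static inline int clip_sample(int v, int bitdepth) {
  const int maxval = (1 << bitdepth) - 1;
  return v < 0 ? 0 : (v > maxval ? maxval : v);
}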
@@ -78,7 +78,7 @@ SIMD_INLINE void thor_free(void *p)
free(((void**)p)[-1]);
}
-#elif __GNUC__
+#elif defined(__GNUC__) && !defined(__clang__)
#include <alloca.h>
SIMD_INLINE unsigned int log2i(uint32_t x)
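The tightened preprocessor guard matters because clang also defines __GNUC__, so without the extra check clang builds would fall into the GCC-specific branch. Whichever branch a compiler ends up in, the thor_free() shown above expects the original allocation pointer to sit one slot below the returned block; a portable sketch of an allocator honouring that layout (an illustration, not code from this commit):

#include <stdint.h>
#include <stdlib.h>

/* Portable aligned-allocation sketch: over-allocate, round the address
   up to the requested alignment, and stash the raw pointer one slot
   below the aligned block so that free(((void **)p)[-1]) frees it. */
static void *thor_alloc_sketch(size_t size, size_t align) {
  void *raw = malloc(size + align + sizeof(void *));
  if (!raw) return NULL;
  uintptr_t aligned = ((uintptr_t)raw + sizeof(void *) + align - 1) & ~(uintptr_t)(align - 1);
  ((void **)aligned)[-1] = raw;
  return (void *)aligned;
}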
@@ -74,6 +74,7 @@ SIMD_INLINE ssd128_internal_u16 v128_ssd_u16_init() { return c_v128_ssd_u16_init
SIMD_INLINE ssd128_internal_u16 v128_ssd_u16(ssd128_internal_u16 s, v128 a, v128 b) { return c_v128_ssd_u16(s, a, b); }
SIMD_INLINE uint32_t v128_ssd_u16_sum(ssd128_internal_u16 s) { return c_v128_ssd_u16_sum(s); }
SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { return c_v128_dotp_s16(a, b); }
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { return c_v128_dotp_s32(a, b); }
SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
@@ -121,6 +121,11 @@ SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ int64x2_t t = vpaddlq_s32(vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+ return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+}
+
SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
return vget_lane_s32(vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
@@ -696,19 +701,17 @@ SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
return v128_zero();
}
-/* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum().
- The result for more than 16 v128_sad_u16() for 12 bit input calls is undefined. */
+/* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum(). */
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, v128 b) {
- return v128_add_16(s, v128_abs_s16(v128_sub_16(a, b)));
+ return v128_add_32(s, v128_padd_s16(v128_abs_s16(v128_sub_16(a, b))));
}
SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
- v128 t = v128_padd_s16(s);
return
- v128_low_u32(t) +
- v128_low_u32(v128_shr_n_byte(t, 4)) +
- v128_low_u32(v128_shr_n_byte(t, 8)) +
- v128_low_u32(v128_shr_n_byte(t, 12));
+ v128_low_u32(s) +
+ v128_low_u32(v128_shr_n_byte(s, 4)) +
+ v128_low_u32(v128_shr_n_byte(s, 8)) +
+ v128_low_u32(v128_shr_n_byte(s, 12));
}
typedef v128 ssd128_internal_u16;
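This accumulator change is what lets the 12-bit restriction in the old comment be dropped: each absolute difference of 12-bit samples can reach 4095, so a 16-bit lane could wrap after only floor(65535 / 4095) = 16 accumulations, whereas pairwise-adding into 32-bit lanes leaves room for well over 260 000 calls per lane (2^31 / 8190). A scalar model of the new per-call step (an illustrative sketch, not code from this commit):

#include <stdint.h>
#include <stdlib.h>

/* Scalar model: each call folds eight 16-bit absolute differences into
   four 32-bit lanes, which v128_sad_u16_sum() then adds together. */
static void sad_u16_accumulate(uint32_t acc[4], const uint16_t a[8], const uint16_t b[8]) {
  for (int i = 0; i < 4; i++) {
    int d0 = abs((int)a[2 * i]     - (int)b[2 * i]);
    int d1 = abs((int)a[2 * i + 1] - (int)b[2 * i + 1]);
    acc[i] += (uint32_t)(d0 + d1);
  }
}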
@@ -152,6 +152,12 @@ SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
return c_v64_dotp_s16(a.v64[1], b.v64[1]) + c_v64_dotp_s16(a.v64[0], b.v64[0]);
}
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+ return
+ (int64_t)(a.s32[3] * b.s32[3]) + (int64_t)(a.s32[2] * b.s32[2]) +
+ (int64_t)(a.s32[1] * b.s32[1]) + (int64_t)(a.s32[0] * b.s32[0]);
+}
+
SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}
@@ -754,8 +760,7 @@ SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() {
return 0;
}
-/* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum().
- The result for more than 16 v128_sad_u16() for 12 bit input calls is undefined. */
+/* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum(). */
SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s, c_v128 a, c_v128 b) {
int c;
for (c = 0; c < 8; c++)
@@ -441,6 +441,14 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
#endif
}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ v128 r = v128_mullo_s32(a, b);
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
return _mm_madd_epi16(a, b);
}
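All of the new v128_dotp_s32 variants (the generic C and NEON versions above and the SSE version here) multiply in 32 bits and only widen to 64 bits for the horizontal sum, so each per-lane product is effectively truncated modulo 2^32 before being added. A scalar statement of that contract (inferred from the implementations, not a documented guarantee):

#include <stdint.h>

/* Scalar model: four products truncated to 32 bits, then sign-extended
   and summed into a 64-bit result. */
static int64_t dotp_s32_model(const int32_t a[4], const int32_t b[4]) {
  int64_t sum = 0;
  for (int i = 0; i < 4; i++)
    sum += (int32_t)((uint32_t)a[i] * (uint32_t)b[i]);
  return sum;
}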
@@ -619,19 +627,17 @@ SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
return v128_zero();
}
-/* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum().
- The result for more than 16 v128_sad_u16() for 12 bit input calls is undefined. */
+/* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum(). */
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, v128 b) {
- return v128_add_16(s, v128_abs_s16(v128_sub_16(a, b)));
+ return v128_add_32(s, v128_padd_s16(v128_abs_s16(v128_sub_16(a, b))));
}
SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
- v128 t = v128_padd_s16(s);
- return
- v128_low_u32(t) +
- v128_low_u32(v128_shr_n_byte(t, 4)) +
- v128_low_u32(v128_shr_n_byte(t, 8)) +
- v128_low_u32(v128_shr_n_byte(t, 12));
+ return
+ v128_low_u32(s) +
+ v128_low_u32(v128_shr_n_byte(s, 4)) +
+ v128_low_u32(v128_shr_n_byte(s, 8)) +
+ v128_low_u32(v128_shr_n_byte(s, 12));
}
typedef v128 ssd128_internal_u16;
@@ -107,6 +107,9 @@ SIMD_INLINE uint32_t v256_ssd_u16_sum(ssd256_internal_u16 s) {
SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
return c_v256_dotp_s16(a, b);
}
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return c_v256_dotp_s32(a, b);
+}
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
@@ -149,6 +149,11 @@ SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
c_v128_dotp_s16(a.v128[0], b.v128[0]);
}
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}
@@ -112,6 +112,10 @@ SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
}
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return v128_dotp_s32(a.hi, b.hi) + v128_dotp_s32(a.lo, b.lo);
+}
+
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
}
@@ -332,6 +332,29 @@ SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
#endif
}
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ v256 r = _mm256_mullo_epi32(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
v128 lo = v256_low_v128(t);
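With the 256-bit variant wired through every backend, a signed 32-bit dot product over eight lanes is available regardless of the target. A hypothetical use is accumulating squared 32-bit residuals (the helper name is assumed; the result is exact as long as |residual| <= 46340, since larger values would overflow the 32-bit per-lane product):

/* Hypothetical helper, not from this commit: sum of squares of the
   eight int32 lanes held in one v256 register. */
static inline int64_t ssd8_s32(v256 residual) {
  return v256_dotp_s32(residual, residual);
}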
@@ -47,6 +47,7 @@ SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { return c_v64_from_16(a, b, c, d); }
+SIMD_INLINE uint32_t u32_zero() { return c_u32_zero(); }
SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { return c_u32_load_unaligned(p); }
SIMD_INLINE uint32_t u32_load_aligned(const void *p) { return c_u32_load_aligned(p); }
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { c_u32_store_unaligned(p, a); }
@@ -67,6 +67,8 @@ SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+SIMD_INLINE uint32_t u32_zero() { return 0; }
+
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t *)p);
}
@@ -104,6 +104,8 @@ SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
+SIMD_INLINE uint32_t c_u32_zero() { return 0; }
+
SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
uint32_t t;
uint8_t *pp = (uint8_t*)p;
@@ -72,6 +72,7 @@ SIMD_INLINE uint64_t v64_u64(v64 x) {
}
+SIMD_INLINE uint32_t u32_zero() { return 0; }
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t*)p);
@@ -959,7 +959,7 @@ unsigned int TEMPLATE(sad_calc_fastquarter_simd)(const SAMPLE *po, const SAMPLE
}
#ifndef HBD
-int calc_cbp_simd(int16_t *block, int size, int threshold) {
+int calc_cbp_simd(int32_t *block, int size, int threshold) {
int cbp = 0;
if (size ==8 ) {
v256 thr = v256_dup_32(threshold);
@@ -59,7 +59,7 @@ s/v128_mullo_s16/v256_mullo_s32/g
s/v128_mulhi_s16/v256_mulhi_s32/g
s/v128_mullo_s32/v256_mullo_s64/g
s/v128_madd_s16/v256_madd_s32/g
-s/v128_madd_us8/v256_madd_us16/g
+s/v128_madd_us8/v256_madd_s16/g
s/v128_avg_u8/v256_avg_u16/g
s/v128_rdavg_u8/v256_rdavg_u16/g
s/v128_avg_u16/v256_avg_u32/g
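The corrected substitution maps v128_madd_us8 (multiply unsigned 8-bit by signed 8-bit values and add adjacent pairs into 16 bits) to v256_madd_s16 rather than to v256_madd_us16, a name this intrinsics layer does not appear to provide. The signed 16-bit multiply-add gives the same result whenever the formerly unsigned operands stay within the positive 16-bit range. A scalar model of the operation the corrected mapping selects, per output lane (an interpretation of the rename, not code from this commit):

#include <stdint.h>

/* Scalar model of madd_s16 for one output lane: two signed 16-bit
   products summed into a signed 32-bit result. */
static int32_t madd_s16_pair(int16_t a0, int16_t b0, int16_t a1, int16_t b1) {
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}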
@@ -240,5 +240,23 @@ s/u32_load_unaligned/v64_load_unaligned/g
s/u32_load_aligned/v64_load_aligned/g
s/u32_store_unaligned/v64_store_unaligned/g
s/u32_store_aligned/v64_store_aligned/g
+s/u32_zero/v64_zero/g
s/uint32_t/v64/g
' $dst
+
+sed -i '
+s/uint16_t/uint32_t/g
+s/int16_t/int32_t/g
+' $dst
+
+sed -i '
+s/quote64_/v64_/g
+s/quote128_/v128_/g
+s/quote256_/v256_/g
+' $dst
+
+sed -i '
+s/quote64 /v64 /g
+s/quote128 /v128 /g
+s/quote256 /v256 /g
+' $dst
