
Finish up the port:

	. Note that it works on CPUs with at least the SSSE3
	  instruction set -- the original code assumes SSE4.1,
	  but that was relatively easy to patch (see the sketch
	  after this list)
	. Do not attempt to build if SSSE3 is not among the CPU options
	. Fix up formatting warnings (reported upstream)
	. Fix a crash on i386. Unfortunately, 5 of the 40
	  self-tests still fail on i386 -- the problem is reported
	  upstream
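
A minimal standalone sketch of the SSE4.1-to-SSE2 substitution mentioned
above (my illustration, not part of the commit): zeroing the first 32-bit
lane with an AND mask gives the same result as _mm_insert_epi32(v, 0, 0).
Compile with -msse4.1 to compare the two forms side by side.

    #include <smmintrin.h>  // SSE4.1, for _mm_insert_epi32
    #include <cstdint>
    #include <cstdio>

    int main() {
        __m128i v = _mm_set_epi32(4, 3, 2, 1);
        __m128i a = _mm_insert_epi32(v, 0, 0);                       // SSE4.1 form
        __m128i b = _mm_and_si128(v, _mm_set_epi32(-1, -1, -1, 0));  // SSE2 form
        int32_t ra[4], rb[4];
        _mm_storeu_si128((__m128i*)ra, a);
        _mm_storeu_si128((__m128i*)rb, b);
        for (int i = 0; i < 4; ++i)
            std::printf("%d %d\n", ra[i], rb[i]);  // lanes must match pairwise
        return 0;
    }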
UnitedMarsupials authored and danielrh committed Jul 17, 2016
1 parent acfb6ae commit c5fd50554f5942efefd38407978fa5763de5e4ad
Showing with 106 additions and 41 deletions.
  1. +1 −0 AUTHORS
  2. +28 −14 src/lepton/idct.cc
  3. +1 −1 src/lepton/recoder.cc
  4. +26 −21 src/lepton/vp8_encoder.cc
  5. +1 −3 src/vp8/model/model.hh
  6. +2 −2 src/vp8/model/numeric.hh
  7. +47 −0 src/vp8/util/mm_mullo_epi32.hh
@@ -4,3 +4,4 @@ Keith Winstein
Chris Lesniewski
Mario Brito
Matthias Stirner
+Mikhail T
@@ -1,8 +1,7 @@
/* -*-mode:c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#include <emmintrin.h>
-#include <smmintrin.h>
-#include <immintrin.h>
#include "../vp8/util/aligned_block.hh"
#include "../vp8/util/mm_mullo_epi32.hh"
namespace idct_local{
enum {
@@ -23,7 +22,10 @@ enum {
r2 = 181 // 256/sqrt(2)
};
}
-void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) {
+#ifndef __SSE2__
+static void
+idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) {
int32_t intermed[64];
using namespace idct_local;
// Horizontal 1-D IDCT.
@@ -149,6 +151,8 @@ void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[6
//outp[i]>>=3;
}
}
+#else /* At least SSE2 is available { */
template<int which_vec, int offset, int stride> __m128i vget_raster(const AlignedBlock&block) {
return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset),
block.coefficients_raster(which_vec + 2 * stride + offset),
@@ -162,8 +166,8 @@ template<int offset, int stride> __m128i vquantize(int which_vec, __m128i vec, c
q[which_vec + offset]));
}
-__m128i epi32l_to_epi16(__m128i lowvec) {
+static __m128i
+epi32l_to_epi16(__m128i lowvec) {
return _mm_shuffle_epi8(lowvec, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
}
@@ -181,9 +185,8 @@ __m128i epi32l_to_epi16(__m128i lowvec) {
}while(0)
-void idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+static void
+idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
char vintermed_storage[64 * sizeof(int32_t) + 16];
// align intermediate storage to 16 bytes
@@ -202,7 +205,12 @@ void idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64]
xv6 = vget_raster<0, 5, 8>(block);
xv7 = vget_raster<0, 3, 8>(block);
if (__builtin_expect(ignore_dc, true)) {
+#ifdef __SSE4_1__
xv0 = _mm_insert_epi32(xv0, 0, 0);
+#else
+// See http://stackoverflow.com/questions/38384520/is-there-a-sse2-equivalent-for-mm-insert-epi32
+xv0 = _mm_and_si128(xv0, _mm_set_epi32(-1,-1,-1, 0));
+#endif
}
} else {
xv0 = vget_raster<32, 0, 8>(block);
@@ -378,7 +386,8 @@ __m128i m256_to_epi16(__m256i vec) {
}*/
#if __AVX2__
-void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+static void
+idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
// align intermediate storage to 16 bytes
using namespace idct_local;
// Horizontal 1-D IDCT.
@@ -589,11 +598,16 @@ void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64]
#endif
}
}
-#else
-void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-idct_sse(block, q, voutp, ignore_dc);
-}
-#endif
-void idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+#endif /* } SSE2 or higher is available */
+void
+idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+#ifdef __AVX2__
+idct_avx(block, q, voutp, ignore_dc);
+#elif __SSE2__
+idct_sse(block, q, voutp, ignore_dc);
+#else
+idct_scalar(block, q, voutp, ignore_dc);
+#endif
}
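
For reference (my sketch, not from the commit): the byte shuffle in
epi32l_to_epi16 above gathers the low 16 bits of each of the four 32-bit
lanes into the low 64 bits and zeroes the upper half. _mm_shuffle_epi8 is
an SSSE3 instruction, which is one reason the port needs SSSE3 rather than
plain SSE2.

    #include <tmmintrin.h>  // SSSE3, for _mm_shuffle_epi8
    #include <cstdint>
    #include <cstdio>

    int main() {
        __m128i in = _mm_set_epi32(4, -3, 2, -1);  // values assumed to fit in int16
        __m128i out = _mm_shuffle_epi8(in,
            _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                         0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
        int16_t r[8];
        _mm_storeu_si128((__m128i*)r, out);
        for (int i = 0; i < 8; ++i)
            std::printf("%d ", r[i]);  // prints: -1 2 -3 4 0 0 0 0
        std::printf("\n");
        return 0;
    }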
@@ -98,7 +98,7 @@ int find_aligned_end_64(const int16_t *block) {
#endif
static bool aligned_memchr16ff(const unsigned char *local_huff_data) {
-#if 1
+#if !defined(__i386__)
__m128i buf = _mm_load_si128((__m128i const*)local_huff_data);
__m128i ff = _mm_set1_epi8(-1);
__m128i res = _mm_cmpeq_epi8(buf, ff);
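
The #else branch of this function is not shown in the hunk; presumably it
scans the block byte-wise on i386. A hypothetical scalar equivalent (my
sketch, not the committed fallback) of what aligned_memchr16ff checks --
whether any of the 16 bytes equals 0xFF:

    #include <cstring>

    static bool scalar_memchr16ff(const unsigned char *local_huff_data) {
        return std::memchr(local_huff_data, 0xff, 16) != nullptr;
    }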
@@ -150,29 +150,34 @@ void VP8ComponentEncoder::process_row(ProbabilityTablesBase &pt,
}
}
uint32_t aligned_block_cost(const AlignedBlock &block) {
-    uint32_t cost = 16; // .25 cost for zeros
-    if (VECTORIZE) {
-        for (int i = 0; i < 64; i+= 8) {
-            __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i)));
-            __m128i v_cost = _mm_set1_epi16(0);
-            while (!_mm_test_all_zeros(val, val)) {
-                __m128i mask = _mm_cmpgt_epi16(val, _mm_setzero_si128());
-                v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2)));
-                val = _mm_srli_epi16(val, 1);
-            }
-            __m128i sum = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8));
-            sum = _mm_add_epi16(sum ,_mm_srli_si128(sum, 4));
-            sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2));
-            cost += _mm_extract_epi16(sum, 0);
-        }
-    } else {
-        uint32_t scost = 0;
-        for (int i = 0; i < 64; ++i) {
-            scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i]));
-        }
-        cost = scost;
-    }
-    return cost;
+#ifdef __SSE2__ /* SSE2 or higher instruction set available { */
+    const __m128i zero = _mm_setzero_si128();
+    __m128i v_cost;
+    for (int i = 0; i < 64; i+= 8) {
+        __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i)));
+        v_cost = _mm_set1_epi16(0);
+#ifndef __SSE4_1__
+        while (_mm_movemask_epi8(_mm_cmpeq_epi32(val, zero)) != 0xFFFF)
+#else
+        while (!_mm_test_all_zeros(val, val))
+#endif
+        {
+            __m128i mask = _mm_cmpgt_epi16(val, zero);
+            v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2)));
+            val = _mm_srli_epi16(val, 1);
+        }
+        v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8));
+        v_cost = _mm_add_epi16(v_cost ,_mm_srli_si128(v_cost, 4));
+        v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 2));
+    }
+    return 16 + _mm_extract_epi16(v_cost, 0);
+#else /* } No SSE2 instructions { */
+    uint32_t scost = 0;
+    for (int i = 0; i < 64; ++i) {
+        scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i]));
+    }
+    return scost;
+#endif /* } */
}
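
A quick check (mine, not from the commit) of the SSE2 stand-in for the
SSE4.1 loop condition above: _mm_cmpeq_epi32 sets every byte of a zero lane
to all-ones, and _mm_movemask_epi8 gathers one bit per byte, so the mask
equals 0xFFFF exactly when every lane of val is zero -- the same answer
_mm_test_all_zeros(val, val) gives.

    #include <emmintrin.h>  // SSE2
    #include <cstdio>

    static bool all_zeros_sse2(__m128i val) {
        const __m128i zero = _mm_setzero_si128();
        // Zero lanes compare to all-ones; movemask collects one bit per byte.
        return _mm_movemask_epi8(_mm_cmpeq_epi32(val, zero)) == 0xFFFF;
    }

    int main() {
        std::printf("%d %d\n",
                    (int)all_zeros_sse2(_mm_setzero_si128()),         // 1
                    (int)all_zeros_sse2(_mm_set_epi32(0, 0, 1, 0)));  // 0
        return 0;
    }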
#ifdef ALLOW_FOUR_COLORS
@@ -11,9 +11,7 @@
#include "branch.hh"
#include "../util/aligned_block.hh"
#include "../util/block_based_image.hh"
-#include <smmintrin.h>
-#include <immintrin.h>
-#include <emmintrin.h>
+#include "../util/mm_mullo_epi32.hh"
class BoolEncoder;
constexpr bool advanced_dc_prediction = true;
@@ -8,8 +8,8 @@
// for std::min
#include <algorithm>
#include <assert.h>
-#include <smmintrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#include "../util/mm_mullo_epi32.hh"
#ifdef _WIN32
#include <intrin.h>
@@ -0,0 +1,47 @@
/**
# $FreeBSD$
# @(#)COPYRIGHT 8.2 (Berkeley) 3/21/94
The compilation of software known as the FreeBSD Ports Collection is
distributed under the following terms:
Copyright (C) 1994-2016 The FreeBSD Project. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
#if defined(__SSE2__) && !defined(__SSE4_1__) && !defined(MM_MULLO_EPI32_H)
#define MM_MULLO_EPI32_H
#include <immintrin.h>
// See: http://stackoverflow.com/questions/10500766/sse-multiplication-of-4-32-bit-integers
// and https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
static inline __m128i
_mm_mullo_epi32(const __m128i &a, const __m128i &b)
{
__m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4),
_mm_srli_si128(b,4)); /* mul 3,1 */
return _mm_unpacklo_epi32( /* shuffle results to [63..0] and pack */
_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)),
_mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0)));
}
#endif
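
A usage sketch for the shim above (my addition; the include path is
assumed, the repo keeps it at src/vp8/util/): built with -msse2 but without
-msse4.1, the emulated _mm_mullo_epi32 should agree with per-lane scalar
multiplication, including for negative operands, since only the low 32 bits
of each 64-bit product are kept.

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>
    #include "mm_mullo_epi32.hh"  // assumed local copy of the header above

    int main() {
        __m128i a = _mm_set_epi32(7, -6, 5, 100000);
        __m128i b = _mm_set_epi32(3, 9, -4, 20000);
        int32_t r[4], x[4], y[4];
        _mm_storeu_si128((__m128i*)x, a);
        _mm_storeu_si128((__m128i*)y, b);
        _mm_storeu_si128((__m128i*)r, _mm_mullo_epi32(a, b));
        for (int i = 0; i < 4; ++i)
            std::printf("%d %d\n", r[i], x[i] * y[i]);  // columns should agree
        return 0;
    }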

1 comment on commit c5fd505

@UnitedMarsupials (Contributor) commented on c5fd505 on Jul 17, 2016
Correction -- the check for __SSE2__ should be replaced with __SSSE3__ -- some of the intrinsics used aren't found on SSE2-only CPUs. Sorry -- just found this out myself on a box with two Opteron 265s.

More specifically, the newly-introduced replacement for _mm_mullo_epi32 will work on SSE2, but the idct_sse, I think, needs SSSE3 as a minimum...
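
A sketch of the guard this comment proposes (my reading, not committed
code): gate the vector paths on __SSSE3__ instead of __SSE2__, since
idct_sse and aligned_block_cost use SSSE3 intrinsics such as
_mm_shuffle_epi8 and _mm_abs_epi16, which an SSE2-only CPU like the
Opteron 265 lacks.

    #include <cstdio>

    #if defined(__SSSE3__)
    static const char *idct_path = "vector path (SSSE3 available)";
    #else
    static const char *idct_path = "scalar fallback (no SSSE3)";
    #endif

    int main() {
        std::puts(idct_path);
        return 0;
    }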
