Skip to content

Commit

Permalink
F16C f16-f32acc rdsum microkernels
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 633140589
  • Loading branch information
alankelly authored and xnnpack-bot committed May 13, 2024
1 parent e5b8377 commit b489d33
Show file tree
Hide file tree
Showing 15 changed files with 2,343 additions and 13 deletions.
4 changes: 4 additions & 0 deletions cmake/gen/f16c_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ SET(ALL_F16C_MICROKERNEL_SRCS
src/f16-avgpool/f16-avgpool-9x-minmax-f16c-c8.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c
src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u8.c
src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c
src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c
Expand Down
4 changes: 4 additions & 0 deletions gen/f16c_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ ALL_F16C_MICROKERNEL_SRCS = [
"src/f16-avgpool/f16-avgpool-9x-minmax-f16c-c8.c",
"src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c",
"src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c",
"src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c",
"src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c",
"src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c",
"src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c",
"src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u8.c",
"src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c",
"src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c",
Expand Down
12 changes: 11 additions & 1 deletion scripts/generate-f16-f32acc-rdsum.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,25 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#################################### NEON ######################################
#################################### NEON #####################################
tools/xngen src/f16-f32acc-rdsum/neon.c.in -D CHANNELS_BATCH=16 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c &
tools/xngen src/f16-f32acc-rdsum/neon.c.in -D CHANNELS_BATCH=32 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c32.c &
tools/xngen src/f16-f32acc-rdsum/neon.c.in -D CHANNELS_BATCH=64 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c64.c &

################################## x86 AVX ####################################
tools/xngen src/f16-f32acc-rdsum/avx.c.in -D CHANNELS_BATCH=16 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c &
tools/xngen src/f16-f32acc-rdsum/avx.c.in -D CHANNELS_BATCH=32 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c &
tools/xngen src/f16-f32acc-rdsum/avx.c.in -D CHANNELS_BATCH=64 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c &
tools/xngen src/f16-f32acc-rdsum/avx.c.in -D CHANNELS_BATCH=128 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c &

################################## x86 AVX512 #################################
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=16 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c16.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=32 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c32.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=64 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c64.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=128 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c128.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=16 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c16.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=32 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c32.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=64 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c64.c &
tools/xngen src/f16-f32acc-rdsum/avx512skx.c.in -D CHANNELS_BATCH=128 -D ACCUMULATORS=7 -o src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c128.c &

wait
144 changes: 144 additions & 0 deletions src/f16-f32acc-rdsum/avx.c.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <immintrin.h>

#include <xnnpack/common.h>
#include <xnnpack/reduce.h>
#include <xnnpack/math.h>


$UNROLL = CHANNELS_BATCH >> 3
void xnn_f16_f32acc_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__f16c_c${CHANNELS_BATCH}(
size_t rows,
size_t channels,
const void* input,
size_t input_stride,
const void* zero,
void* output,
const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(rows != 0);
assert(channels != 0);
assert(input != NULL);
assert(output != NULL);

const __m256 vscale = _mm256_set1_ps(params->avx.scale);

size_t input_increment = ${ACCUMULATORS} * input_stride;
for (; channels >= ${CHANNELS_BATCH}; channels -= ${CHANNELS_BATCH}) {
const uint16_t* i0 = input;
$for ACC in range(1, ACCUMULATORS):
const uint16_t* i${ACC} = (const uint16_t*) ((uintptr_t) input + ${ACC} * input_stride);

$for i in range(UNROLL):
__m256 vacc${i} = _mm256_setzero_ps();

for (int r = rows; r > 0; r -= ${ACCUMULATORS}) {
$for ACC in range(1, ACCUMULATORS, 2):
if XNN_UNPREDICTABLE(r < ${ACC+1}) {
i${ACC} = zero;
}
if XNN_UNPREDICTABLE(r <= ${ACC+1}) {
i${ACC+1} = zero;
}
$for c in range(UNROLL):
__m256 vin${c};
$for j in range(ACCUMULATORS):
$for c in range(UNROLL):
vin${c} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (&i${j}[${c*8}])));
$for c in range(UNROLL):
vacc${c} = _mm256_add_ps(vin${c}, vacc${c});
$for ACC in range(0, ACCUMULATORS):
i${ACC} = (const uint16_t*) ((uintptr_t) i${ACC} + input_increment);
}
$for i in range(UNROLL):
vacc${i} = _mm256_mul_ps(vacc${i}, vscale);

const uint16_t* o = output;
$for i in range(0, UNROLL):
__m256 vo${i} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o)); o = (const void*) ((uintptr_t) o + 8 * sizeof(uint16_t));
$for i in range(0, UNROLL):
vacc${i} = _mm256_add_ps(vo${i}, vacc${i});
$for i in range(0, UNROLL):
_mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vacc${i}, _MM_FROUND_TO_NEAREST_INT)); output = (void*) ((uintptr_t) output + 8 * sizeof(uint16_t));

input = (const uint16_t*) ((uintptr_t) input + ${CHANNELS_BATCH} * sizeof(uint16_t));
}
if (channels != 0) {
input_increment = ${ACCUMULATORS} * input_stride;
const uint16_t* i0 = input;
$for ACC in range(1, ACCUMULATORS):
const uint16_t* i${ACC} = (const uint16_t*) ((uintptr_t) input + ${ACC} * input_stride);
__m256 vacc[${UNROLL}];
$for i in range(UNROLL):
vacc[${i}] = _mm256_setzero_ps();

const size_t num_full_chunks = channels >> 3;
const size_t num_chunks = round_up_po2(channels, 8) >> 3;
const size_t remainder = channels & 0x7;
for (int r = rows; r > 0; r -= ${ACCUMULATORS}) {
$for ACC in range(1, ACCUMULATORS, 2):
if XNN_UNPREDICTABLE(r < ${ACC+1}) {
i${ACC} = zero;
}
if XNN_UNPREDICTABLE(r <= ${ACC+1}) {
i${ACC+1} = zero;
}
for (int i = 0; i < num_full_chunks; ++i) {
$for c in range(ACCUMULATORS):
vacc[i] = _mm256_add_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) &i${c}[i*8])), vacc[i]);
}

if (remainder) {
$for c in range(ACCUMULATORS):
vacc[num_full_chunks] = _mm256_add_ps(vacc[num_full_chunks], _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) &i${c}[num_full_chunks*8])));
}
$for ACC in range(ACCUMULATORS):
i${ACC} = (const uint16_t*) ((uintptr_t) i${ACC} + input_increment);
}
for (size_t i = 0; i < num_chunks; ++i) {
vacc[i] = _mm256_mul_ps(vacc[i], vscale);
}

__m256 vo[${UNROLL}];
const uint16_t* o = output;
for (int i = 0; i < num_full_chunks; ++i) {
vo[i] = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o)); o = (const void*) ((uintptr_t) o + 8 * sizeof(uint16_t));
}
for (int i = 0; i < num_full_chunks; ++i) {
vacc[i] = _mm256_add_ps(vo[i], vacc[i]);
}
for (int i = 0; i < num_full_chunks; ++i) {
_mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vacc[i], _MM_FROUND_TO_NEAREST_INT)); output = (void*) ((uintptr_t) output + 8 * sizeof(uint16_t));
}
if (remainder) {
__m256 vout = vacc[num_full_chunks];
__m128 vout_low = _mm256_castps256_ps128(vout);
if (channels & 4) {
__m128 vo = _mm_cvtph_ps(_mm_loadl_epi64((__m128i*) output));
vo = _mm_add_ps(vout_low, vo);
_mm_storel_epi64((__m128i*) output, _mm_cvtps_ph(vo, _MM_FROUND_TO_NEAREST_INT));
vout_low = _mm256_castps256_ps128(_mm256_permute2f128_ps(vout, vout, 1));
output = (void*) ((uintptr_t) output + 4 * sizeof(uint16_t));
}
if (channels & 2) {
__m128 vo = _mm_cvtph_ps(_mm_loadu_si32(output));
vo = _mm_add_ps(vout_low, vo);
_mm_storeu_si32(output, _mm_cvtps_ph(vo, _MM_FROUND_TO_NEAREST_INT));
vout_low = _mm_movehl_ps(vout_low, vout_low);
output = (void*) ((uintptr_t) output + 2 * sizeof(uint16_t));
}
if (channels & 1) {
__m128 vo = _mm_cvtph_ps(_mm_loadu_si16(output));
vo = _mm_add_ps(vout_low, vo);
_mm_storeu_si16(output, _mm_cvtps_ph(vo, _MM_FROUND_TO_NEAREST_INT));
}
}
}
}
Loading

0 comments on commit b489d33

Please sign in to comment.