Skip to content

Commit

Permalink
Use more defines for IDCT calculations.
Browse files: browse the repository at this point in the history.
  • Loading branch information
foo86 committed Nov 30, 2015
1 parent bdcdf30 commit b35fdcf
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 112 deletions.
21 changes: 15 additions & 6 deletions libdcadec/idct.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,16 +24,25 @@
#define DCT_B_ROWS 8
#define DCT_B_COLS 7

#define MOD_A_SIZE 16
#define MOD_B_SIZE 8
#define MOD_C_SIZE 32
#define IDCT_SIZE 32
#define IDCT_SIZE_2 (IDCT_SIZE / 2)
#define IDCT_SIZE_4 (IDCT_SIZE / 4)

#define MOD_A_SIZE IDCT_SIZE_2
#define MOD_B_SIZE IDCT_SIZE_4
#define MOD_C_SIZE IDCT_SIZE
#define MOD_A_HALF (MOD_A_SIZE / 2)
#define MOD_B_HALF (MOD_B_SIZE / 2)
#define MOD_C_HALF (MOD_C_SIZE / 2)

#define MOD64_A_SIZE 32
#define MOD64_B_SIZE 16
#define MOD64_C_SIZE 64
#define IDCT64_SIZE 64
#define IDCT64_SIZE_2 (IDCT64_SIZE / 2)
#define IDCT64_SIZE_4 (IDCT64_SIZE / 4)
#define IDCT64_SIZE_8 (IDCT64_SIZE / 8)

#define MOD64_A_SIZE IDCT64_SIZE_2
#define MOD64_B_SIZE IDCT64_SIZE_4
#define MOD64_C_SIZE IDCT64_SIZE
#define MOD64_A_HALF (MOD64_A_SIZE / 2)
#define MOD64_B_HALF (MOD64_B_SIZE / 2)
#define MOD64_C_HALF (MOD64_C_SIZE / 2)
Expand Down
122 changes: 61 additions & 61 deletions libdcadec/idct_fixed.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -158,38 +158,38 @@ static void clp_v(int *input, int len)
void idct_perform32_fixed(int * restrict input, int * restrict output)
{
int mag = 0;
for (int i = 0; i < 32; i++)
for (int i = 0; i < IDCT_SIZE; i++)
mag += abs(input[i]);

int shift = mag > 0x400000 ? 2 : 0;
int round = shift > 0 ? 1 << (shift - 1) : 0;

for (int i = 0; i < 32; i++)
for (int i = 0; i < IDCT_SIZE; i++)
input[i] = (input[i] + round) >> shift;

sum_a(input, output + 0, 16);
sum_b(input, output + 16, 16);
clp_v(output, 32);
sum_a(input, output + 0 * IDCT_SIZE_2, IDCT_SIZE_2);
sum_b(input, output + 1 * IDCT_SIZE_2, IDCT_SIZE_2);
clp_v(output, IDCT_SIZE);

sum_a(output + 0, input + 0, 8);
sum_b(output + 0, input + 8, 8);
sum_c(output + 16, input + 16, 8);
sum_d(output + 16, input + 24, 8);
clp_v(input, 32);
sum_a(output + 0 * IDCT_SIZE_2, input + 0 * IDCT_SIZE_4, IDCT_SIZE_4);
sum_b(output + 0 * IDCT_SIZE_2, input + 1 * IDCT_SIZE_4, IDCT_SIZE_4);
sum_c(output + 1 * IDCT_SIZE_2, input + 2 * IDCT_SIZE_4, IDCT_SIZE_4);
sum_d(output + 1 * IDCT_SIZE_2, input + 3 * IDCT_SIZE_4, IDCT_SIZE_4);
clp_v(input, IDCT_SIZE);

dct_a(input + 0, output + 0);
dct_b(input + 8, output + 8);
dct_b(input + 16, output + 16);
dct_b(input + 24, output + 24);
clp_v(output, 32);
dct_a(input + 0 * IDCT_SIZE_4, output + 0 * IDCT_SIZE_4);
dct_b(input + 1 * IDCT_SIZE_4, output + 1 * IDCT_SIZE_4);
dct_b(input + 2 * IDCT_SIZE_4, output + 2 * IDCT_SIZE_4);
dct_b(input + 3 * IDCT_SIZE_4, output + 3 * IDCT_SIZE_4);
clp_v(output, IDCT_SIZE);

mod_a(output + 0, input + 0);
mod_b(output + 16, input + 16);
clp_v(input, 32);
mod_a(output + 0 * IDCT_SIZE_2, input + 0 * IDCT_SIZE_2);
mod_b(output + 1 * IDCT_SIZE_2, input + 1 * IDCT_SIZE_2);
clp_v(input, IDCT_SIZE);

mod_c(input, output);

for (int i = 0; i < 32; i++)
for (int i = 0; i < IDCT_SIZE; i++)
output[i] = clip23(output[i] * (1 << shift));
}

Expand Down Expand Up @@ -268,57 +268,57 @@ static void mod64_c(const int * restrict input, int * restrict output)
void idct_perform64_fixed(int * restrict input, int * restrict output)
{
int mag = 0;
for (int i = 0; i < 64; i++)
for (int i = 0; i < IDCT64_SIZE; i++)
mag += abs(input[i]);

int shift = mag > 0x400000 ? 2 : 0;
int round = shift > 0 ? 1 << (shift - 1) : 0;

for (int i = 0; i < 64; i++)
for (int i = 0; i < IDCT64_SIZE; i++)
input[i] = (input[i] + round) >> shift;

sum_a(input, output + 0, 32);
sum_b(input, output + 32, 32);
clp_v(output, 64);

sum_a(output + 0, input + 0, 16);
sum_b(output + 0, input + 16, 16);
sum_c(output + 32, input + 32, 16);
sum_d(output + 32, input + 48, 16);
clp_v(input, 64);

sum_a(input + 0, output + 0, 8);
sum_b(input + 0, output + 8, 8);
sum_c(input + 16, output + 16, 8);
sum_d(input + 16, output + 24, 8);
sum_c(input + 32, output + 32, 8);
sum_d(input + 32, output + 40, 8);
sum_c(input + 48, output + 48, 8);
sum_d(input + 48, output + 56, 8);
clp_v(output, 64);

dct_a(output + 0, input + 0);
dct_b(output + 8, input + 8);
dct_b(output + 16, input + 16);
dct_b(output + 24, input + 24);
dct_b(output + 32, input + 32);
dct_b(output + 40, input + 40);
dct_b(output + 48, input + 48);
dct_b(output + 56, input + 56);
clp_v(input, 64);

mod_a(input + 0, output + 0);
mod_b(input + 16, output + 16);
mod_b(input + 32, output + 32);
mod_b(input + 48, output + 48);
clp_v(output, 64);

mod64_a(output + 0, input + 0);
mod64_b(output + 32, input + 32);
clp_v(input, 64);
sum_a(input, output + 0 * IDCT64_SIZE_2, IDCT64_SIZE_2);
sum_b(input, output + 1 * IDCT64_SIZE_2, IDCT64_SIZE_2);
clp_v(output, IDCT64_SIZE);

sum_a(output + 0 * IDCT64_SIZE_2, input + 0 * IDCT64_SIZE_4, IDCT64_SIZE_4);
sum_b(output + 0 * IDCT64_SIZE_2, input + 1 * IDCT64_SIZE_4, IDCT64_SIZE_4);
sum_c(output + 1 * IDCT64_SIZE_2, input + 2 * IDCT64_SIZE_4, IDCT64_SIZE_4);
sum_d(output + 1 * IDCT64_SIZE_2, input + 3 * IDCT64_SIZE_4, IDCT64_SIZE_4);
clp_v(input, IDCT64_SIZE);

sum_a(input + 0 * IDCT64_SIZE_4, output + 0 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_b(input + 0 * IDCT64_SIZE_4, output + 1 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_c(input + 1 * IDCT64_SIZE_4, output + 2 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_d(input + 1 * IDCT64_SIZE_4, output + 3 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_c(input + 2 * IDCT64_SIZE_4, output + 4 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_d(input + 2 * IDCT64_SIZE_4, output + 5 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_c(input + 3 * IDCT64_SIZE_4, output + 6 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_d(input + 3 * IDCT64_SIZE_4, output + 7 * IDCT64_SIZE_8, IDCT64_SIZE_8);
clp_v(output, IDCT64_SIZE);

dct_a(output + 0 * IDCT64_SIZE_8, input + 0 * IDCT64_SIZE_8);
dct_b(output + 1 * IDCT64_SIZE_8, input + 1 * IDCT64_SIZE_8);
dct_b(output + 2 * IDCT64_SIZE_8, input + 2 * IDCT64_SIZE_8);
dct_b(output + 3 * IDCT64_SIZE_8, input + 3 * IDCT64_SIZE_8);
dct_b(output + 4 * IDCT64_SIZE_8, input + 4 * IDCT64_SIZE_8);
dct_b(output + 5 * IDCT64_SIZE_8, input + 5 * IDCT64_SIZE_8);
dct_b(output + 6 * IDCT64_SIZE_8, input + 6 * IDCT64_SIZE_8);
dct_b(output + 7 * IDCT64_SIZE_8, input + 7 * IDCT64_SIZE_8);
clp_v(input, IDCT64_SIZE);

mod_a(input + 0 * IDCT64_SIZE_4, output + 0 * IDCT64_SIZE_4);
mod_b(input + 1 * IDCT64_SIZE_4, output + 1 * IDCT64_SIZE_4);
mod_b(input + 2 * IDCT64_SIZE_4, output + 2 * IDCT64_SIZE_4);
mod_b(input + 3 * IDCT64_SIZE_4, output + 3 * IDCT64_SIZE_4);
clp_v(output, IDCT64_SIZE);

mod64_a(output + 0 * IDCT64_SIZE_2, input + 0 * IDCT64_SIZE_2);
mod64_b(output + 1 * IDCT64_SIZE_2, input + 1 * IDCT64_SIZE_2);
clp_v(input, IDCT64_SIZE);

mod64_c(input, output);

for (int i = 0; i < 64; i++)
for (int i = 0; i < IDCT64_SIZE; i++)
output[i] = clip23(output[i] * (1 << shift));
}
90 changes: 45 additions & 45 deletions libdcadec/idct_float.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -162,21 +162,21 @@ static void mod_c(const struct idct_context * restrict idct,
void idct_perform32_float(const struct idct_context * restrict idct,
double * restrict input, double * restrict output)
{
sum_a(input, output + 0, 16);
sum_b(input, output + 16, 16);
sum_a(input, output + 0 * IDCT_SIZE_2, IDCT_SIZE_2);
sum_b(input, output + 1 * IDCT_SIZE_2, IDCT_SIZE_2);

sum_a(output + 0, input + 0, 8);
sum_b(output + 0, input + 8, 8);
sum_c(output + 16, input + 16, 8);
sum_d(output + 16, input + 24, 8);
sum_a(output + 0 * IDCT_SIZE_2, input + 0 * IDCT_SIZE_4, IDCT_SIZE_4);
sum_b(output + 0 * IDCT_SIZE_2, input + 1 * IDCT_SIZE_4, IDCT_SIZE_4);
sum_c(output + 1 * IDCT_SIZE_2, input + 2 * IDCT_SIZE_4, IDCT_SIZE_4);
sum_d(output + 1 * IDCT_SIZE_2, input + 3 * IDCT_SIZE_4, IDCT_SIZE_4);

dct_a(idct, input + 0, output + 0);
dct_b(idct, input + 8, output + 8);
dct_b(idct, input + 16, output + 16);
dct_b(idct, input + 24, output + 24);
dct_a(idct, input + 0 * IDCT_SIZE_4, output + 0 * IDCT_SIZE_4);
dct_b(idct, input + 1 * IDCT_SIZE_4, output + 1 * IDCT_SIZE_4);
dct_b(idct, input + 2 * IDCT_SIZE_4, output + 2 * IDCT_SIZE_4);
dct_b(idct, input + 3 * IDCT_SIZE_4, output + 3 * IDCT_SIZE_4);

mod_a(idct, output + 0, input + 0);
mod_b(idct, output + 16, input + 16);
mod_a(idct, output + 0 * IDCT_SIZE_2, input + 0 * IDCT_SIZE_2);
mod_b(idct, output + 1 * IDCT_SIZE_2, input + 1 * IDCT_SIZE_2);

mod_c(idct, input, output);
}
Expand Down Expand Up @@ -216,39 +216,39 @@ static void mod64_c(const struct idct_context * restrict idct,
void idct_perform64_float(const struct idct_context * restrict idct,
double * restrict input, double * restrict output)
{
sum_a(input, output + 0, 32);
sum_b(input, output + 32, 32);

sum_a(output + 0, input + 0, 16);
sum_b(output + 0, input + 16, 16);
sum_c(output + 32, input + 32, 16);
sum_d(output + 32, input + 48, 16);

sum_a(input + 0, output + 0, 8);
sum_b(input + 0, output + 8, 8);
sum_c(input + 16, output + 16, 8);
sum_d(input + 16, output + 24, 8);
sum_c(input + 32, output + 32, 8);
sum_d(input + 32, output + 40, 8);
sum_c(input + 48, output + 48, 8);
sum_d(input + 48, output + 56, 8);

dct_a(idct, output + 0, input + 0);
dct_b(idct, output + 8, input + 8);
dct_b(idct, output + 16, input + 16);
dct_b(idct, output + 24, input + 24);
dct_b(idct, output + 32, input + 32);
dct_b(idct, output + 40, input + 40);
dct_b(idct, output + 48, input + 48);
dct_b(idct, output + 56, input + 56);

mod_a(idct, input + 0, output + 0);
mod_b(idct, input + 16, output + 16);
mod_b(idct, input + 32, output + 32);
mod_b(idct, input + 48, output + 48);

mod64_a(idct, output + 0, input + 0);
mod64_b(idct, output + 32, input + 32);
sum_a(input, output + 0 * IDCT64_SIZE_2, IDCT64_SIZE_2);
sum_b(input, output + 1 * IDCT64_SIZE_2, IDCT64_SIZE_2);

sum_a(output + 0 * IDCT64_SIZE_2, input + 0 * IDCT64_SIZE_4, IDCT64_SIZE_4);
sum_b(output + 0 * IDCT64_SIZE_2, input + 1 * IDCT64_SIZE_4, IDCT64_SIZE_4);
sum_c(output + 1 * IDCT64_SIZE_2, input + 2 * IDCT64_SIZE_4, IDCT64_SIZE_4);
sum_d(output + 1 * IDCT64_SIZE_2, input + 3 * IDCT64_SIZE_4, IDCT64_SIZE_4);

sum_a(input + 0 * IDCT64_SIZE_4, output + 0 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_b(input + 0 * IDCT64_SIZE_4, output + 1 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_c(input + 1 * IDCT64_SIZE_4, output + 2 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_d(input + 1 * IDCT64_SIZE_4, output + 3 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_c(input + 2 * IDCT64_SIZE_4, output + 4 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_d(input + 2 * IDCT64_SIZE_4, output + 5 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_c(input + 3 * IDCT64_SIZE_4, output + 6 * IDCT64_SIZE_8, IDCT64_SIZE_8);
sum_d(input + 3 * IDCT64_SIZE_4, output + 7 * IDCT64_SIZE_8, IDCT64_SIZE_8);

dct_a(idct, output + 0 * IDCT64_SIZE_8, input + 0 * IDCT64_SIZE_8);
dct_b(idct, output + 1 * IDCT64_SIZE_8, input + 1 * IDCT64_SIZE_8);
dct_b(idct, output + 2 * IDCT64_SIZE_8, input + 2 * IDCT64_SIZE_8);
dct_b(idct, output + 3 * IDCT64_SIZE_8, input + 3 * IDCT64_SIZE_8);
dct_b(idct, output + 4 * IDCT64_SIZE_8, input + 4 * IDCT64_SIZE_8);
dct_b(idct, output + 5 * IDCT64_SIZE_8, input + 5 * IDCT64_SIZE_8);
dct_b(idct, output + 6 * IDCT64_SIZE_8, input + 6 * IDCT64_SIZE_8);
dct_b(idct, output + 7 * IDCT64_SIZE_8, input + 7 * IDCT64_SIZE_8);

mod_a(idct, input + 0 * IDCT64_SIZE_4, output + 0 * IDCT64_SIZE_4);
mod_b(idct, input + 1 * IDCT64_SIZE_4, output + 1 * IDCT64_SIZE_4);
mod_b(idct, input + 2 * IDCT64_SIZE_4, output + 2 * IDCT64_SIZE_4);
mod_b(idct, input + 3 * IDCT64_SIZE_4, output + 3 * IDCT64_SIZE_4);

mod64_a(idct, output + 0 * IDCT64_SIZE_2, input + 0 * IDCT64_SIZE_2);
mod64_b(idct, output + 1 * IDCT64_SIZE_2, input + 1 * IDCT64_SIZE_2);

mod64_c(idct, input, output);
}

0 comments on commit b35fdcf

Please sign in to comment.