Skip to content

Commit

Permalink
Merge pull request #340 from p15-git-acc/threaded-dft
Browse files Browse the repository at this point in the history
add radix2 dft multi-threading
  • Loading branch information
fredrik-johansson committed Sep 25, 2020
2 parents 2160002 + 4876d9a commit 2f9eed4
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 15 deletions.
4 changes: 4 additions & 0 deletions acb_dft.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ void acb_dft_rad2(acb_ptr w, acb_srcptr v, int e, slong prec);
void acb_dft_bluestein(acb_ptr w, acb_srcptr v, slong len, slong prec);
void acb_dft_prod(acb_ptr w, acb_srcptr v, slong * cyc, slong num, slong prec);

void acb_dft_rad2_inplace_threaded(acb_ptr v, int e, slong prec);

void acb_dft_convol_naive(acb_ptr w, acb_srcptr f, acb_srcptr g, slong len, slong prec);
void acb_dft_convol_dft(acb_ptr w, acb_srcptr f, acb_srcptr g, slong len, slong prec);
void acb_dft_convol_rad2(acb_ptr w, acb_srcptr f, acb_srcptr g, slong len, slong prec);
Expand Down Expand Up @@ -189,6 +191,8 @@ void acb_dft_crt_precomp(acb_ptr w, acb_srcptr v, const acb_dft_crt_t crt, slong
void acb_dft_prod_precomp(acb_ptr w, acb_srcptr v, const acb_dft_prod_t prod, slong prec);
void acb_dft_bluestein_precomp(acb_ptr w, acb_srcptr v, const acb_dft_bluestein_t t, slong prec);

void acb_dft_rad2_precomp_inplace_threaded(acb_ptr v, const acb_dft_rad2_t rad2, slong prec);

void acb_dft_inverse_rad2_precomp_inplace(acb_ptr v, const acb_dft_rad2_t rad2, slong prec);
void acb_dft_inverse_rad2_precomp(acb_ptr w, acb_srcptr v, const acb_dft_rad2_t rad2, slong prec);
void acb_dft_convol_rad2_precomp(acb_ptr w, acb_srcptr f, acb_srcptr g, slong len, const acb_dft_rad2_t, slong prec);
Expand Down
37 changes: 22 additions & 15 deletions acb_dft/rad2.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,24 +57,31 @@ _acb_dft_rad2_init(acb_dft_rad2_t t, slong dv, int e, slong prec)
void
acb_dft_rad2_precomp_inplace(acb_ptr v, const acb_dft_rad2_t rad2, slong prec)
{
slong j, k, l;
slong n = rad2->n, nz = rad2->nz;
acb_ptr p, vend = v + n, w = rad2->z;
acb_t tmp;
acb_init(tmp);
if (flint_get_num_threads() > 1 && rad2-> e > 9)
{
acb_dft_rad2_precomp_inplace_threaded(v, rad2, prec);
}
else
{
slong j, k, l;
slong n = rad2->n, nz = rad2->nz;
acb_ptr p, vend = v + n, w = rad2->z;
acb_t tmp;
acb_init(tmp);

acb_dft_rad2_reorder(v, n);
acb_dft_rad2_reorder(v, n);

for (k = 1, l = nz; k < n; k <<= 1, l >>= 1)
for (p = v; p < vend; p += k)
for (j = 0; j < nz; j += l, p++)
{
acb_mul(tmp, p + k, w + j, prec);
acb_sub(p + k, p + 0, tmp, prec);
acb_add(p + 0, p + 0, tmp, prec);
}
for (k = 1, l = nz; k < n; k <<= 1, l >>= 1)
for (p = v; p < vend; p += k)
for (j = 0; j < nz; j += l, p++)
{
acb_mul(tmp, p + k, w + j, prec);
acb_sub(p + k, p + 0, tmp, prec);
acb_add(p + 0, p + 0, tmp, prec);
}

acb_clear(tmp);
acb_clear(tmp);
}
}

void
Expand Down
125 changes: 125 additions & 0 deletions acb_dft/rad2_threaded.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
Copyright (C) 2016 Pascal Molin
Copyright (C) 2020 D.H.J. Polymath
This file is part of Arb.
Arb is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License (LGPL) as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version. See <http://www.gnu.org/licenses/>.
*/

#include "acb_dft.h"
#include "pthread.h"

void acb_dft_rad2_reorder(acb_ptr v, slong n);

typedef struct
{
acb_ptr v;
acb_ptr vend;
slong k;
slong l;
slong jstart;
slong jend;
acb_srcptr w;
slong prec;
}
acb_dft_rad2_arg_t;

void *
_acb_dft_rad2_thread(void * arg_ptr)
{
acb_dft_rad2_arg_t arg = *((acb_dft_rad2_arg_t *) arg_ptr);
slong j, rstart, pstep;
acb_ptr p, r;
acb_t tmp;

acb_init(tmp);
rstart = arg.jstart / arg.l;
pstep = 2 * arg.k;

for (p = arg.v; p < arg.vend; p += pstep)
{
for (r = p + rstart, j = arg.jstart; j < arg.jend; j += arg.l, r++)
{
acb_mul(tmp, r + arg.k, arg.w + j, arg.prec);
acb_sub(r + arg.k, r + 0, tmp, arg.prec);
acb_add(r + 0, r + 0, tmp, arg.prec);
}
}

acb_clear(tmp);
flint_cleanup();
return NULL;
}

void
acb_dft_rad2_precomp_inplace_threaded(acb_ptr v, const acb_dft_rad2_t rad2, slong prec)
{
slong num_threads;
pthread_t * threads;
acb_dft_rad2_arg_t * args;

slong t, logt, logk, logl;
slong n = rad2->n;
slong nz = rad2->nz; /* always n/2 ? */
slong logn = rad2->e;

num_threads = FLINT_MIN(flint_get_num_threads(), nz);
for (logt = 0; WORD(1) << (logt + 1) <= num_threads; logt++);
t = WORD(1) << logt;

threads = flint_malloc(sizeof(pthread_t) * t);
args = flint_malloc(sizeof(acb_dft_rad2_arg_t) * t);

acb_dft_rad2_reorder(v, n);

for (logk = 0, logl = logn - 1; logk < logn; logk += 1, logl -= 1)
{
slong i, j, p;
slong logpstep = logk + 1 + FLINT_MAX(0, logl - logt);
slong logjstep = logl + FLINT_MIN(logk, logn - 1 - logt);
slong pstep = WORD(1) << logpstep;
slong jstep = WORD(1) << logjstep;
i = 0;
for (p = 0; p < n; p += pstep)
{
for (j = 0; j < nz ; j += jstep)
{
args[i].v = v + p;
args[i].vend = v + p + pstep;
args[i].jstart = j;
args[i].jend = j + jstep;
args[i].k = WORD(1) << logk;
args[i].l = WORD(1) << logl;
args[i].w = rad2->z;
args[i].prec = prec;
pthread_create(&threads[i], NULL, _acb_dft_rad2_thread, &args[i]);
i++;
}
}
if (i != t)
{
flint_printf("threaded dft error: unequal i=%ld t=%ld\n", i, t);
flint_abort();
}
for (i = 0; i < t; i++)
{
pthread_join(threads[i], NULL);
}
}

flint_free(threads);
flint_free(args);
}

void
acb_dft_rad2_inplace_threaded(acb_ptr v, int e, slong prec)
{
acb_dft_rad2_t rad2;
acb_dft_rad2_init(rad2, e, prec);
acb_dft_rad2_precomp_inplace_threaded(v, rad2, prec);
acb_dft_rad2_clear(rad2);
}
23 changes: 23 additions & 0 deletions acb_dft/test/t-dft.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,29 @@ int main()

}

/* multi-threaded radix2 dft */
for (k = 0; k < 11; k++)
{
slong n = 1 << k, j;
acb_ptr v, w1, w2;
v = w2 = _acb_vec_init(n);
w1 = _acb_vec_init(n);

flint_set_num_threads(k % 5 + 1);

for (j = 0; j < n; j++)
acb_set_si_si(v + j, j, j + 2);

acb_dft_cyc(w1, v, n, prec);
acb_dft_rad2_inplace_threaded(w2, k, prec);

check_vec_eq_prec(w1, w2, n, prec, digits, n, "rad2", "cyc", "rad2");

_acb_vec_clear(v, n);
_acb_vec_clear(w1, n);

}

flint_randclear(state);
flint_cleanup();
flint_printf("PASS\n");
Expand Down

0 comments on commit 2f9eed4

Please sign in to comment.