279 changes: 213 additions & 66 deletions fmprb_poly/mullow_block2.c
Expand Up @@ -23,11 +23,24 @@
******************************************************************************/

#include <math.h>
#include "fmprb_poly.h"

void _fmprb_poly_get_scale(fmpz_t scale, fmprb_srcptr x, long xlen,
fmprb_srcptr y, long ylen);

static int
_fmprb_vec_is_finite(fmprb_srcptr x, long len)
{
long i;

for (i = 0; i < len; i++)
if (!fmprb_is_finite(x + i))
return 0;

return 1;
}

/*static __inline__ */ void
fmpr_add_abs_ubound(fmpr_t z, const fmpr_t x, const fmpr_t y, long prec)
{
Expand Down Expand Up @@ -60,16 +73,61 @@ fmpr_abs_round(fmpr_t z, const fmpr_t x, long prec, fmpr_rnd_t rnd)
}

/* Break fmpr vector into same-exponent blocks where the largest block
has a height of at most ALPHA*prec + BETA bits. */
has a height of at most ALPHA*prec + BETA bits. These are just
tuning parameters. Note that ALPHA * FMPRB_RAD_PREC + BETA
should be smaller than DOUBLE_BLOCK_MAX_HEIGHT if we want to use
doubles for error bounding. */
#define ALPHA 3.0
#define BETA 512

void
_fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
const fmpz_t scale, fmpr_srcptr x, long len, long step, long prec)

/* Maximum length of block for which we use double multiplication
(for longer blocks, we use fmpz_poly multiplication). This is essentially
just a tuning parameter, but note that it must be considered when
compensating for rounding error below. */
#define DOUBLE_BLOCK_MAX_LENGTH 1000

/* Computing a dot product of length DOUBLE_BLOCK_MAX_LENGTH involving
only nonnegative numbers, and then multiplying by this factor, must give
an upper bound for the exact dot product (we can assume that no
overflow or underflow occurs). The following is certainly
sufficient, but it would be nice to include a formal proof here. */
#define DOUBLE_ROUNDING_FACTOR (1.0 + 1e-9)

/* Maximum height for which we use double multiplication. Since the dynamic
exponent range of doubles is about +/- 1024, this must be less than about
1024 (to allow the product of two numbers). This must also
account for adding FMPRB_RAD_PREC bits. */
#define DOUBLE_BLOCK_MAX_HEIGHT 800

/* We divide coefficients by 2^DOUBLE_BLOCK_SHIFT when converting them to
doubles, in order to use the whole exponent range. Note that this means
numbers of size (2^(-DOUBLE_BLOCK_SHIFT))^2 must not underflow. */
#define DOUBLE_BLOCK_SHIFT (DOUBLE_BLOCK_MAX_HEIGHT / 2)


/* Converts fmpr vector to a vector of blocks, where each block is
a vector of fmpz integers on a common exponent.
Optionally, we also generate doubles on the same common exponent
(minus DOUBLE_BLOCK_SHIFT), where this can be done exactly. */

static __inline__ int /* returns new can_use_doubles status */
_fmpr_vec_get_fmpz_2exp_blocks
(
fmpz * coeffs, /* output fmpz coefficients */
double * dblcoeffs, /* output double coefficients (optional) */
fmpz * exps, /* common exponent of each block */
long * blocks, /* start positions of blocks (plus end marker) */
const fmpz_t scale, /* compose input poly by x -> x/2^scale */
fmpr_srcptr x, /* first in vector of input coefficient */
long len, /* number of input coefficients */
long step, /* step length to read input coefficients*/
long prec, /* prec */
int can_use_doubles
)
{
fmpz_t top, bot, t, b, v, block_top, block_bot;
long i, j, s, block;
long i, j, s, block, bits, maxheight;
int in_zero;

fmpz_init(top);
Expand All @@ -84,17 +142,26 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
block = 0;
in_zero = 1;

if (prec == FMPR_PREC_EXACT)
maxheight = FMPR_PREC_EXACT;
else
maxheight = ALPHA * prec + BETA;

can_use_doubles = can_use_doubles && (maxheight <= DOUBLE_BLOCK_MAX_HEIGHT);

for (i = 0; i < len; i++)
{
/* Skip (must be zero, since we assume there are no Infs/NaNs) */
/* Skip (must be zero, since we assume there are no Infs/NaNs). */
if (fmpr_is_special(x + i * step))
continue;

/* Bottom and top exponent of current number */
fmpz_set(bot, fmpr_expref(x + i * step));
/* Divide coefficient by 2^(scale * i) */
fmpz_submul_ui(bot, scale, i);
fmpz_add_ui(top, bot, fmpz_bits(fmpr_manref(x + i * step)));
bits = fmpz_bits(fmpr_manref(x + i * step));
fmpz_add_ui(top, bot, bits);

can_use_doubles = can_use_doubles && (bits <= FMPRB_RAD_PREC);

/* Extend current block. */
if (in_zero)
Expand All @@ -109,7 +176,7 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
fmpz_sub(v, t, b);

/* extend current block */
if (prec == FMPR_PREC_EXACT || fmpz_cmp_ui(v, ALPHA * prec + BETA) < 0)
if (fmpz_cmp_ui(v, maxheight) < 0)
{
fmpz_swap(block_top, t);
fmpz_swap(block_bot, b);
Expand Down Expand Up @@ -141,9 +208,12 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
{
for (j = blocks[i]; j < blocks[i + 1]; j++)
{
if (fmpr_is_zero(x + j * step))
if (fmpr_is_special(x + j * step))
{
fmpz_zero(coeffs + j);

if (can_use_doubles)
dblcoeffs[j] = 0.0;
}
else
{
Expand All @@ -155,6 +225,17 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
if (s < 0) abort(); /* Bug catcher */

fmpz_mul_2exp(coeffs + j, fmpr_manref(x + j * step), s);

if (can_use_doubles)
{
double c = *fmpr_manref(x + j * step);
c = ldexp(c, s - DOUBLE_BLOCK_SHIFT);

if (c < 1e-150 || c > 1e150) /* Bug catcher */
abort();

dblcoeffs[j] = c;
}
}
}
}
Expand All @@ -166,32 +247,19 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
fmpz_clear(v);
fmpz_clear(block_top);
fmpz_clear(block_bot);
}

static int
has_infnan(fmprb_srcptr x, long len)
{
long i;

for (i = 0; i < len; i++)
{
if (fmpr_is_nan(fmprb_midref(x + i)) || fmpr_is_inf(fmprb_midref(x + i))
|| fmpr_is_nan(fmprb_radref(x + i)) || fmpr_is_inf(fmprb_radref(x + i)))
{
return 1;
}
}

return 0;
return can_use_doubles;
}

void
static __inline__ void
_fmprb_poly_addmullow_rad(fmprb_ptr z, fmpz * zz,
const fmpz * xz, const fmpz * xexps, const long * xblocks, long xlen,
const fmpz * yz, const fmpz * yexps, const long * yblocks, long ylen,
long n)
const fmpz * xz, const double * xdbl, const fmpz * xexps,
const long * xblocks, long xlen,
const fmpz * yz, const double * ydbl, const fmpz * yexps,
const long * yblocks, long ylen,
long n, int can_use_doubles)
{
long i, j, k, xp, yp, xl, yl, bn;
long i, j, k, ii, xp, yp, xl, yl, bn;
fmpz_t zexp;
fmpr_t t;

Expand All @@ -211,20 +279,49 @@ _fmprb_poly_addmullow_rad(fmprb_ptr z, fmpz * zz,
xl = FLINT_MIN(xl, bn);
yl = FLINT_MIN(yl, bn);

if (xl >= yl)
_fmpz_poly_mullow(zz, xz + xp, xl, yz + yp, yl, bn);
else
_fmpz_poly_mullow(zz, yz + yp, yl, xz + xp, xl, bn);

fmpz_add_inline(zexp, xexps + i, yexps + j);

for (k = 0; k < bn; k++)
if (can_use_doubles && xl > 1 && yl > 1 &&
(xl < DOUBLE_BLOCK_MAX_LENGTH || yl < DOUBLE_BLOCK_MAX_LENGTH))
{
fmpr_set_round_fmpz_2exp(t, zz + k, zexp,
FMPRB_RAD_PREC, FMPR_RND_UP);
fmpr_add(fmprb_radref(z + xp + yp + k),
fmprb_radref(z + xp + yp + k), t,
FMPRB_RAD_PREC, FMPR_RND_UP);
fmpz_add_ui(zexp, zexp, 2 * DOUBLE_BLOCK_SHIFT);

for (k = 0; k < bn; k++)
{
/* Classical multiplication (may round down!) */
double ss = 0.0;

for (ii = FLINT_MAX(0, k - yl + 1);
ii <= FLINT_MIN(xl - 1, k); ii++)
{
ss += xdbl[xp + ii] * ydbl[yp + k - ii];
}

/* Compensate for rounding error */
ss *= DOUBLE_ROUNDING_FACTOR;

fmpr_set_d(t, ss);
fmpr_mul_2exp_fmpz(t, t, zexp);
fmpr_add(fmprb_radref(z + xp + yp + k),
fmprb_radref(z + xp + yp + k), t,
FMPRB_RAD_PREC, FMPR_RND_UP);
}
}
else
{
if (xl >= yl)
_fmpz_poly_mullow(zz, xz + xp, xl, yz + yp, yl, bn);
else
_fmpz_poly_mullow(zz, yz + yp, yl, xz + xp, xl, bn);

for (k = 0; k < bn; k++)
{
fmpr_set_round_fmpz_2exp(t, zz + k, zexp,
FMPRB_RAD_PREC, FMPR_RND_UP);
fmpr_add(fmprb_radref(z + xp + yp + k),
fmprb_radref(z + xp + yp + k), t,
FMPRB_RAD_PREC, FMPR_RND_UP);
}
}
}
}
Expand All @@ -233,7 +330,7 @@ _fmprb_poly_addmullow_rad(fmprb_ptr z, fmpz * zz,
fmpr_clear(t);
}

void
static __inline__ void
_fmprb_poly_addmullow_block(fmprb_ptr z, fmpz * zz,
const fmpz * xz, const fmpz * xexps, const long * xblocks, long xlen,
const fmpz * yz, const fmpz * yexps, const long * yblocks, long ylen,
Expand Down Expand Up @@ -334,7 +431,8 @@ _fmprb_poly_mullow_block2(fmprb_ptr z, fmprb_srcptr x, long xlen,
}

/* We don't know how to deal with infinities or NaNs */
if (has_infnan(x, xlen) || (!squaring && has_infnan(y, ylen)))
if (!_fmprb_vec_is_finite(x, xlen) ||
(!squaring && !_fmprb_vec_is_finite(y, ylen)))
{
_fmprb_poly_mullow_classical(z, x, xlen, y, ylen, n, prec);
return;
Expand Down Expand Up @@ -369,74 +467,123 @@ _fmprb_poly_mullow_block2(fmprb_ptr z, fmprb_srcptr x, long xlen,
= (xm*ym) + (xm*yr + xr*(ym + yr)) */
if (xrlen != 0 || yrlen != 0)
{
fmpr_ptr tmp = _fmpr_vec_init(FLINT_MAX(xlen, ylen));
fmpr_ptr tmp;
double *xdbl, *ydbl;
int can_use_doubles = 1;

tmp = _fmpr_vec_init(FLINT_MAX(xlen, ylen));
xdbl = flint_malloc(sizeof(double) * xlen);
ydbl = flint_malloc(sizeof(double) * ylen);

/* (xm + xr)^2 = (xm*ym) + (xr^2 + 2 xm xr)
= (xm*ym) + xr*(2 xm + xr) */
if (squaring)
{
_fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_radref(x), xrlen, 2, FMPRB_RAD_PREC);
can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
xblocks, scale, fmprb_radref(x), xrlen, 2,
FMPRB_RAD_PREC, can_use_doubles);

for (i = 0; i < xlen; i++)
{
fmpr_abs_round(tmp + i, fmprb_midref(x + i), FMPRB_RAD_PREC, FMPR_RND_UP);
fmpr_abs_round(tmp + i, fmprb_midref(x + i),
FMPRB_RAD_PREC, FMPR_RND_UP);
fmpr_mul_2exp_si(tmp + i, tmp + i, 1);
fmpr_add(tmp + i, tmp + i, fmprb_radref(x + i), FMPRB_RAD_PREC, FMPR_RND_UP);
fmpr_add(tmp + i, tmp + i, fmprb_radref(x + i),
FMPRB_RAD_PREC, FMPR_RND_UP);
}
_fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, tmp, xlen, 1, FMPRB_RAD_PREC);

_fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xrlen, yz, ye, yblocks, xlen, n);
can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
yblocks, scale, tmp, xlen, 1,
FMPRB_RAD_PREC, can_use_doubles);

_fmprb_poly_addmullow_rad(z, zz,
xz, xdbl, xe, xblocks, xrlen,
yz, ydbl, ye, yblocks, xlen, n, can_use_doubles);
}
else if (yrlen == 0)
{
/* xr * |ym| */
_fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_radref(x), xrlen, 2, FMPRB_RAD_PREC);
can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
xblocks, scale, fmprb_radref(x), xrlen, 2,
FMPRB_RAD_PREC, can_use_doubles);

for (i = 0; i < ymlen; i++)
fmpr_abs_round(tmp + i, fmprb_midref(y + i), FMPRB_RAD_PREC, FMPR_RND_UP);
_fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, tmp, ymlen, 1, FMPRB_RAD_PREC);
_fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xrlen, yz, ye, yblocks, ymlen, n);
fmpr_abs_round(tmp + i, fmprb_midref(y + i),
FMPRB_RAD_PREC, FMPR_RND_UP);

can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
yblocks, scale, tmp, ymlen, 1,
FMPRB_RAD_PREC, can_use_doubles);

_fmprb_poly_addmullow_rad(z, zz,
xz, xdbl, xe, xblocks, xrlen,
yz, ydbl, ye, yblocks, ymlen, n, can_use_doubles);
}
else
{
/* |xm| * yr */
for (i = 0; i < xmlen; i++)
fmpr_abs_round(tmp + i, fmprb_midref(x + i), FMPRB_RAD_PREC, FMPR_RND_UP);
fmpr_abs_round(tmp + i, fmprb_midref(x + i),
FMPRB_RAD_PREC, FMPR_RND_UP);

_fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, tmp, xmlen, 1, FMPRB_RAD_PREC);
can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
xblocks, scale, tmp, xmlen, 1,
FMPRB_RAD_PREC, can_use_doubles);

_fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, fmprb_radref(y), yrlen, 2, FMPRB_RAD_PREC);
can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
yblocks, scale, fmprb_radref(y), yrlen, 2,
FMPRB_RAD_PREC, can_use_doubles);

_fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xmlen, yz, ye, yblocks, yrlen, n);
_fmprb_poly_addmullow_rad(z, zz,
xz, xdbl, xe, xblocks, xmlen,
yz, ydbl, ye, yblocks, yrlen, n, can_use_doubles);

/* xr*(|ym| + yr) */
if (xrlen != 0)
{
_fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_radref(x), xrlen, 2, FMPRB_RAD_PREC);
can_use_doubles = 1;
can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
xblocks, scale, fmprb_radref(x), xrlen, 2,
FMPRB_RAD_PREC, can_use_doubles);

for (i = 0; i < ylen; i++)
fmpr_add_abs_ubound(tmp + i, fmprb_midref(y + i), fmprb_radref(y + i), FMPRB_RAD_PREC);
fmpr_add_abs_ubound(tmp + i, fmprb_midref(y + i),
fmprb_radref(y + i), FMPRB_RAD_PREC);

can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
yblocks, scale, tmp, ylen, 1,
FMPRB_RAD_PREC, can_use_doubles);

_fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, tmp, ylen, 1, FMPRB_RAD_PREC);
_fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xrlen, yz, ye, yblocks, ylen, n);
_fmprb_poly_addmullow_rad(z, zz,
xz, xdbl, xe, xblocks, xrlen,
yz, ydbl, ye, yblocks, ylen, n, can_use_doubles);
}
}

_fmpr_vec_clear(tmp, FLINT_MAX(xlen, ylen));
flint_free(xdbl);
flint_free(ydbl);
}

/* multiply midpoints */
if (xmlen != 0 && ymlen != 0)
{
_fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_midref(x), xmlen, 2, prec);
_fmpr_vec_get_fmpz_2exp_blocks(xz, NULL, xe, xblocks,
scale, fmprb_midref(x), xmlen, 2, prec, 0);

if (squaring)
{
_fmprb_poly_addmullow_block(z, zz, xz, xe, xblocks, xmlen, xz, xe, xblocks, xmlen, n, prec, 1);
_fmprb_poly_addmullow_block(z, zz,
xz, xe, xblocks, xmlen, xz, xe, xblocks, xmlen, n, prec, 1);
}
else
{
_fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, fmprb_midref(y), ymlen, 2, prec);
_fmprb_poly_addmullow_block(z, zz, xz, xe, xblocks, xmlen, yz, ye, yblocks, ymlen, n, prec, 0);
_fmpr_vec_get_fmpz_2exp_blocks(yz, NULL, ye, yblocks,
scale, fmprb_midref(y), ymlen, 2, prec, 0);

_fmprb_poly_addmullow_block(z, zz,
xz, xe, xblocks, xmlen,
yz, ye, yblocks, ymlen, n, prec, 0);
}
}

Expand Down