Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Issue 10322 - ensure RandomSample is initialized before use #1533

Merged
merged 1 commit into from
Sep 26, 2013
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 133 additions & 49 deletions std/random.d
Original file line number Diff line number Diff line change
Expand Up @@ -1827,10 +1827,11 @@ struct RandomSample(Range, UniformRNG = void)
{
private size_t _available, _toSelect;
private enum ushort _alphaInverse = 13; // Vitter's recommended value.
private bool _first, _algorithmA;
private double _Vprime;
private Range _input;
private size_t _index;
private enum Skip { None, A, D };
private Skip _skip = Skip.None;

// If we're using the default thread-local random number generator then
// we shouldn't store a copy of it here. UniformRNG == void is a sentinel
Expand Down Expand Up @@ -1899,7 +1900,23 @@ struct RandomSample(Range, UniformRNG = void)
" items as available when input contains only ",
_input.length));
}
_first = true;
}

private void initializeFront()
{
assert(_skip == Skip.None);
// We can save ourselves a random variate by checking right
// at the beginning if we should use Algorithm A.
if ((_alphaInverse * _toSelect) > _available)
{
_skip = Skip.A;
}
else
{
_skip = Skip.D;
_Vprime = newVprime(_toSelect);
}
prime();
}

/**
Expand All @@ -1917,28 +1934,23 @@ struct RandomSample(Range, UniformRNG = void)
// having it always correspond to the first element of the
// input. The rest of the sample points are determined each
// time we call popFront().
if (_first)
if (_skip == Skip.None)
{
// We can save ourselves a random variate by checking right
// at the beginning if we should use Algorithm A.
if ((_alphaInverse * _toSelect) > _available)
{
_algorithmA = true;
}
else
{
_Vprime = newVprime(_toSelect);
_algorithmA = false;
}
prime();
_first = false;
initializeFront();
}
return _input.front;
}

/// Ditto
void popFront()
{
// First we need to check if the sample has
// been initialized in the first place.
if (_skip == Skip.None)
{
initializeFront();
}

_input.popFront();
--_available;
--_toSelect;
Expand Down Expand Up @@ -1967,11 +1979,42 @@ struct RandomSample(Range, UniformRNG = void)
/**
Returns the index of the visited record.
*/
size_t index()
@property size_t index()
{
if (_skip == Skip.None)
{
initializeFront();
}
return _index;
}

private size_t skip()
{
assert(_skip != Skip.None);

// Step D1: if the number of points still to select is greater
// than a certain proportion of the remaining data points, i.e.
// if n >= alpha * N where alpha = 1/13, we carry out the
// sampling with Algorithm A.
if (_skip == Skip.A)
{
return skipA();
}
else if ((_alphaInverse * _toSelect) > _available)
{
// We shouldn't get here unless the current selected
// algorithm is D.
assert(_skip == Skip.D);
_skip = Skip.A;
return skipA();
}
else
{
assert(_skip == Skip.D);
return skipD();
}
}

/*
Vitter's Algorithm A, used when the ratio of needed sample values
to remaining data values is sufficiently large.
Expand Down Expand Up @@ -2046,28 +2089,21 @@ and its rationale, see:

Variable names are chosen to match those in Vitter's paper.
*/
private size_t skip()
private size_t skipD()
{
// Step D1: if the number of points still to select is greater
// than a certain proportion of the remaining data points, i.e.
// if n >= alpha * N where alpha = 1/13, we carry out the
// sampling with Algorithm A.
if (_algorithmA)
{
return skipA();
}
else if ((_alphaInverse * _toSelect) > _available)
{
_algorithmA = true;
return skipA();
}
// Otherwise, we use the standard Algorithm D mechanism.
else if (_toSelect > 1)
// Confirm that the check in Step D1 is valid and we
// haven't been sent here by mistake
assert((_alphaInverse * _toSelect) <= _available);

// Now it's safe to use the standard Algorithm D mechanism.
if (_toSelect > 1)
{
size_t s;
size_t qu1 = 1 + _available - _toSelect;
double x, y1;

assert(!_Vprime.isNaN);

while (true)
{
// Step D2: set values of x and u.
Expand Down Expand Up @@ -2109,7 +2145,7 @@ Variable names are chosen to match those in Vitter's paper.
limit = qu1;
}

foreach (size_t t; limit.._available)
foreach (size_t t; limit .. _available)
{
y2 *= top/bottom;
top--;
Expand Down Expand Up @@ -2148,7 +2184,10 @@ Variable names are chosen to match those in Vitter's paper.

private void prime()
{
if (empty) return;
if (empty)
{
return;
}
assert(_available && _available >= _toSelect);
immutable size_t s = skip();
assert(s + _toSelect <= _available);
Expand Down Expand Up @@ -2374,33 +2413,78 @@ unittest
auto sample1 = randomSample(TestInputRange(), 654, 654_321);
for (; !sample1.empty; sample1.popFront())
{
assert(sample1.front == sample1.index());
assert(sample1.front == sample1.index);
}

auto sample2 = randomSample(TestInputRange(), 654, 654_321, rng);
for (; !sample2.empty; sample2.popFront())
{
assert(sample2.front == sample2.index());
assert(sample2.front == sample2.index);
}

/* These next 2 tests will fail because of Issue 10322. They
* should be restored to test that this bug has been fixed.
* http://d.puremagic.com/issues/show_bug.cgi?id=10322
/* Check that it also works if .index is called before .front.
* See: http://d.puremagic.com/issues/show_bug.cgi?id=10322
*/
version(none)
auto sample3 = randomSample(TestInputRange(), 654, 654_321);
for (; !sample3.empty; sample3.popFront())
{
auto sample3 = randomSample(TestInputRange(), 654, 654_321);
for (; !sample3.empty; sample3.popFront())
{
assert(sample3.index() == sample3.front);
}
assert(sample3.index == sample3.front);
}

auto sample4 = randomSample(TestInputRange(), 654, 654_321, rng);
for (; !sample4.empty; sample4.popFront())
auto sample4 = randomSample(TestInputRange(), 654, 654_321, rng);
for (; !sample4.empty; sample4.popFront())
{
assert(sample4.index == sample4.front);
}
}

/* Test behaviour if .popFront() is called before sample is read.
* This is a rough-and-ready check that the statistical properties
* are in the ballpark -- not a proper validation of statistical
* quality! This incidentally also checks for reference-type
* initialization bugs, as the foreach() loop will operate on a
* copy of the popFronted (and hence initialized) sample.
*/
{
size_t count0, count1, count99;
foreach(_; 0 .. 100_000)
{
auto sample = randomSample(iota(100), 5);
sample.popFront();
foreach(s; sample)
{
assert(sample4.index() == sample4.front);
if (s == 0)
{
++count0;
}
else if (s == 1)
{
++count1;
}
else if (s == 99)
{
++count99;
}
}
}
/* Statistical assumptions here: this is a sequential sampling process
* so (i) 0 can only be the first sample point, so _can't_ be in the
* remainder of the sample after .popFront() is called. (ii) By similar
* token, 1 can only be in the remainder if it's the 2nd point of the
* whole sample, and hence if 0 was the first; probability of 0 being
* first and 1 second is 5/100 * 4/99 (thank you, Algorithm S:-) and
* so the mean count of 1 should be about 202. Finally, 99 can only
* be the _last_ sample point to be picked, so its probability of
* inclusion should be independent of the .popFront() and it should
* occur with frequency 5/100, hence its count should be about 5000.
* Unfortunately we have to set quite a high tolerance because with
* sample size small enough for unittests to run in reasonable time,
* the variance can be quite high.
*/
assert(count0 == 0);
assert(count1 < 300, text("1: ", count1, " > 300."));
assert(4_700 < count99, text("99: ", count99, " < 4700."));
assert(count99 < 5_300, text("99: ", count99, " > 5300."));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that was always a possibility. This is rather a crude test -- mostly a sanity check -- and in the unittesting scenario we don't have time to run something really statistically rigorous. The upper/lower bounds were set to be quite forgiving, but there's always a possibility of a rare event where we do just get an extreme of variance.

If it repeats frequently then I'd look into it further, but most likely it's just a rare one-off that we should be aware of as possible. I suppose we could make the bounds even more forgiving, but given how many unittest runs have taken place without a failure, I'd rather not.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeterministic unit tests are always dangerous. However, let's assume for the moment that there is no better way of writing the tests in question. In that case, the fact that the tests can fail with a small probability should be noted somewhere, and in such a way that it is obvious even to the non-statistically inclined (preferably in the immediate vicinity of the assertions in question).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would you like it noted? Via an extra paragraph added to the comment above the asserts? Or in the assert messages themselves?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either of those would be fine. Just make it very obvious to anybody going to the failing line in a text editor.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do. Thanks for bringing this to my attention.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW it was obvious.

}

/* Odd corner-cases: RandomSample has 2 constructors that are not called
Expand Down