-
Notifications
You must be signed in to change notification settings - Fork 400
/
rand.cpp
594 lines (543 loc) · 22.1 KB
/
rand.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
//----------------------------------------------------------------------
// File: rand.cpp
// Programmer: Sunil Arya and David Mount
// Description: Routines for random point generation
// Last modified: 08/04/06 (Version 1.1.1)
//----------------------------------------------------------------------
// Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
// David Mount. All Rights Reserved.
//
// This software and related documentation is part of the Approximate
// Nearest Neighbor Library (ANN). This software is provided under
// the provisions of the Lesser GNU Public License (LGPL). See the
// file ../ReadMe.txt for further information.
//
// The University of Maryland (U.M.) and the authors make no
// representations about the suitability or fitness of this software for
// any purpose. It is provided "as is" without express or implied
// warranty.
//----------------------------------------------------------------------
// History:
// Revision 0.1 03/04/98
// Initial release
// Revision 0.2 03/26/98
// Changed random/srandom declarations for SGI's.
// Revision 1.0 04/01/05
// annClusGauss centers distributed over [-1,1] rather than [0,1]
// Added annClusOrthFlats distribution
// Changed procedure names to avoid namespace conflicts
// Added annClusFlats distribution
// Added rand/srand option and fixed annRan0() initialization.
// Revision 1.1.1 08/04/06
// Added planted distribution
//----------------------------------------------------------------------
#include "rand.h" // random generator declarations
using namespace std; // make std:: accessible
//----------------------------------------------------------------------
// Globals
//----------------------------------------------------------------------
int annIdum = 0; // used for random number generation
//------------------------------------------------------------------------
// annRan0 - (safer) uniform random number generator
//
// The code given here is taken from "Numerical Recipes in C" by
// William Press, Brian Flannery, Saul Teukolsky, and William
// Vetterling. The task of the code is to do an additional randomizing
// shuffle on the system-supplied random number generator to make it
// safer to use.
//
// Returns a uniform deviate between 0.0 and 1.0 using the
// system-supplied routine random() or rand(). Set the global
// annIdum to any negative value to initialise or reinitialise
// the sequence.
//------------------------------------------------------------------------
double annRan0()
{
const int TAB_SIZE = 97; // table size: any large number
int j;
static double y, v[TAB_SIZE];
static int iff = 0;
const double RAN_DIVISOR = double(ANN_RAND_MAX + 1UL);
if (RAN_DIVISOR < 0) {
cout << "RAN_DIVISOR " << RAN_DIVISOR << endl;
exit(0);
}
//--------------------------------------------------------------------
// As a precaution against misuse, we will always initialize on the
// first call, even if "annIdum" is not set negative. Determine
// "maxran", the next integer after the largest representable value
// of type int. We assume this is a factor of 2 smaller than the
// corresponding value of type unsigned int.
//--------------------------------------------------------------------
if (annIdum < 0 || iff == 0) { // initialize
iff = 1;
ANN_SRAND(annIdum); // (re)seed the generator
annIdum = 1;
for (j = 0; j < TAB_SIZE; j++) // exercise the system routine
ANN_RAND(); // (values intentionally ignored)
for (j = 0; j < TAB_SIZE; j++) // then save TAB_SIZE-1 values
v[j] = ANN_RAND();
y = ANN_RAND(); // generate starting value
}
//--------------------------------------------------------------------
// This is where we start if not initializing. Use the previously
// saved random number y to get an index j between 1 and TAB_SIZE-1.
// Then use the corresponding v[j] for both the next j and as the
// output number.
//--------------------------------------------------------------------
j = int(TAB_SIZE * (y / RAN_DIVISOR));
y = v[j];
v[j] = ANN_RAND(); // refill the table entry
return y / RAN_DIVISOR;
}
//------------------------------------------------------------------------
// annRanInt - generate a random integer from {0,1,...,n-1}
//
// If n == 0, then -1 is returned.
//------------------------------------------------------------------------
static int annRanInt(
int n)
{
int r = (int) (annRan0()*n);
if (r == n) r--; // (in case annRan0() == 1 or n == 0)
return r;
}
//------------------------------------------------------------------------
// annRanUnif - generate a random uniform in [lo,hi]
//------------------------------------------------------------------------
static double annRanUnif(
double lo,
double hi)
{
return annRan0()*(hi-lo) + lo;
}
//------------------------------------------------------------------------
// annRanGauss - Gaussian random number generator
// Returns a normally distributed deviate with zero mean and unit
// variance, using annRan0() as the source of uniform deviates.
//------------------------------------------------------------------------
static double annRanGauss()
{
static int iset=0;
static double gset;
if (iset == 0) { // we don't have a deviate handy
double v1, v2;
double r = 2.0;
while (r >= 1.0) {
//------------------------------------------------------------
// Pick two uniform numbers in the square extending from -1 to
// +1 in each direction, see if they are in the circle of radius
// 1. If not, try again
//------------------------------------------------------------
v1 = annRanUnif(-1, 1);
v2 = annRanUnif(-1, 1);
r = v1 * v1 + v2 * v2;
}
double fac = sqrt(-2.0 * log(r) / r);
//-----------------------------------------------------------------
// Now make the Box-Muller transformation to get two normal
// deviates. Return one and save the other for next time.
//-----------------------------------------------------------------
gset = v1 * fac;
iset = 1; // set flag
return v2 * fac;
}
else { // we have an extra deviate handy
iset = 0; // so unset the flag
return gset; // and return it
}
}
//------------------------------------------------------------------------
// annRanLaplace - Laplacian random number generator
// Returns a Laplacian distributed deviate with zero mean and
// unit variance, using annRan0() as the source of uniform deviates.
//
// prob(x) = b/2 * exp(-b * |x|).
//
// b is chosen to be sqrt(2.0) so that the variance of the Laplacian
// distribution [2/(b^2)] becomes 1.
//------------------------------------------------------------------------
static double annRanLaplace()
{
const double b = 1.4142136;
double laprand = -log(annRan0()) / b;
double sign = annRan0();
if (sign < 0.5) laprand = -laprand;
return(laprand);
}
//----------------------------------------------------------------------
// annUniformPts - Generate uniformly distributed points
// A uniform distribution over [-1,1].
//----------------------------------------------------------------------
void annUniformPts( // uniform distribution
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim) // dimension
{
for (int i = 0; i < n; i++) {
for (int d = 0; d < dim; d++) {
pa[i][d] = (ANNcoord) (annRanUnif(-1,1));
}
}
}
//----------------------------------------------------------------------
// annGaussPts - Generate Gaussian distributed points
// A Gaussian distribution with zero mean and the given standard
// deviation.
//----------------------------------------------------------------------
void annGaussPts( // Gaussian distribution
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
double std_dev) // standard deviation
{
for (int i = 0; i < n; i++) {
for (int d = 0; d < dim; d++) {
pa[i][d] = (ANNcoord) (annRanGauss() * std_dev);
}
}
}
//----------------------------------------------------------------------
// annLaplacePts - Generate Laplacian distributed points
// Generates a Laplacian distribution (zero mean and unit variance).
//----------------------------------------------------------------------
void annLaplacePts( // Laplacian distribution
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim) // dimension
{
for (int i = 0; i < n; i++) {
for (int d = 0; d < dim; d++) {
pa[i][d] = (ANNcoord) annRanLaplace();
}
}
}
//----------------------------------------------------------------------
// annCoGaussPts - Generate correlated Gaussian distributed points
// Generates a Gauss-Markov distribution of zero mean and unit
// variance.
//----------------------------------------------------------------------
void annCoGaussPts( // correlated-Gaussian distribution
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
double correlation) // correlation
{
double std_dev_w = sqrt(1.0 - correlation * correlation);
for (int i = 0; i < n; i++) {
double previous = annRanGauss();
pa[i][0] = (ANNcoord) previous;
for (int d = 1; d < dim; d++) {
previous = correlation*previous + std_dev_w*annRanGauss();
pa[i][d] = (ANNcoord) previous;
}
}
}
//----------------------------------------------------------------------
// annCoLaplacePts - Generate correlated Laplacian distributed points
// Generates a Laplacian-Markov distribution of zero mean and unit
// variance.
//----------------------------------------------------------------------
void annCoLaplacePts( // correlated-Laplacian distribution
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
double correlation) // correlation
{
double wn;
double corr_sq = correlation * correlation;
for (int i = 0; i < n; i++) {
double previous = annRanLaplace();
pa[i][0] = (ANNcoord) previous;
for (int d = 1; d < dim; d++) {
double temp = annRan0();
if (temp < corr_sq)
wn = 0.0;
else
wn = annRanLaplace();
previous = correlation * previous + wn;
pa[i][d] = (ANNcoord) previous;
}
}
}
//----------------------------------------------------------------------
// annClusGaussPts - Generate clusters of Gaussian distributed points
// Cluster centers are uniformly distributed over [-1,1], and the
// standard deviation within each cluster is fixed.
//
// Note: Once cluster centers have been set, they are not changed,
// unless new_clust = true. This is so that subsequent calls generate
// points from the same distribution. It follows, of course, that any
// attempt to change the dimension or number of clusters without
// generating new clusters is asking for trouble.
//
// Note: Cluster centers are not generated by a call to uniformPts().
// Although this could be done, it has been omitted for
// compatibility with annClusGaussPts() in the colored version,
// rand_c.cc.
//----------------------------------------------------------------------
void annClusGaussPts( // clustered-Gaussian distribution
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
int n_clus, // number of colors
ANNbool new_clust, // generate new clusters.
double std_dev) // standard deviation within clusters
{
static ANNpointArray clusters = NULL;// cluster storage
if (clusters == NULL || new_clust) {// need new cluster centers
if (clusters != NULL) // clusters already exist
annDeallocPts(clusters); // get rid of them
clusters = annAllocPts(n_clus, dim);
// generate cluster center coords
for (int i = 0; i < n_clus; i++) {
for (int d = 0; d < dim; d++) {
clusters[i][d] = (ANNcoord) annRanUnif(-1,1);
}
}
}
for (int i = 0; i < n; i++) {
int c = annRanInt(n_clus); // generate cluster index
for (int d = 0; d < dim; d++) {
pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + clusters[c][d]);
}
}
}
//----------------------------------------------------------------------
// annClusOrthFlats - points clustered along orthogonal flats
//
// This distribution consists of a collection points clustered
// among a collection of axis-aligned low dimensional flats in
// the hypercube [-1,1]^d. A set of n_clus orthogonal flats are
// generated, each whose dimension is a random number between 1
// and max_dim. The points are evenly distributed among the clusters.
// For each cluster, we generate points uniformly distributed along
// the flat within the hypercube.
//
// This is done as follows. Each cluster is defined by a d-element
// control vector whose components are either:
//
// CO_FLAG indicating that this component is to be generated
// uniformly in [-1,1],
// x a value other than CO_FLAG in the range [-1,1],
// which indicates that this coordinate is to be
// generated as x plus a Gaussian random deviation
// with the given standard deviation.
//
// The number of zero components is the dimension of the flat, which
// is a random integer in the range from 1 to max_dim. The points
// are disributed between clusters in nearly equal sized groups.
//
// Note: Once cluster centers have been set, they are not changed,
// unless new_clust = true. This is so that subsequent calls generate
// points from the same distribution. It follows, of course, that any
// attempt to change the dimension or number of clusters without
// generating new clusters is asking for trouble.
//
// To make this a bad scenario at query time, query points should be
// selected from a different distribution, e.g. uniform or Gaussian.
//
// We use a little programming trick to generate groups of roughly
// equal size. If n is the total number of points, and n_clus is
// the number of clusters, then the c-th cluster (0 <= c < n_clus)
// is given floor((n+c)/n_clus) points. It can be shown that this
// will exactly consume all n points.
//
// This procedure makes use of the utility procedure, genOrthFlat
// which generates points in one orthogonal flat, according to
// the given control vector.
//
//----------------------------------------------------------------------
const double CO_FLAG = 999; // special flag value
static void genOrthFlat( // generate points on an orthog flat
ANNpointArray pa, // point array
int n, // number of points
int dim, // dimension
double *control, // control vector
double std_dev) // standard deviation
{
for (int i = 0; i < n; i++) { // generate each point
for (int d = 0; d < dim; d++) { // generate each coord
if (control[d] == CO_FLAG) // dimension on flat
pa[i][d] = (ANNcoord) annRanUnif(-1,1);
else // dimension off flat
pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + control[d]);
}
}
}
void annClusOrthFlats( // clustered along orthogonal flats
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
int n_clus, // number of colors
ANNbool new_clust, // generate new clusters.
double std_dev, // standard deviation within clusters
int max_dim) // maximum dimension of the flats
{
static ANNpointArray control = NULL; // control vectors
if (control == NULL || new_clust) { // need new cluster centers
if (control != NULL) { // clusters already exist
annDeallocPts(control); // get rid of them
}
control = annAllocPts(n_clus, dim);
for (int c = 0; c < n_clus; c++) { // generate clusters
int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat
for (int d = 0; d < dim; d++) { // generate side locations
// prob. of picking next dim
double Prob = ((double) n_dim)/((double) (dim-d));
if (annRan0() < Prob) { // add this one to flat
control[c][d] = CO_FLAG; // flag this entry
n_dim--; // one fewer dim to fill
}
else { // don't take this one
control[c][d] = annRanUnif(-1,1);// random value in [-1,1]
}
}
}
}
int offset = 0; // offset in pa array
for (int c = 0; c < n_clus; c++) { // generate clusters
int pick = (n+c)/n_clus; // number of points to pick
// generate the points
genOrthFlat(pa+offset, pick, dim, control[c], std_dev);
offset += pick; // increment offset
}
}
//----------------------------------------------------------------------
// annClusEllipsoids - points clustered around axis-aligned ellipsoids
//
// This distribution consists of a collection points clustered
// among a collection of low dimensional ellipsoids whose axes
// are alligned with the coordinate axes in the hypercube [-1,1]^d.
// The objective is to model distributions in which the points are
// distributed in lower dimensional subspaces, and within this
// lower dimensional space the points are distributed with a
// Gaussian distribution (with no correlation between the
// dimensions).
//
// The distribution is given the number of clusters or "colors"
// (n_clus), maximum number of dimensions (max_dim) of the lower
// dimensional subspace, a "small" standard deviation
// (std_dev_small), and a "large" standard deviation range
// (std_dev_lo, std_dev_hi).
//
// The algorithm generates n_clus cluster centers uniformly from
// the hypercube [-1,1]^d. For each cluster, it selects the
// dimension of the subspace as a random number r between 1 and
// max_dim. These are the dimensions of the ellipsoid. Then it
// generates a d-element std dev vector whose entries are the
// standard deviation for the coordinates of each cluster in the
// distribution. Among the d-element control vector, r randomly
// chosen values are chosen uniformly from the range [std_dev_lo,
// std_dev_hi]. The remaining values are set to std_dev_small.
//
// Note that annClusGaussPts is a special case of this in which
// max_dim = 0, and std_dev = std_dev_small.
//
// If the flag new_clust is set, then new cluster centers are
// generated.
//
// This procedure makes use of the utility procedure genGauss
// which generates points distributed according to a Gaussian
// distribution.
//
//----------------------------------------------------------------------
static void genGauss( // generate points on a general Gaussian
ANNpointArray pa, // point array
int n, // number of points
int dim, // dimension
double *center, // center vector
double *std_dev) // standard deviation vector
{
for (int i = 0; i < n; i++) {
for (int d = 0; d < dim; d++) {
pa[i][d] = (ANNcoord) (std_dev[d]*annRanGauss() + center[d]);
}
}
}
void annClusEllipsoids( // clustered around ellipsoids
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
int n_clus, // number of colors
ANNbool new_clust, // generate new clusters.
double std_dev_small, // small standard deviation
double std_dev_lo, // low standard deviation for ellipses
double std_dev_hi, // high standard deviation for ellipses
int max_dim) // maximum dimension of the flats
{
static ANNpointArray centers = NULL; // cluster centers
static ANNpointArray std_dev = NULL; // standard deviations
if (centers == NULL || new_clust) { // need new cluster centers
if (centers != NULL) // clusters already exist
annDeallocPts(centers); // get rid of them
if (std_dev != NULL) // std deviations already exist
annDeallocPts(std_dev); // get rid of them
centers = annAllocPts(n_clus, dim); // alloc new clusters and devs
std_dev = annAllocPts(n_clus, dim);
for (int i = 0; i < n_clus; i++) { // gen cluster center coords
for (int d = 0; d < dim; d++) {
centers[i][d] = (ANNcoord) annRanUnif(-1,1);
}
}
for (int c = 0; c < n_clus; c++) { // generate cluster std dev
int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat
for (int d = 0; d < dim; d++) { // generate std dev's
// prob. of picking next dim
double Prob = ((double) n_dim)/((double) (dim-d));
if (annRan0() < Prob) { // add this one to ellipse
// generate random std dev
std_dev[c][d] = annRanUnif(std_dev_lo, std_dev_hi);
n_dim--; // one fewer dim to fill
}
else { // don't take this one
std_dev[c][d] = std_dev_small;// use small std dev
}
}
}
}
int offset = 0; // next slot to fill
for (int c = 0; c < n_clus; c++) { // generate clusters
int pick = (n+c)/n_clus; // number of points to pick
// generate the points
genGauss(pa+offset, pick, dim, centers[c], std_dev[c]);
offset += pick; // increment offset in array
}
}
//----------------------------------------------------------------------
// annPlanted - Generates points from a "planted" distribution
// In high dimensional spaces, interpoint distances tend to be
// highly clustered around the mean value. Approximate nearest
// neighbor searching makes little sense in this context, unless it
// is the case that each query point is significantly closer to its
// nearest neighbor than to other points. Thus, the query points
// should be planted close to the data points. Given a source data
// set, this procedure generates a set of query points having this
// property.
//
// We are given a source data array and a standard deviation. We
// generate points as follows. We select a random point from the
// source data set, and we generate a Gaussian point centered about
// this random point and perturbed by a normal distributed random
// variable with mean zero and the given standard deviation along
// each coordinate.
//
// Note that this essentially the same a clustered Gaussian
// distribution, but where the cluster centers are given by the
// source data set.
//----------------------------------------------------------------------
void annPlanted( // planted nearest neighbors
ANNpointArray pa, // point array (modified)
int n, // number of points
int dim, // dimension
ANNpointArray src, // source point array
int n_src, // source size
double std_dev) // standard deviation about source
{
for (int i = 0; i < n; i++) {
int c = annRanInt(n_src); // generate source index
for (int d = 0; d < dim; d++) {
pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + src[c][d]);
}
}
}