This repository has been archived by the owner on Jan 2, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 102
/
bootstrap.py
461 lines (371 loc) · 18.3 KB
/
bootstrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
'''Functions that allow one to create bootstrapped confidence intervals'''
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import numpy as _np
import multiprocessing as _multiprocessing
import scipy.sparse as _sparse
class BootstrapResults(object):
def __init__(self, lower_bound, value, upper_bound):
self.lower_bound = lower_bound
self.upper_bound = upper_bound
self.value = value
if self.lower_bound > self.upper_bound:
self.lower_bound, self.upper_bound = self.upper_bound, self.lower_bound
def __str__(self):
return '{1} ({0}, {2})'.format(self.lower_bound, self.value,
self.upper_bound)
def __repr__(self):
return self.__str__()
def _apply(self, other, func):
return BootstrapResults(func(self.lower_bound, other),
func(self.value, other),
func(self.upper_bound, other))
def __add__(self, other):
return self._apply(float(other), lambda x, other: other + x)
def __radd__(self, other):
return self._apply(float(other), lambda x, other: other + x)
def __sub__(self, other):
return self._apply(float(other), lambda x, other: x - other)
def __rsub__(self, other):
return self._apply(float(other), lambda x, other: other - x)
def __mul__(self, other):
return self._apply(float(other), lambda x, other: x * other)
def __rmul__(self, other):
return self._apply(float(other), lambda x, other: x * other)
def error_width(self):
'''Returns: upper_bound - lower_bound'''
return self.upper_bound - self.lower_bound
def error_fraction(self):
'''Returns the error_width / value'''
if self.value == 0:
return _np.inf
else:
return self.error_width() / self.value
def is_significant(self):
return _np.sign(self.upper_bound) == _np.sign(self.lower_bound)
def get_result(self):
'''Returns:
-1 if statistically significantly negative
+1 if statistically significantly positive
0 otherwise
'''
return int(self.is_significant()) * _np.sign(self.value)
def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal):
'''Get the bootstrap confidence interval for a given distribution.
Args:
bootstrap_distribution: numpy array of bootstrap results from
bootstrap_distribution() or bootstrap_ab_distribution()
stat_val: The overall statistic that this method is attempting to
calculate error bars for.
alpha: The alpha value for the confidence intervals.
is_pivotal: if true, use the pivotal method. if false, use the
percentile method.
'''
if is_pivotal:
low = 2 * stat_val - _np.percentile(bootstrap_dist, 100 * (1 - alpha / 2.))
val = stat_val
high = 2 * stat_val - _np.percentile(bootstrap_dist, 100 * (alpha / 2.))
else:
low = _np.percentile(bootstrap_dist, 100 * (alpha / 2.))
val = _np.percentile(bootstrap_dist, 50)
high = _np.percentile(bootstrap_dist, 100 * (1 - alpha / 2.))
return BootstrapResults(low, val, high)
def _needs_sparse_unification(values_lists):
non_zeros = values_lists[0] != 0
for v in values_lists:
v_nz = v != 0
non_zeros = (non_zeros + v_nz) > 0
non_zero_size = non_zeros.sum()
for v in values_lists:
if non_zero_size != v.data.shape[0]:
return True
return False
def _validate_arrays(values_lists):
t = values_lists[0]
t_type = type(t)
if not isinstance(t, _sparse.csr_matrix) and not isinstance(t, _np.ndarray):
raise ValueError(('The arrays must either be of type '
'scipy.sparse.csr_matrix or numpy.array'))
for _, values in enumerate(values_lists[1:]):
if not isinstance(values, t_type):
raise ValueError('The arrays must all be of the same type')
if t.shape != values.shape:
raise ValueError('The arrays must all be of the same shape')
if isinstance(t, _sparse.csr_matrix):
if values.shape[0] > 1:
raise ValueError(('The sparse matrix must have shape 1 row X N'
' columns'))
if isinstance(t, _sparse.csr_matrix):
if _needs_sparse_unification(values_lists):
raise ValueError(('The non-zero entries in the sparse arrays'
' must be aligned: see '
'bootstrapped.unify_sparse_vectors function'))
def _generate_distributions(values_lists, num_iterations):
if isinstance(values_lists[0], _sparse.csr_matrix):
# in the sparse case we dont actually need to bootstrap
# the full sparse array since most values are 0
# instead for each bootstrap iteration we:
# 1. generate B number of non-zero entries to sample from the
# binomial distribution
# 2. resample with replacement the non-zero entries from values
# B times
# 3. create a new sparse array with the B resamples, zero otherwise
results = [[] for _ in range(len(values_lists))]
pop_size = values_lists[0].shape[1]
non_sparse_size = values_lists[0].data.shape[0]
p = non_sparse_size * 1.0 / pop_size
for _ in range(num_iterations):
ids = _np.random.choice(
non_sparse_size,
_np.random.binomial(pop_size, p),
replace=True,
)
for arr, values in zip(results, values_lists):
data = values.data
d = _sparse.csr_matrix(
(
data[ids],
(_np.zeros_like(ids), _np.arange(len(ids)))
),
shape=(1, pop_size),
)
arr.append(d)
return [_sparse.vstack(r) for r in results]
else:
values_shape = values_lists[0].shape[0]
ids = _np.random.choice(
values_shape,
(num_iterations, values_shape),
replace=True,
)
results = [values[ids] for values in values_lists]
return results
def _bootstrap_sim(values_lists, stat_func_lists, num_iterations,
iteration_batch_size, seed):
'''Returns simulated bootstrap distribution.
See bootstrap() funciton for arg descriptions.
'''
if seed is not None:
_np.random.seed(seed)
num_iterations = int(num_iterations)
iteration_batch_size = int(iteration_batch_size)
results = [[] for _ in values_lists]
for rng in range(0, num_iterations, iteration_batch_size):
max_rng = min(iteration_batch_size, num_iterations - rng)
values_sims = _generate_distributions(values_lists, max_rng)
for i, values_sim, stat_func in zip(range(len(values_sims)), values_sims, stat_func_lists):
results[i].extend(stat_func(values_sim))
return _np.array(results)
def _bootstrap_distribution(values_lists, stat_func_lists,
num_iterations, iteration_batch_size, num_threads):
'''Returns the simulated bootstrap distribution. The idea is to sample the same
indexes in a bootstrap re-sample across all arrays passed into values_lists.
This is especially useful when you want to co-sample records in a ratio metric.
numerator[k].sum() / denominator[k].sum()
and not
numerator[ j ].sum() / denominator[k].sum()
Args:
values_lists: list of numpy arrays (or scipy.sparse.csr_matrix)
each represents a set of values to bootstrap. All arrays in values_lists
must be of the same length.
stat_func_lists: statistic to bootstrap for each element in values_lists.
num_iterations: number of bootstrap iterations / resamples / simulations
to perform.
iteration_batch_size: The bootstrap sample can generate very large
matrices. This argument limits the memory footprint by
batching bootstrap rounds. If unspecified the underlying code
will produce a matrix of len(values) x num_iterations. If specified
the code will produce sets of len(values) x iteration_batch_size
(one at a time) until num_iterations have been simulated.
Defaults to no batching.
num_threads: The number of therads to use. This speeds up calculation of
the bootstrap. Defaults to 1. If -1 is specified then
multiprocessing.cpu_count() is used instead.
Returns:
The set of bootstrap resamples where each stat_function is applied on
the bootsrapped values.
'''
_validate_arrays(values_lists)
if iteration_batch_size is None:
iteration_batch_size = num_iterations
num_iterations = int(num_iterations)
iteration_batch_size = int(iteration_batch_size)
num_threads = int(num_threads)
if num_threads == -1:
num_threads = _multiprocessing.cpu_count()
if num_threads <= 1:
results = _bootstrap_sim(values_lists, stat_func_lists,
num_iterations, iteration_batch_size, None)
else:
pool = _multiprocessing.Pool(num_threads)
iter_per_job = _np.ceil(num_iterations * 1.0 / num_threads)
results = []
for seed in _np.random.randint(0, 2**32 - 1, num_threads):
r = pool.apply_async(_bootstrap_sim, (values_lists, stat_func_lists,
iter_per_job,
iteration_batch_size, seed))
results.append(r)
results = _np.hstack([res.get() for res in results])
pool.close()
return results
def bootstrap(values, stat_func, denominator_values=None, alpha=0.05,
num_iterations=10000, iteration_batch_size=10, is_pivotal=True,
num_threads=1, return_distribution=False):
'''Returns bootstrap estimate.
Args:
values: numpy array (or scipy.sparse.csr_matrix) of values to bootstrap
stat_func: statistic to bootstrap. We provide several default functions:
* stat_functions.mean
* stat_functions.sum
* stat_functions.std
denominator_values: optional array that does division after the
statistic is aggregated. This lets you compute group level division
statistics. One corresponding entry per record in @values.
Example:
SUM(value) / SUM(denom) instead of MEAN(value / denom)
Ex. Cost Per Click
cost per click across a group
SUM(revenue) / SUM(clicks)
mean cost per click for each
MEAN(revenue / clicks)
alpha: alpha value representing the confidence interval.
Defaults to 0.05, i.e., 95th-CI.
num_iterations: number of bootstrap iterations to run. The higher this
number the more sure you can be about the stability your bootstrap.
By this - we mean the returned interval should be consistent across
runs for the same input. This also consumes more memory and makes
analysis slower. Defaults to 10000.
iteration_batch_size: The bootstrap sample can generate very large
matrices. This argument limits the memory footprint by
batching bootstrap rounds. If unspecified the underlying code
will produce a matrix of len(values) x num_iterations. If specified
the code will produce sets of len(values) x iteration_batch_size
(one at a time) until num_iterations have been simulated.
Defaults to 10. Passing None will calculate the full simulation in one step.
is_pivotal: if true, use the pivotal method for bootstrapping confidence
intervals. If false, use the percentile method.
num_threads: The number of therads to use. This speeds up calculation of
the bootstrap. Defaults to 1. If -1 is specified then
multiprocessing.cpu_count() is used instead.
Returns:
BootstrapResults representing CI and estimated value.
'''
if denominator_values is None:
values_lists = [values]
stat_func_lists = [stat_func]
def do_division(x):
return x
stat_val = stat_func(values)[0]
else:
values_lists = [values, denominator_values]
stat_func_lists = [stat_func] * 2
def do_division(num, denom):
return num / denom
stat_val = stat_func(values)[0] / stat_func(denominator_values)[0]
distribution_results = _bootstrap_distribution(values_lists,
stat_func_lists,
num_iterations,
iteration_batch_size,
num_threads)
bootstrap_dist = do_division(*distribution_results)
if return_distribution:
return bootstrap_dist
else:
return _get_confidence_interval(bootstrap_dist, stat_val, alpha,
is_pivotal)
def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None,
ctrl_denominator=None, alpha=0.05, num_iterations=10000,
iteration_batch_size=10, scale_test_by=1.0,
is_pivotal=True, num_threads=1, return_distribution=False):
'''Returns bootstrap confidence intervals for an A/B test.
Args:
test: numpy array (or scipy.sparse.csr_matrix) of test results
ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results
stat_func: statistic to bootstrap. We provide several default functions:
* stat_functions.mean
* stat_functions.sum
* stat_functions.std
compare_func: Function to compare test and control against.
* compare_functions.difference
* compare_functions.percent_change
* compare_functions.ratio
* compare_functions.percent_difference
test_denominator: optional array that does division after the statistic
is aggregated. This lets you compute group level division
statistics. One corresponding entry per record in test.
Example:
SUM(value) / SUM(denom) instead of MEAN(value / denom)
Ex. Cost Per Click
cost per click across a group (clicks is denominator)
SUM(revenue) / SUM(clicks)
mean cost per click for each record
MEAN(revenue / clicks)
ctrl_denominator: see test_denominator.
alpha: alpha value representing the confidence interval.
Defaults to 0.05, i.e., 95th-CI.
num_iterations: number of bootstrap iterations to run. The higher this
number the more sure you can be about the stability your bootstrap.
By this - we mean the returned interval should be consistent across
runs for the same input. This also consumes more memory and makes
analysis slower.
iteration_batch_size: The bootstrap sample can generate very large
arrays. This function iteration_batch_size limits the memory
footprint by batching bootstrap rounds. Defaults to 10. Passing None
will attempt to calculate the full simulation in one step.
scale_test_by: The ratio between test and control population
sizes. Use this if your test and control split is different from a
50/50 split. Defaults to 1.0.
is_pivotal: if true, use the pivotal method for bootstrapping confidence
intervals. If false, use the percentile method.
num_threads: The number of therads to use. This speeds up calculation of
the bootstrap. Defaults to 1. If -1 is specified then
multiprocessing.cpu_count() is used instead.
Returns:
BootstrapResults representing CI and estimated value.
'''
both_denominators = test_denominator is not None and \
ctrl_denominator is not None
both_numerators = test is not None and ctrl is not None
if both_numerators and not both_denominators:
test_lists = [test]
ctrl_lists = [ctrl]
stat_func_lists = [stat_func]
def do_division(x):
return x
test_val = stat_func(test)[0]
ctrl_val = stat_func(ctrl)[0]
elif both_numerators and both_denominators:
test_lists = [test, test_denominator]
ctrl_lists = [ctrl, ctrl_denominator]
stat_func_lists = [stat_func] * 2
def do_division(num, denom):
return num / denom
test_val = stat_func(test)[0] / stat_func(test_denominator)[0]
ctrl_val = stat_func(ctrl)[0] / stat_func(ctrl_denominator)[0]
elif not both_numerators:
raise ValueError('Both test and ctrl numerators must be specified.')
else:
raise ValueError('Both test and ctrl denominators must be specified.')
test_results = _bootstrap_distribution(test_lists, stat_func_lists,
num_iterations, iteration_batch_size,
num_threads)
ctrl_results = _bootstrap_distribution(ctrl_lists, stat_func_lists,
num_iterations, iteration_batch_size,
num_threads)
test_dist = do_division(*test_results)
ctrl_dist = do_division(*ctrl_results)
test_ctrl_dist = compare_func(test_dist * scale_test_by, ctrl_dist)
if return_distribution:
return test_ctrl_dist
else:
test_ctrl_val = compare_func(test_val * scale_test_by, ctrl_val)
return _get_confidence_interval(test_ctrl_dist, test_ctrl_val, alpha,
is_pivotal)