diff --git a/datascience/tables.py b/datascience/tables.py index e5ee642b8..92214422e 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -253,6 +253,7 @@ def column(self, index_or_label): ... 'letter', make_array('c', 'd'), ... 'count', make_array(2, 4), ... ) + >>> tiles.column('letter') array(['c', 'd'], dtype='>> titanic = Table().with_columns('age', make_array(21, 44, 56, 89, 95 + ... , 40, 80, 45), 'survival', make_array(0,0,0,1, 1, 1, 0, 1), + ... 'gender', make_array('M', 'M', 'M', 'M', 'F', 'F', 'F', 'F'), + ... 'prediction', make_array(0, 0, 1, 1, 0, 1, 0, 1)) + >>> titanic + age | survival | gender | prediction + 21 | 0 | M | 0 + 44 | 0 | M | 0 + 56 | 0 | M | 1 + 89 | 1 | M | 1 + 95 | 1 | F | 0 + 40 | 1 | F | 1 + 80 | 0 | F | 0 + 45 | 1 | F | 1 + >>> titanic.pivot('survival', 'gender') + gender | 0 | 1 + F | 1 | 3 + M | 3 | 1 + >>> titanic.pivot('prediction', 'gender') + gender | 0 | 1 + F | 2 | 2 + M | 2 | 2 + >>> titanic.pivot('survival', 'gender', values='age', collect = np.mean) + gender | 0 | 1 + F | 80 | 60 + M | 40.3333 | 89 + >>> titanic.pivot('survival', make_array('prediction', 'gender')) + prediction | gender | 0 | 1 + 0 | F | 1 | 1 + 0 | M | 2 | 0 + 1 | F | 0 | 2 + 1 | M | 1 | 1 + >>> titanic.pivot('survival', 'gender', values = 'age') + Traceback (most recent call last): + ... + TypeError: values requires collect to be specified + >>> titanic.pivot('survival', 'gender', collect = np.mean) + Traceback (most recent call last): + ... + TypeError: collect requires values to be specified """ if collect is not None and values is None: raise TypeError('collect requires values to be specified') @@ -1202,23 +1259,26 @@ def sample(self, k=None, with_replacement=True, weights=None): """Returns a new table where k rows are randomly sampled from the original table. - Kwargs: - k (int or None): If None (default), all the rows in the table are - sampled. If an integer, k rows from the original table are - sampled. 
+ Args: + ``k`` -- the number of rows (``int``) to sample from the table. + Defaults to the number of rows in the table. + + ``with_replacement`` -- (``bool``) If True (the default), samples + ``k`` rows with replacement; if False, samples ``k`` rows + without replacement. - with_replacement (bool): If True (default), samples the rows with - replacement. If False, samples the rows without replacement. + ``weights`` -- Array whose ith entry is the probability that the + ith row of the table is sampled. Defaults to None, which samples + each row with equal probability. ``weights`` must be a valid + probability distribution -- i.e. an array with one entry per row + of the table, summing to 1. - weights (list/array or None): If None (default), samples the rows - using a uniform random distribution. If a list/array is passed - in, it must be the same length as the number of rows in the - table and the values must sum to 1. The rows will then be - sampled according the the probability distribution in - ``weights``. + Raises: + ValueError -- if the length of ``weights`` does not equal the number + of rows in the table, or if ``weights`` does not sum to 1. Returns: - A new instance of ``Table``. + A new instance of ``Table`` containing the ``k`` sampled rows. >>> jobs = Table().with_columns( ... 'job', make_array('a', 'b', 'c', 'd'), @@ -1250,6 +1310,16 @@ def sample(self, k=None, with_replacement=True, weights=None): job | wage a | 10 a | 10 + >>> jobs.sample(k = 2, weights = make_array(1, 0, 1, 0)) + Traceback (most recent call last): + ... + ValueError: probabilities do not sum to 1 + + # The length of weights must equal the number of rows in the table. + >>> jobs.sample(k = 2, weights = make_array(1, 0, 0)) + Traceback (most recent call last): + ... + ValueError: a and p must have same size """ n = self.num_rows if k is None: