From e67ec80a7822f60aa7223662cf4e56dd3d0e3d6d Mon Sep 17 00:00:00 2001 From: Maxwell Weinstein Date: Sat, 15 Oct 2016 22:41:00 -0700 Subject: [PATCH 1/7] replaced all instances of list usage in docs with make_array --- datascience/tables.py | 161 ++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 83 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index efdad6321..f5c0df0d8 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -31,7 +31,7 @@ class Table(collections.abc.MutableMapping): def __init__(self, labels=None, _deprecated=None, *, formatter=_formats.default_formatter): """Create an empty table with column labels. - >>> tiles = Table(['letter', 'count', 'points']) + >>> tiles = Table(make_array('letter', 'count', 'points')) >>> tiles letter | count | points @@ -324,10 +324,10 @@ def apply(self, fn, column_label=None): A numpy array consisting of results of applying ``fn`` to elements specified by ``column_label`` in each row. - >>> t = Table().with_columns([ - ... 'letter', ['a', 'b', 'c', 'z'], - ... 'count', [9, 3, 3, 1], - ... 'points', [1, 2, 2, 10]]) + >>> t = Table().with_columns( + ... 'letter', make_array('a', 'b', 'c', 'z'), + ... 'count', make_array(9, 3, 3, 1), + ... 'points', make_array(1, 2, 2, 10)) >>> t letter | count | points a | 9 | 1 @@ -403,8 +403,8 @@ def append_column(self, label, values): """Appends a column to the table or replaces a column. ``__setitem__`` is aliased to this method: - ``table.append_column('new_col', [1, 2, 3])`` is equivalent to - ``table['new_col'] = [1, 2, 3]``. + ``table.append_column('new_col', make_array(1, 2, 3))`` is equivalent to + ``table['new_col'] = make_array(1, 2, 3)``. Args: ``label`` (str): The label of the new column. @@ -424,17 +424,17 @@ def append_column(self, label, values): - ``values`` is a list/array and does not have the same length as the number of rows in the table. - >>> table = Table().with_columns([ - ... 
'letter', ['a', 'b', 'c', 'z'], - ... 'count', [9, 3, 3, 1], - ... 'points', [1, 2, 2, 10]]) + >>> table = Table().with_columns( + ... 'letter', make_array('a', 'b', 'c', 'z'), + ... 'count', make_array(9, 3, 3, 1), + ... 'points', make_array(1, 2, 2, 10)) >>> table letter | count | points a | 9 | 1 b | 3 | 2 c | 3 | 2 z | 1 | 10 - >>> table.append_column('new_col1', [10, 20, 30, 40]) + >>> table.append_column('new_col1', make_array(10, 20, 30, 40)) >>> table letter | count | points | new_col1 a | 9 | 1 | 10 @@ -448,7 +448,7 @@ def append_column(self, label, values): b | 3 | 2 | 20 | hello c | 3 | 2 | 30 | hello z | 1 | 10 | 40 | hello - >>> table.append_column(123, [1, 2, 3, 4]) + >>> table.append_column(123, make_array(1, 2, 3, 4)) Traceback (most recent call last): ... ValueError: The column label must be a string, but a int was given @@ -495,30 +495,23 @@ def relabel(self, column_label, new_label): Returns: Original table with modified labels - >>> table = Table().with_columns([ + >>> table = Table().with_columns( ... 'points', (1, 2, 3), - ... 'id', (12345, 123, 5123)]) + ... 'id', (12345, 123, 5123)) >>> table.relabel('id', 'yolo') points | yolo 1 | 12345 2 | 123 3 | 5123 - >>> table.relabel(['points', 'yolo'], ['red', 'blue']) + >>> table.relabel(make_array('points', 'yolo'), make_array('red', 'blue')) red | blue 1 | 12345 2 | 123 3 | 5123 - >>> table.relabel(['red', 'green', 'blue'], - ... ['cyan', 'magenta', 'yellow', 'key']) + >>> table.relabel(make_array('red', 'green', 'blue'), make_array('cyan', 'magenta', 'yellow', 'key')) Traceback (most recent call last): ... - ValueError: Invalid arguments. column_label and new_label must be of - equal length. - >>> table.relabel(['red', 'blue'], ['blue', 'red']) - blue | red - 1 | 12345 - 2 | 123 - 3 | 5123 + ValueError: Invalid arguments. column_label and new_label must be of equal length. 
""" if isinstance(column_label, numbers.Integral): column_label = self._as_label(column_label) @@ -635,10 +628,10 @@ def drop(self, *column_label_or_labels): Returns: An instance of ``Table`` with given columns removed. - >>> t = Table().with_columns([ - ... 'burgers', ['cheeseburger', 'hamburger', 'veggie burger'], - ... 'prices', [6, 5, 5], - ... 'calories', [743, 651, 582]]) + >>> t = Table().with_columns( + ... 'burgers', make_array('cheeseburger', 'hamburger', 'veggie burger'), + ... 'prices', make_array(6, 5, 5), + ... 'calories', make_array(743, 651, 582)) >>> t burgers | prices | calories cheeseburger | 6 | 743 @@ -792,11 +785,11 @@ def sort(self, column_or_label, descending=False, distinct=False): An instance of ``Table`` containing rows sorted based on the values in ``column_or_label``. - >>> marbles = Table().with_columns([ - ... "Color", ["Red", "Green", "Blue", "Red", "Green", "Green"], - ... "Shape", ["Round", "Rectangular", "Rectangular", "Round", "Rectangular", "Round"], - ... "Amount", [4, 6, 12, 7, 9, 2], - ... "Price", [1.30, 1.30, 2.00, 1.75, 1.40, 1.00]]) + >>> marbles = Table().with_columns( + ... "Color", make_array("Red", "Green", "Blue", "Red", "Green", "Green"), + ... "Shape", make_array("Round", "Rectangular", "Rectangular", "Round", "Rectangular", "Round"), + ... "Amount", make_array(4, 6, 12, 7, 9, 2), + ... "Price", make_array(1.30, 1.30, 2.00, 1.75, 1.40, 1.00)) >>> marbles Color | Shape | Amount | Price Red | Round | 4 | 1.3 @@ -868,11 +861,11 @@ def group(self, column_or_label, collect=None): accept arguments with one of the column types, that column will be empty in the resulting table. - >>> marbles = Table().with_columns([ - ... "Color", ["Red", "Green", "Blue", "Red", "Green", "Green"], - ... "Shape", ["Round", "Rectangular", "Rectangular", "Round", "Rectangular", "Round"], - ... "Amount", [4, 6, 12, 7, 9, 2], - ... "Price", [1.30, 1.30, 2.00, 1.75, 1.40, 1.00]]) + >>> marbles = Table().with_columns( + ... 
"Color", make_array("Red", "Green", "Blue", "Red", "Green", "Green"), + ... "Shape", make_array("Round", "Rectangular", "Rectangular", "Round", "Rectangular", "Round"), + ... "Amount", make_array(4, 6, 12, 7, 9, 2), + ... "Price", make_array(1.30, 1.30, 2.00, 1.75, 1.40, 1.00)) >>> marbles Color | Shape | Amount | Price Red | Round | 4 | 1.3 @@ -949,11 +942,11 @@ def groups(self, labels, collect=None): accept arguments with one of the column types, that column will be empty in the resulting table. - >>> marbles = Table().with_columns([ - ... "Color", ["Red", "Green", "Blue", "Red", "Green", "Green"], - ... "Shape", ["Round", "Rectangular", "Rectangular", "Round", "Rectangular", "Round"], - ... "Amount", [4, 6, 12, 7, 9, 2], - ... "Price", [1.30, 1.30, 2.00, 1.75, 1.40, 1.00]]) + >>> marbles = Table().with_columns( + ... "Color", make_array("Red", "Green", "Blue", "Red", "Green", "Green"), + ... "Shape", make_array("Round", "Rectangular", "Rectangular", "Round", "Rectangular", "Round"), + ... "Amount", make_array(4, 6, 12, 7, 9, 2), + ... "Price", make_array(1.30, 1.30, 2.00, 1.75, 1.40, 1.00)) >>> marbles Color | Shape | Amount | Price Red | Round | 4 | 1.3 @@ -1184,9 +1177,9 @@ def percentile(self, p): pth percentile of a column is the smallest value that at at least as large as the p% of numbers in the column. - >>> table = Table().with_columns([ - ... 'count', [9, 3, 3, 1], - ... 'points', [1, 2, 2, 10]]) + >>> table = Table().with_columns( + ... 'count', make_array(9, 3, 3, 1), + ... 'points', make_array(1, 2, 2, 10)) >>> table count | points 9 | 1 @@ -1222,9 +1215,9 @@ def sample(self, k=None, with_replacement=True, weights=None): Returns: A new instance of ``Table``. - >>> jobs = Table().with_columns([ - ... 'job', ['a', 'b', 'c', 'd'], - ... 'wage', [10, 20, 15, 8]]) + >>> jobs = Table().with_columns( + ... 'job', make_array('a', 'b', 'c', 'd'), + ... 
'wage', make_array(10, 20, 15, 8)) >>> jobs job | wage a | 10 @@ -1248,7 +1241,7 @@ def sample(self, k=None, with_replacement=True, weights=None): b | 20 c | 15 >>> jobs.sample(k = 2, with_replacement = True, - ... weights = [0.5, 0.5, 0, 0]) # doctest: +SKIP + ... weights = make_array(0.5, 0.5, 0, 0)) # doctest: +SKIP job | wage a | 10 a | 10 @@ -1306,9 +1299,9 @@ def split(self, k): Returns: A tuple containing two instances of ``Table``. - >>> jobs = Table().with_columns([ - ... 'job', ['a', 'b', 'c', 'd'], - ... 'wage', [10, 20, 15, 8]]) + >>> jobs = Table().with_columns( + ... 'job', make_array('a', 'b', 'c', 'd'), + ... 'wage', make_array(10, 20, 15, 8)) >>> jobs job | wage a | 10 @@ -1348,7 +1341,7 @@ def with_row(self, row): Raises: ``ValueError``: If the row length differs from the column count. - >>> tiles = Table(['letter', 'count', 'points']) + >>> tiles = Table(make_array('letter', 'count', 'points')) >>> tiles.with_row(['c', 2, 3]).with_row(['d', 4, 2]) letter | count | points c | 2 | 3 @@ -1369,8 +1362,9 @@ def with_rows(self, rows): Raises: ``ValueError``: If a row length differs from the column count. - >>> tiles = Table(['letter', 'count', 'points']) - >>> tiles.with_rows([['c', 2, 3], ['d', 4, 2]]) + >>> tiles = Table(make_array('letter', 'count', 'points')) + >>> tiles.with_rows(make_array(make_array('c', 2, 3), + ... make_array('d', 4, 2))) letter | count | points c | 2 | 3 d | 4 | 2 @@ -1523,7 +1517,8 @@ def relabeled(self, label, new_label): columns to be changed. Same number of elements as label. >>> tiles = Table(['letter', 'count']) - >>> tiles = tiles.with_rows([['c', 2], ['d', 4]]) + >>> tiles = tiles.with_rows( + ... make_array(make_array('c', 2), make_array('d', 4))) >>> tiles.relabeled('count', 'number') letter | number c | 2 @@ -1677,9 +1672,9 @@ def to_csv(self, filename): Returns: None, outputs a file with name ``filename``. - >>> jobs = Table().with_columns([ - ... 'job', ['a', 'b', 'c', 'd'], - ... 
'wage', [10, 20, 15, 8]]) + >>> jobs = Table().with_columns( + ... 'job', make_array('a', 'b', 'c', 'd'), + ... 'wage', make_array(10, 20, 15, 8)) >>> jobs job | wage a | 10 @@ -1826,11 +1821,11 @@ def barh(self, column_for_categories=None, select=None, overlay=True, **vargs): See http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.barh for additional arguments that can be passed into vargs. - >>> t = Table().with_columns([ - ... 'Furniture', ['chairs', 'tables', 'desks'], - ... 'Count', [6, 1, 2], - ... 'Price', [10, 20, 30] - ... ]) + >>> t = Table().with_columns( + ... 'Furniture', make_array('chairs', 'tables', 'desks'), + ... 'Count', make_array(6, 1, 2), + ... 'Price', make_array(10, 20, 30) + ... ) >>> t Furniture | Count | Price chairs | 6 | 10 @@ -1908,10 +1903,10 @@ def scatter(self, column_for_x, select=None, overlay=True, fit_line=False, ``labels``: A column of text labels to annotate dots - >>> table = Table().with_columns([ - ... 'x', [9, 3, 3, 1], - ... 'y', [1, 2, 2, 10], - ... 'z', [3, 4, 5, 6]]) + >>> table = Table().with_columns( + ... 'x', make_array(9, 3, 3, 1), + ... 'y', make_array(1, 2, 2, 10), + ... 'z', make_array(3, 4, 5, 6)) >>> table x | y | z 9 | 1 | 3 @@ -2056,9 +2051,9 @@ def hist(self, select=None, overlay=True, bins=None, counts=None, unit=None, **v include: `range`, `normed`, `cumulative`, and `orientation`, to name a few. - >>> t = Table().with_columns([ - ... 'count', [9, 3, 3, 1], - ... 'points', [1, 2, 2, 10]]) + >>> t = Table().with_columns( + ... 'count', make_array(9, 3, 3, 1), + ... 'points', make_array(1, 2, 2, 10)) >>> t count | points 9 | 1 @@ -2069,9 +2064,9 @@ def hist(self, select=None, overlay=True, bins=None, counts=None, unit=None, **v - >>> t = Table().with_columns([ - ... 'value', [101, 102, 103], - ... 'proportion', [0.25, 0.5, 0.25]]) + >>> t = Table().with_columns( + ... 'value', make_array(101, 102, 103), + ... 
'proportion', make_array(0.25, 0.5, 0.25)) >>> t.hist(counts='value') # doctest: +SKIP """ @@ -2171,9 +2166,9 @@ def boxplot(self, **vargs): Raises: ValueError: The Table contains columns with non-numerical values. - >>> table = Table().with_columns([ - ... 'test1', [92.5, 88, 72, 71, 99, 100, 95, 83, 94, 93], - ... 'test2', [89, 84, 74, 66, 92, 99, 88, 81, 95, 94]]) + >>> table = Table().with_columns( + ... 'test1', make_array(92.5, 88, 72, 71, 99, 100, 95, 83, 94, 93), + ... 'test2', make_array(89, 84, 74, 66, 92, 99, 88, 81, 95, 94)) >>> table test1 | test2 92.5 | 89 @@ -2446,9 +2441,9 @@ def __getitem__(self, row_indices_or_slice): Returns: A new instance of ``Table``. - >>> t = Table().with_columns([ - ... 'letter grade', ['A+', 'A', 'A-', 'B+', 'B', 'B-'], - ... 'gpa', [4, 4, 3.7, 3.3, 3, 2.7]]) + >>> t = Table().with_columns( + ... 'letter grade', make_array('A+', 'A', 'A-', 'B+', 'B', 'B-'), + ... 'gpa', make_array(4, 4, 3.7, 3.3, 3, 2.7)) >>> t letter grade | gpa A+ | 4 @@ -2471,7 +2466,7 @@ def __getitem__(self, row_indices_or_slice): A- | 3.7 B+ | 3.3 B | 3 - >>> t.exclude([1, 3, 4]) + >>> t.exclude(make_array(1, 3, 4)) letter grade | gpa A+ | 4 A- | 3.7 From 5feba1f1e842ea9aa6dcc7396f81622516c91323 Mon Sep 17 00:00:00 2001 From: Maxwell Weinstein Date: Sat, 15 Oct 2016 23:13:44 -0700 Subject: [PATCH 2/7] resolved merge conflicts --- datascience/tables.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datascience/tables.py b/datascience/tables.py index f5c0df0d8..77c679ba3 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -251,8 +251,11 @@ def column(self, index_or_label): >>> tiles = Table().with_columns( ... 'letter', make_array('c', 'd'), +<<<<<<< e67ec80a7822f60aa7223662cf4e56dd3d0e3d6d ... 'count', make_array(2, 4), ... 
) +======= +>>>>>>> resolved merge conflicts >>> tiles.column('letter') array(['c', 'd'], dtype=' Date: Sat, 15 Oct 2016 23:21:28 -0700 Subject: [PATCH 3/7] fixed last failing test --- datascience/tables.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index 77c679ba3..a177c521d 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -251,11 +251,9 @@ def column(self, index_or_label): >>> tiles = Table().with_columns( ... 'letter', make_array('c', 'd'), -<<<<<<< e67ec80a7822f60aa7223662cf4e56dd3d0e3d6d ... 'count', make_array(2, 4), ... ) -======= ->>>>>>> resolved merge conflicts + >>> tiles.column('letter') array(['c', 'd'], dtype=' Date: Sat, 22 Oct 2016 13:30:50 -0700 Subject: [PATCH 4/7] sample and pivot docs updated --- datascience/tables.py | 95 +++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index a177c521d..3d8b237a8 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -999,15 +999,58 @@ def groups(self, labels, collect=None): return grouped def pivot(self, columns, rows, values=None, collect=None, zero=None): - """Generate a table with a column for rows (or a column for each row - in rows list) and a column for each unique value in columns. Each row - counts/aggregates the values that match both row and column. - - columns -- column label in self - rows -- column label or a list of column labels - values -- column label in self (or None to produce counts) - collect -- aggregation function over values - zero -- zero value for non-existent row-column combinations + """Generate a table with a column for each unique value in ``columns``, + with rows for each unique value in ``rows``. Each row counts/aggregates + the values that match both row and column based on ``collect``. 
+ + Args: + ``columns`` -- a single column label, (``str``), in self, used to + create new columns, based on its unique values in self. + ``rows`` -- row labels, as (``str``) or list of strings, used to + create new rows based on it's unique values. + ``values`` -- column label in self for use in aggregation. + ``collect`` -- aggregation function, used to group ``values`` + over row-column combinations. + ``zero`` -- zero value for non-existent row-column combinations. + + Returns: + New pivot table, with row-column combinations, as specified, with + aggregated ``values`` by ``collect`` across the intersection of + ``columns`` and ``rows``. Simple counts provided if values/collect + is None, as default. + + >>> titanic = Table().with_columns('age', make_array(21, 44, 56, 89, 95 + ... , 40, 80, 45), 'survival', make_array(0,0,0,1, 1, 1, 0, 1), + ... 'gender', make_array('M', 'M', 'M', 'M', 'F', 'F', 'F', 'F'), + ... 'prediction', make_array(0, 0, 1, 1, 0, 1, 0, 1)) + >>> titanic + age | survival | gender | prediction + 21 | 0 | M | 0 + 44 | 0 | M | 0 + 56 | 0 | M | 1 + 89 | 1 | M | 1 + 95 | 1 | F | 0 + 40 | 1 | F | 1 + 80 | 0 | F | 0 + 45 | 1 | F | 1 + >>> titanic.pivot('survival', 'gender') + gender | 0 | 1 + F | 1 | 3 + M | 3 | 1 + >>> titanic.pivot('prediction', 'gender') + gender | 0 | 1 + F | 2 | 2 + M | 2 | 2 + >>> titanic.pivot('survival', 'gender', values='age', collect = np.mean) + gender | 0 | 1 + F | 80 | 60 + M | 40.3333 | 89 + >>> titanic.pivot('survival', make_array('prediction', 'gender')) + prediction | gender | 0 | 1 + 0 | F | 1 | 1 + 0 | M | 2 | 0 + 1 | F | 0 | 2 + 1 | M | 1 | 1 """ if collect is not None and values is None: raise TypeError('collect requires values to be specified') @@ -1198,23 +1241,21 @@ def sample(self, k=None, with_replacement=True, weights=None): """Returns a new table where k rows are randomly sampled from the original table. - Kwargs: - k (int or None): If None (default), all the rows in the table are - sampled. 
If an integer, k rows from the original table are - sampled. + Args: + ``k`` -- specifies the number of rows (``int``) to be sampled from + self. Default is k is equal to number of rows in self. - with_replacement (bool): If True (default), samples the rows with - replacement. If False, samples the rows without replacement. + ``with_replacement`` -- (``boolean``), if true samples ``k`` rows + with replacement from self, else samples ``k`` rows without + replacement. - weights (list/array or None): If None (default), samples the rows - using a uniform random distribution. If a list/array is passed - in, it must be the same length as the number of rows in the - table and the values must sum to 1. The rows will then be - sampled according the the probability distribution in - ``weights``. + ``weights``: Array specifying valid probability distribution. + Rows in self are sampled according the the + probability distribution given by ``weights``. Default is + uniform distribution on [1, ... , n], n = number of rows. Returns: - A new instance of ``Table``. + A new instance of ``Table`` with k rows resampled. >>> jobs = Table().with_columns( ... 'job', make_array('a', 'b', 'c', 'd'), @@ -1246,6 +1287,16 @@ def sample(self, k=None, with_replacement=True, weights=None): job | wage a | 10 a | 10 + >>> jobs.sample(k = 2, weights = make_array(1, 0, 1, 0)) + Traceback (most recent call last): + ... + ValueError: probabilities do not sum to 1 + + # Weights must be length of table. + >>> jobs.sample(k = 2, weights = make_array(1, 0, 0)) + Traceback (most recent call last): + ... 
+ ValueError: a and p must have same size """ n = self.num_rows if k is None: From 580e11016687088e1d61d3449034c787cc3b80e3 Mon Sep 17 00:00:00 2001 From: Maxwell Weinstein Date: Sat, 22 Oct 2016 15:04:29 -0700 Subject: [PATCH 5/7] fixed re: comments --- datascience/tables.py | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index 8b309f179..3c7c39e37 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -1009,15 +1009,19 @@ def pivot(self, columns, rows, values=None, collect=None, zero=None): the values that match both row and column based on ``collect``. Args: - ``columns`` -- a single column label, (``str``), in self, used to - create new columns, based on its unique values in self. - ``rows`` -- row labels, as (``str``) or list of strings, used to + ``columns`` -- a single column label, (``str``), in table, used to + create new columns, based on its unique values. + ``rows`` -- row labels, as (``str``) or array of strings, used to create new rows based on it's unique values. - ``values`` -- column label in self for use in aggregation. + ``values`` -- column label in table for use in aggregation. ``collect`` -- aggregation function, used to group ``values`` over row-column combinations. ``zero`` -- zero value for non-existent row-column combinations. + Raises: + TypeError -- if collect is passed in and values is not, + and vice versa. + Returns: New pivot table, with row-column combinations, as specified, with aggregated ``values`` by ``collect`` across the intersection of @@ -1056,6 +1060,14 @@ def pivot(self, columns, rows, values=None, collect=None, zero=None): 0 | M | 2 | 0 1 | F | 0 | 2 1 | M | 1 | 1 + >>> titanic.pivot('survival', 'gender', values = 'age') + Traceback (most recent call last): + ... 
+ TypeError: values requires collect to be specified + >>> titanic.pivot('survival', 'gender', collect = np.mean) + Traceback (most recent call last): + ... + TypeError: collect requires values to be specified """ if collect is not None and values is None: raise TypeError('collect requires values to be specified') @@ -1248,19 +1260,24 @@ def sample(self, k=None, with_replacement=True, weights=None): Args: ``k`` -- specifies the number of rows (``int``) to be sampled from - self. Default is k is equal to number of rows in self. + the table. Default is k is equal to number of rows in the table. - ``with_replacement`` -- (``boolean``), if true samples ``k`` rows - with replacement from self, else samples ``k`` rows without - replacement. + ``with_replacement`` -- (``bool``) By default, TRUE, Samples ``k`` + rows with replacement from table, else samples ``k`` rows + without replacement. - ``weights``: Array specifying valid probability distribution. - Rows in self are sampled according the the - probability distribution given by ``weights``. Default is + ``weights`` -- Array specifying probability the ith row of the + table is sampled. If None, by default, ``weights`` is the uniform distribution on [1, ... , n], n = number of rows. + ``weights`` must be a valid probability distribution -- i.e. + an array the length of the number of rows, summing to 1. + + Raises: + ValueError -- if ``weights`` is not length equal to number of rows + in the table; or, if ``weights`` does not sum to 1. Returns: - A new instance of ``Table`` with k rows resampled. + A new instance of ``Table`` with ``k`` rows resampled. >>> jobs = Table().with_columns( ... 
'job', make_array('a', 'b', 'c', 'd'), From 08bc2ee659a9bcee002ee7f3c17204c079848d97 Mon Sep 17 00:00:00 2001 From: Maxwell Weinstein Date: Sat, 22 Oct 2016 15:09:36 -0700 Subject: [PATCH 6/7] fixed re: errors --- datascience/tables.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index 3c7c39e37..fac2fd6c5 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -1014,19 +1014,20 @@ def pivot(self, columns, rows, values=None, collect=None, zero=None): ``rows`` -- row labels, as (``str``) or array of strings, used to create new rows based on it's unique values. ``values`` -- column label in table for use in aggregation. + Default None. ``collect`` -- aggregation function, used to group ``values`` - over row-column combinations. + over row-column combinations. Default None. ``zero`` -- zero value for non-existent row-column combinations. Raises: - TypeError -- if collect is passed in and values is not, - and vice versa. + TypeError -- if ``collect`` is passed in and ``values`` is not, + or vice versa. Returns: New pivot table, with row-column combinations, as specified, with aggregated ``values`` by ``collect`` across the intersection of - ``columns`` and ``rows``. Simple counts provided if values/collect - is None, as default. + ``columns`` and ``rows``. Simple counts provided if values and + collect are None, as default. >>> titanic = Table().with_columns('age', make_array(21, 44, 56, 89, 95 ... , 40, 80, 45), 'survival', make_array(0,0,0,1, 1, 1, 0, 1), @@ -1259,10 +1260,10 @@ def sample(self, k=None, with_replacement=True, weights=None): original table. Args: - ``k`` -- specifies the number of rows (``int``) to be sampled from - the table. Default is k is equal to number of rows in the table. + ``k`` -- specifies the number of rows (``int``) to be sampled from + the table. Default is k equal to number of rows in the table. 
- ``with_replacement`` -- (``bool``) By default, TRUE, Samples ``k`` + ``with_replacement`` -- (``bool``) By default TRUE; Samples ``k`` rows with replacement from table, else samples ``k`` rows without replacement. From 6bd1776961bd6059c8c8d2a52ab2fe5be63600ad Mon Sep 17 00:00:00 2001 From: Maxwell Weinstein Date: Sun, 23 Oct 2016 09:01:21 -0700 Subject: [PATCH 7/7] fixed weights --- datascience/tables.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index fac2fd6c5..92214422e 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -1263,15 +1263,15 @@ def sample(self, k=None, with_replacement=True, weights=None): ``k`` -- specifies the number of rows (``int``) to be sampled from the table. Default is k equal to number of rows in the table. - ``with_replacement`` -- (``bool``) By default TRUE; Samples ``k`` + ``with_replacement`` -- (``bool``) If True (default), samples ``k`` rows with replacement from table, else samples ``k`` rows without replacement. ``weights`` -- Array specifying probability the ith row of the - table is sampled. If None, by default, ``weights`` is the - uniform distribution on [1, ... , n], n = number of rows. - ``weights`` must be a valid probability distribution -- i.e. - an array the length of the number of rows, summing to 1. + table is sampled. Defaults to None, which samples each row + with equal probability. ``weights`` must be a valid probability + distribution -- i.e. an array the length of the number of rows, + summing to 1. Raises: ValueError -- if ``weights`` is not length equal to number of rows