Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 92 additions & 22 deletions datascience/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ def column(self, index_or_label):
... 'letter', make_array('c', 'd'),
... 'count', make_array(2, 4),
... )

>>> tiles.column('letter')
array(['c', 'd'],
dtype='<U1')
Expand Down Expand Up @@ -1003,15 +1004,71 @@ def groups(self, labels, collect=None):
return grouped

def pivot(self, columns, rows, values=None, collect=None, zero=None):
"""Generate a table with a column for rows (or a column for each row
in rows list) and a column for each unique value in columns. Each row
counts/aggregates the values that match both row and column.

columns -- column label in self
rows -- column label or a list of column labels
values -- column label in self (or None to produce counts)
collect -- aggregation function over values
zero -- zero value for non-existent row-column combinations
"""Generate a table with a column for each unique value in ``columns``,
with rows for each unique value in ``rows``. Each row counts/aggregates
the values that match both row and column based on ``collect``.

Args:
``columns`` -- a single column label, (``str``), in table, used to
create new columns, based on its unique values.
``rows`` -- row labels, as (``str``) or array of strings, used to
create new rows based on its unique values.
``values`` -- column label in table for use in aggregation.
Default None.
``collect`` -- aggregation function, used to group ``values``
over row-column combinations. Default None.
``zero`` -- zero value for non-existent row-column combinations.

Raises:
TypeError -- if ``collect`` is passed in and ``values`` is not,
or vice versa.

Returns:
New pivot table, with row-column combinations, as specified, with
aggregated ``values`` by ``collect`` across the intersection of
``columns`` and ``rows``. Simple counts provided if values and
collect are None, as default.

>>> titanic = Table().with_columns('age', make_array(21, 44, 56, 89, 95
... , 40, 80, 45), 'survival', make_array(0,0,0,1, 1, 1, 0, 1),
... 'gender', make_array('M', 'M', 'M', 'M', 'F', 'F', 'F', 'F'),
... 'prediction', make_array(0, 0, 1, 1, 0, 1, 0, 1))
>>> titanic
age | survival | gender | prediction
21 | 0 | M | 0
44 | 0 | M | 0
56 | 0 | M | 1
89 | 1 | M | 1
95 | 1 | F | 0
40 | 1 | F | 1
80 | 0 | F | 0
45 | 1 | F | 1
>>> titanic.pivot('survival', 'gender')
gender | 0 | 1
F | 1 | 3
M | 3 | 1
>>> titanic.pivot('prediction', 'gender')
gender | 0 | 1
F | 2 | 2
M | 2 | 2
>>> titanic.pivot('survival', 'gender', values='age', collect = np.mean)
gender | 0 | 1
F | 80 | 60
M | 40.3333 | 89
>>> titanic.pivot('survival', make_array('prediction', 'gender'))
prediction | gender | 0 | 1
0 | F | 1 | 1
0 | M | 2 | 0
1 | F | 0 | 2
1 | M | 1 | 1
>>> titanic.pivot('survival', 'gender', values = 'age')
Traceback (most recent call last):
...
TypeError: values requires collect to be specified
>>> titanic.pivot('survival', 'gender', collect = np.mean)
Traceback (most recent call last):
...
TypeError: collect requires values to be specified
"""
if collect is not None and values is None:
raise TypeError('collect requires values to be specified')
Expand Down Expand Up @@ -1202,23 +1259,26 @@ def sample(self, k=None, with_replacement=True, weights=None):
"""Returns a new table where k rows are randomly sampled from the
original table.

Kwargs:
k (int or None): If None (default), all the rows in the table are
sampled. If an integer, k rows from the original table are
sampled.
Args:
``k`` -- specifies the number of rows (``int``) to be sampled from
the table. Default is k equal to number of rows in the table.

``with_replacement`` -- (``bool``) If True (the default), samples
``k`` rows with replacement from the table; if False, samples
``k`` rows without replacement.

with_replacement (bool): If True (default), samples the rows with
replacement. If False, samples the rows without replacement.
``weights`` -- Array specifying probability the ith row of the
table is sampled. Defaults to None, which samples each row
with equal probability. ``weights`` must be a valid probability
distribution -- i.e. an array the length of the number of rows,
summing to 1.

weights (list/array or None): If None (default), samples the rows
using a uniform random distribution. If a list/array is passed
in, it must be the same length as the number of rows in the
table and the values must sum to 1. The rows will then be
sampled according to the probability distribution in
``weights``.
Raises:
ValueError -- if the length of ``weights`` does not equal the number
of rows in the table, or if ``weights`` does not sum to 1.

Returns:
A new instance of ``Table``.
A new instance of ``Table`` with ``k`` rows resampled.

>>> jobs = Table().with_columns(
... 'job', make_array('a', 'b', 'c', 'd'),
Expand Down Expand Up @@ -1250,6 +1310,16 @@ def sample(self, k=None, with_replacement=True, weights=None):
job | wage
a | 10
a | 10
>>> jobs.sample(k = 2, weights = make_array(1, 0, 1, 0))
Traceback (most recent call last):
...
ValueError: probabilities do not sum to 1

# Weights must be length of table.
>>> jobs.sample(k = 2, weights = make_array(1, 0, 0))
Traceback (most recent call last):
...
ValueError: a and p must have same size
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a Raises section to the docstring to document these ValueErrors?

"""
n = self.num_rows
if k is None:
Expand Down