In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Apply with Multiple Arguments

In [None]:
# Read galton.csv, keep the columns at positions 1, 2 and 7, then rename
# column 2 of that three-column selection to 'child'.
height = (
    Table.read_table('galton.csv')
    .select(1, 2, 7)
    .relabeled(2, 'child')
)
height

In [None]:
# apply() with several column labels calls max once per row, passing that
# row's 'father', 'mother' and 'child' values as three separate arguments.
row_maxima = height.apply(max, 'father', 'mother', 'child')
t = height.with_column('tallest', row_maxima)

In [None]:
# Scatter plot with 'father' on the horizontal axis and 'tallest' on the vertical axis.
t.scatter('father', 'tallest')

In [None]:
# Scatter plot with 'child' on the horizontal axis and 'tallest' on the vertical axis.
t.scatter('child', 'tallest')

In [None]:
def abs_diff(x, y):
    """Return the absolute value of ``x - y``.

    The result is the same whichever argument is larger, so the order of
    ``x`` and ``y`` does not matter for ordinary numbers.
    """
    difference = x - y
    return abs(difference)

# Call the two-argument abs_diff on each row's 'father' and 'mother' values.
diffs = height.apply(abs_diff, 'father', 'mother')

# Histogram of the differences, using bin edges 0, 1, ..., 17.
diff_table = Table().with_column('Height difference', diffs)
one_wide_bins = np.arange(0, 18, 1)
diff_table.hist(unit='inch', bins=one_wide_bins)

In [None]:
# Same quantity computed with array arithmetic on whole columns instead of
# a per-row apply().
father_heights = height.column('father')
mother_heights = height.column('mother')
diffs = abs(father_heights - mother_heights)

Table().with_column(
    'Height difference', diffs
).hist(unit='inch', bins=np.arange(0, 18, 1))

## Group

In [None]:
# all_cones keeps every column of cones.csv; cones is a copy without the
# 'Color' column and without the row at index 5.
all_cones = Table.read_table('cones.csv')
cones = (
    all_cones
    .drop('Color')
    .exclude(5)
)
cones

In [None]:
# With no collect function, group reports how many rows share each 'Flavor' value.
cones.group('Flavor')

In [None]:
# Passing sum as the collect function aggregates the other columns within each flavor.
cones.group('Flavor', sum)

In [None]:
# Array of 'Price' values for the rows whose 'Flavor' equals 'chocolate'.
cones.where('Flavor', are.equal_to('chocolate')).column('Price')

In [None]:
# Add up the chocolate prices by hand, without using group.
sum(cones.where('Flavor', are.equal_to('chocolate')).column('Price'))

In [None]:
# Add up the strawberry prices by hand, without using group.
sum(cones.where('Flavor', are.equal_to('strawberry')).column('Price'))

In [None]:
# Same grouping by 'Flavor', this time aggregating each group's values with max.
cones.group('Flavor', max)

In [None]:
# Using list as the collect function gathers each flavor's values into a list
# instead of reducing them to a single number.
cones.group('Flavor', list)

In [None]:
# Read nba_salaries.csv and give the column at position 3 the label 'SALARY'.
nba = (
    Table.read_table('nba_salaries.csv')
    .relabeled(3, 'SALARY')
)
nba

In [None]:
# Keep only the grouping column and the numeric column before summing.
teams_and_money = nba.select('TEAM', 'SALARY')

# Sum 'SALARY' per team, order by column 1 (the summed column) from largest
# to smallest, and draw one horizontal bar per team.
team_totals = teams_and_money.group('TEAM', sum)
team_totals.sort(1, descending=True).barh('TEAM')

In [None]:
# Count of rows for each 'POSITION' value.
nba.group('POSITION')

In [None]:
# Restrict to 'POSITION' and 'SALARY' so np.mean is only ever handed the
# salary values of each position.
positions_and_money = nba.select('POSITION', 'SALARY')
mean_by_position = positions_and_money.group('POSITION', np.mean)
mean_by_position

In [None]:
# Grouping the full table: np.mean is applied to every column other than
# 'POSITION', not just to 'SALARY'.
nba.group('POSITION', np.mean)

## Group by Multiple Columns

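Note: the textbook is out of date on this topic. The cells below group by several columns by passing a list of column labels to `group`.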

In [None]:
# Display the full cones table, which still has the 'Color' column that cones dropped.
all_cones

In [None]:
# Row counts per 'Flavor' value in the full table.
all_cones.group('Flavor')

In [None]:
# A list of labels groups by each distinct ('Flavor', 'Color') combination
# and counts the rows in it.
all_cones.group(['Flavor', 'Color'])

In [None]:
# Same two-column grouping, with sum applied to the remaining columns of each group.
all_cones.group(['Flavor', 'Color'], sum)

In [None]:
# Display nba again as a reminder of its columns before grouping by two of them.
nba

In [None]:
# Without the 'PLAYER' column, take the max of the remaining columns for
# every ('TEAM', 'POSITION') combination.
without_names = nba.drop('PLAYER')
starters = without_names.group(['TEAM', 'POSITION'], max)

# Add those per-position maxima up by team and list the teams from the
# largest total to the smallest (column 1 holds the summed values).
(
    starters
    .drop('POSITION')
    .group('TEAM', sum)
    .sort(1, descending=True)
)

In [None]:
# pivot's first argument supplies the column labels ('Flavor' values) and the
# second the row labels ('Color' values); with no collect, each cell is a count.
all_cones.pivot('Flavor', 'Color')

In [None]:
# Same layout, but each cell now holds the sum of the matching 'Price' values.
all_cones.pivot('Flavor', 'Color', values='Price', collect=sum)

In [None]:
# For comparison with pivot: the group form gives one row per
# ('Flavor', 'Color') combination rather than a grid.
all_cones.group(['Flavor', 'Color'], sum)

In [None]:
# Count of rows for every combination of 'TEAM' (rows) and 'POSITION' (columns).
nba.pivot('POSITION', 'TEAM')

In [None]:
# values and collect passed positionally: each cell holds the largest 'SALARY'
# among the rows for that team and position.
nba.pivot('POSITION', 'TEAM', 'SALARY', max)

In [None]:
# Solution to the take-home question:
# how do you make a table of the highest-paid player for each team and position?
#
# Step 1: tag every row of nba with its own row number in an 'INDEX' column,
# so that a collect function can look the full rows up again.
row_numbers = np.arange(nba.num_rows)
indexed = nba.with_column('INDEX', row_numbers)
def highest_paid(indices):
    """Return the 'PLAYER' value of the top-'SALARY' row among ``indices``.

    ``indices`` are row positions in the module-level ``indexed`` table.
    The selected rows are ordered by 'SALARY' from largest to smallest and
    the 'PLAYER' entry of the first one is returned.
    """
    candidates = indexed.take(indices)
    by_salary = candidates.sort('SALARY', descending=True)
    top_player = by_salary.column('PLAYER').item(0)
    return top_player
# For each team/position cell, pivot hands highest_paid the 'INDEX' values of
# the rows in that cell; the cell then shows the name it returns.
indexed.pivot('POSITION', 'TEAM', 'INDEX', highest_paid)

In [None]:
# The solution above does not assume that the nba table is already sorted.
# Since nba is sorted in decreasing order of salary by team, the following works too
def first(players):
    """Return the entry at position 0 of ``players``.

    ``players`` must provide ``.item`` (as numpy arrays do); the entry comes
    back as a plain Python value rather than a numpy scalar.
    """
    leading_entry = players.item(0)
    return leading_entry
# Collect the 'PLAYER' values of each team/position cell and keep the one at
# position 0; this relies on the row order of the table.
indexed.pivot('POSITION', 'TEAM', 'PLAYER', first)

## Example

In [None]:
full_table = Table.read_table('educ_inc.csv')

# Keep the rows whose 'Year' is '1/1/14 0:00', discard the '00 to 17' age
# group, then drop 'Year', which is constant after the first filter.
ca_2014 = (
    full_table
    .where('Year', are.equal_to('1/1/14 0:00'))
    .where('Age', are.not_equal_to('00 to 17'))
    .drop('Year')
)
ca_2014

In [None]:
# One column per 'Educational Attainment' value and one row per value of the
# column at position 3; each cell sums the matching values of column 4.
totals = ca_2014.pivot(
    'Educational Attainment',
    3,
    values=4,
    collect=sum,
)
totals

In [None]:
# Horizontal bar chart of the columns at positions 0, 1 and 4 of totals;
# column 0 supplies the bar labels and the other two are drawn as bars.
totals.select(0, 1, 4).barh(0)