In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Rows

In [2]:
# Create a table with two column labels and no rows yet.
Table(['Even', 'Odd'])

Even,Odd


In [3]:
# Start from the empty two-column table and append the single row [2, 3].
Table(['Even', 'Odd']).with_row([2, 3])

Even,Odd
2,3


In [8]:
# Build a two-row table by chaining with_row calls, one call per row,
# then display it.
t = (
    Table(['Even', 'Odd'])
    .with_row([2, 3])
    .with_row([4, 5])
)
t

Even,Odd
2,3
4,5


In [14]:
# Rebind t to the table returned by with_row. Because t is both the input
# and the output of this cell, re-running it appends [6, 7] again.
t = t.with_row([6, 7])

In [15]:
# Display the current contents of t.
t

Even,Odd
2,3
4,5
6,7


In [16]:
# Append a row whose 'Even' value (4) duplicates an existing one, so that
# filtering on 'Even' can match more than one row.
# Re-running this cell appends [4, 7] again, since t is rebound to the result.
t = t.with_row([4, 7])
t

Even,Odd
2,3
4,5
6,7
4,7


In [17]:
# Keep only the rows whose 'Even' value equals 4, spelling out the
# equality predicate instead of passing the bare value.
t.where('Even', are.equal_to(4))

Even,Odd
4,5
4,7


In [18]:
# Same filter, but the column is given by position: index 0 is 'Even'.
t.where(0, 4)

Even,Odd
4,5
4,7


In [19]:
# Keep rows whose 'Odd' value (the column at index 1) is at most 5.
t.where('Odd', are.below_or_equal_to(5))

Even,Odd
2,3
4,5


In [20]:
# t still has all four rows: the where calls above produced new tables
# and did not filter t itself.
t

Even,Odd
2,3
4,5
6,7
4,7


In [21]:
# Print the built-in documentation for `are`, the class whose methods
# build the predicates passed to where.
help(are)

Help on class are in module datascience.predicates:

class are(builtins.object)
 |  Predicate functions. The class is named "are" for calls to where.
 |  
 |  For example, given a table, predicates can be used to pick rows as follows.
 |  
 |  >>> from datascience import Table
 |  >>> t = Table().with_columns([
 |  ...    'Sizes', ['S', 'M', 'L', 'XL'],
 |  ...    'Waists', [30, 34, 38, 42],
 |  ... ])
 |  >>> t.where('Sizes',  are.equal_to('L'))
 |  Sizes | Waists
 |  L     | 38
 |  >>> t.where('Waists', are.above(38))
 |  Sizes | Waists
 |  XL    | 42
 |  >>> t.where('Waists', are.above_or_equal_to(38))
 |  Sizes | Waists
 |  L     | 38
 |  XL    | 42
 |  >>> t.where('Waists', are.below(38))
 |  Sizes | Waists
 |  S     | 30
 |  M     | 34
 |  >>> t.where('Waists', are.below_or_equal_to(38))
 |  Sizes | Waists
 |  S     | 30
 |  M     | 34
 |  L     | 38
 |  >>> t.where('Waists', are.strictly_between(30, 38))
 |  Sizes | Waists
 |  M     | 34
 |  >>> t.where('Waists', are.between(30,

#### Discussion

In [23]:
# Load the salary table from a CSV file given by a relative path
# (resolved against the notebook's working directory).
nba = Table.read_table('nba_salaries.csv')
# Display up to 20 rows.
nba.show(20)

PLAYER,POSITION,TEAM,2015-2016 SALARY
Paul Millsap,PF,Atlanta Hawks,18.6717
Al Horford,C,Atlanta Hawks,12.0
Tiago Splitter,C,Atlanta Hawks,9.75625
Jeff Teague,PG,Atlanta Hawks,8.0
Kyle Korver,SG,Atlanta Hawks,5.74648
Thabo Sefolosha,SF,Atlanta Hawks,4.0
Mike Scott,PF,Atlanta Hawks,3.33333
Kent Bazemore,SF,Atlanta Hawks,2.0
Dennis Schroder,PG,Atlanta Hawks,1.7634
Tim Hardaway Jr.,SG,Atlanta Hawks,1.30452


In [24]:
# Remove the team column, then shorten the labels of the first and last
# remaining columns (indexes 0 and 2 once 'TEAM' is gone).
# nba is overwritten here, so re-run the read_table cell before re-running
# this one: afterwards the table no longer has a 'TEAM' column.
nba = (
    nba.drop('TEAM')
    .relabeled(0, 'NAME')
    .relabeled(2, 'SALARY')
)
nba

NAME,POSITION,SALARY
Paul Millsap,PF,18.6717
Al Horford,C,12.0
Tiago Splitter,C,9.75625
Jeff Teague,PG,8.0
Kyle Korver,SG,5.74648
Thabo Sefolosha,SF,4.0
Mike Scott,PF,3.33333
Kent Bazemore,SF,2.0
Dennis Schroder,PG,1.7634
Tim Hardaway Jr.,SG,1.30452


Create an array containing the names of all point guards (PG) who make more than $15M/year. (The SALARY column is expressed in millions of dollars.)

In [27]:
# Filter to point guards, then to salaries above 15 (millions), and pull
# out the NAME column (index 0) as an array.
(
    nba.where('POSITION', are.equal_to('PG'))
    .where('SALARY', are.above(15))
    .column('NAME')
)

array(['Derrick Rose', 'Kyrie Irving', 'Chris Paul', 'Russell Westbrook',
       'John Wall'],
      dtype='<U24')

What does this code return?

In [29]:
# Append a made-up row so there is something for the substring match below.
# nba is rebound to the result, so re-running this cell appends the row again.
nba = nba.with_row(['Samosa', 'Mascot', 100])
# Keep rows whose NAME contains the substring 'Samo'.
nba.where('NAME', are.containing('Samo'))

NAME,POSITION,SALARY
Samosa,Mascot,100


## Census

In [30]:
# Load the census estimates CSV (relative path) into a table; each row is
# one SEX/AGE combination with a population figure per column.
full = Table.read_table('NC-EST2014-AGESEX-RES.csv')
full

SEX,AGE,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014
0,0,3944153,3944160,3951330,3963071,3926665,3945610,3948350
0,1,3978070,3978090,3957888,3966510,3978006,3943077,3962123
0,2,4096929,4096939,4090862,3971573,3979952,3992690,3957772
0,3,4119040,4119051,4111920,4102501,3983049,3992425,4005190
0,4,4063170,4063186,4077552,4122303,4112638,3994047,4003448
0,5,4056858,4056872,4064653,4087713,4132210,4123408,4004858
0,6,4066381,4066412,4073013,4074979,4097780,4143094,4134352
0,7,4030579,4030594,4043047,4083240,4084964,4108615,4154000
0,8,4046486,4046497,4025604,4053206,4093213,4095827,4119524
0,9,4148353,4148369,4125415,4035769,4063193,4104133,4106832


In [32]:
# Rows for age 0 under every SEX code (AGE is the column at index 1).
full.where('AGE', are.equal_to(0))

SEX,AGE,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014
0,0,3944153,3944160,3951330,3963071,3926665,3945610,3948350
1,0,2014276,2014278,2018420,2028434,2007693,2015537,2017857
2,0,1929877,1929882,1932910,1934637,1918972,1930073,1930493


In [33]:
# Keep only the identifying columns and the 2010 and 2014 estimates
# (indexes 4 and 8 of `full`), all referenced by label.
partial = full.select(['SEX', 'AGE', 'POPESTIMATE2010', 'POPESTIMATE2014'])
partial

SEX,AGE,POPESTIMATE2010,POPESTIMATE2014
0,0,3951330,3948350
0,1,3957888,3962123
0,2,4090862,3957772
0,3,4111920,4005190
0,4,4077552,4003448
0,5,4064653,4004858
0,6,4073013,4134352
0,7,4043047,4154000
0,8,4025604,4119524
0,9,4125415,4106832


In [34]:
# Shorten the two estimate labels to just their years; `partial` itself
# is left as it was.
simple = (
    partial
    .relabeled('POPESTIMATE2010', '2010')
    .relabeled('POPESTIMATE2014', '2014')
)
simple

SEX,AGE,2010,2014
0,0,3951330,3948350
0,1,3957888,3962123
0,2,4090862,3957772
0,3,4111920,4005190
0,4,4077552,4003448
0,5,4064653,4004858
0,6,4073013,4134352
0,7,4043047,4154000
0,8,4025604,4119524
0,9,4125415,4106832


In [35]:
# Pull the AGE column out as an array. The values run 0..100 and are then
# followed by 999 before the sequence starts over.
partial.column("AGE")

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 999,   0,   1,
         2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  7

What does AGE 999 mean?

In [37]:
# Inspect the rows that carry the special AGE code 999.
partial.where('AGE', are.equal_to(999))

SEX,AGE,POPESTIMATE2010,POPESTIMATE2014
0,999,309347057,318857056
1,999,152089484,156936487
2,999,157257573,161920569


Which age groups have populations that are changing fastest?

In [None]:
# Add two derived columns to `simple`:
#   Change - absolute difference between the 2014 and 2010 estimates
#   Growth - per-year growth rate over the four years from 2010 to 2014,
#            i.e. (2014 / 2010) ** (1/4) - 1
census = simple.with_columns(
    'Change', simple.column('2014') - simple.column('2010'),
    'Growth', (simple.column('2014') / simple.column('2010')) ** (1 / 4) - 1,
)
census

In [None]:
# Attach display formatters to census: Growth as a percentage, and the two
# population columns plus Change (indexes 2, 3 and 4) with NumberFormatter.
census.set_format('Growth', PercentFormatter)
census.set_format(['2010', '2014', 'Change'], NumberFormatter)

How do the age distributions of men and women differ?

In [None]:
# Split census by SEX code (1 -> males, 2 -> females) and drop the rows
# with the special AGE code 999 by keeping only ages below it.
males = census.where('SEX', are.equal_to(1)).where('AGE', are.below(999))
females = census.where('SEX', are.equal_to(2)).where('AGE', are.below(999))

In [None]:
# The five ages with the largest 2014 female population estimates.
females.sort('2014', descending=True).show(5)

In [None]:
# The five ages with the largest 2014 male population estimates.
males.sort('2014', descending=True).show(5)

In [None]:
# Line plot of the 2014 estimates by age, one line per sex.
# The female AGE column is used as the shared x-axis; this assumes the
# males and females tables list ages in the same order — TODO confirm
# (both are filtered from census the same way).
Table().with_columns(
    'Age in 2014', females.column('AGE'),
    'Females', females.column('2014'),
    'Males', males.column('2014'),
).plot('Age in 2014')

In [None]:
# Plot the female-to-male ratio of the 2014 estimates at each age.
# The element-wise division assumes the two tables are aligned row by row
# on AGE — TODO confirm.
Table().with_columns(
    'Age in 2014', females.column('AGE'),
    'Ratio', females.column('2014') / males.column('2014'),
).plot('Age in 2014')

In [None]:
# Female age groups whose estimate rose by more than 350,000 between
# 2010 and 2014, listed in age order; show() with no argument prints all rows.
females.where('Change', are.above(350000)).sort('AGE').show()

In [None]:
# Look at the female rows for ages in the 55-70 range.
# NOTE(review): are.between presumably includes 55 and excludes 70 —
# confirm against the predicates documentation.
females.where('AGE', are.between(55, 70)).show()

In [None]:
# Subtract each age from 2014 element-wise, giving the approximate birth
# years of people aged 67, 66, 65 and 64 in 2014.
2014 - make_array(67, 66, 65, 64)