In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Configure for presentation
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
mpl.rc('font', size=16)

## Table methods

In [None]:
# Small example menu: one row per (drink, cafe) pair with its price.
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows(
    [
        ['Milk Tea', 'Tea One', 4],
        ['Espresso', 'Nefeli', 2],
        ['Coffee', 'Nefeli', 3],
        ['Espresso', "Abe's", 2],
    ]
)
drinks

In [None]:
# Coupons available at each location; a location may appear more than once.
discounts = Table().with_columns(
    'Coupon % off',
    make_array(5, 50, 25),
    'Location',
    make_array('Tea One', 'Nefeli', 'Tea One'),
)
discounts

In [None]:
# Pair every drink with every coupon valid at its cafe, compute the price
# after the coupon, and keep only the discounted price.
a = drinks.join('Cafe', discounts, 'Location')
a = a.with_column(
    'Discounted Price',
    a.column('Price') * (1 - a.column('Coupon % off') / 100),
).drop('Price', 'Coupon % off')
a

In [None]:
# Order by discounted price, then keep a single row per cafe.
# Correct, Espresso is cheaper
(a.sort('Discounted Price')
  .sort('Cafe', distinct=True))

In [None]:
# min is applied to each column separately, so the drink name and the price
# in a result row need not come from the same original row.
# Incorrect, Coffee is first alphabetically
a.group('Cafe', collect=min)

## Spring 2016 Midterm, Question 2(b)

In [None]:
# Load the trips, keep those whose 'Duration' is below 1800, and reduce the
# table to the columns at positions 3, 6 and 1, renaming the first two.
trip = (
    Table.read_table('trip.csv')
    .where('Duration', are.below(1800))
    .select(3, 6, 1)
    .relabeled(0, 'Start')
    .relabeled(1, 'End')
)
trip.show(3)

In [None]:
# The name of the station where the most rentals ended (assume no ties).
# Count trips per end station, put the largest count first, read its name.
(trip.group('End')
     .sort('count', descending=True)
     .column('End')
     .item(0))

In [None]:
# The number of stations for which the average duration ending at that station
# was more than 10 minutes (600 seconds).
# After grouping by 'End', column 2 is the averaged 'Duration' column.
(trip.group('End', np.average)
     .where(2, are.above(10*60))
     .num_rows)

## Advanced Where

In [None]:
# A comparison is an expression whose value is a boolean; this one is True.
3 > 2

In [None]:
# The same operator with a smaller left operand evaluates to False.
1 > 2

In [None]:
# Comparing an array with a number compares element by element and yields
# an array of booleans, one per element.
np.arange(5) > 2

In [None]:
# As of Jan 2017, this census file is online here:
# http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv

# Keep sex, age and the 2010/2015 population estimates, with short year labels.
full_census_table = Table.read_table('nc-est2015-agesex-res.csv')
partial = full_census_table.select(
    'SEX',
    'AGE',
    'POPESTIMATE2010',
    'POPESTIMATE2015',
)
us_pop = (
    partial
    .relabeled('POPESTIMATE2010', '2010')
    .relabeled('POPESTIMATE2015', '2015')
)
us_pop

In [None]:
# Rows whose AGE equals 70 (a bare value is shorthand for are.equal_to).
us_pop.where('AGE', are.equal_to(70))

In [None]:
# where also accepts a sequence of booleans, one per row: rows paired with
# True are kept. Here the first of the AGE-70 rows is dropped.
(us_pop.where('AGE', 70)
       .where([False, True, True]))

In [None]:
# Boolean array marking which AGE-70 rows had a 2010 estimate under 2,000,000.
seventy = us_pop.where('AGE', are.equal_to(70))
seventy.column('2010') < 2 * 10**6

In [None]:
# Pass the boolean array straight to where: only rows whose entry is True
# (2010 estimate below 2,000,000) are kept.
seventy.where(seventy.column('2010') < 2000000)

In [None]:
# Element-wise ratio of the 2015 to the 2010 estimate, compared with 1.5:
# True where the 2015 value is more than 1.5 times the 2010 value.
us_pop.column('2015') / us_pop.column('2010') > 1.5

In [None]:
# Keep only the rows whose 2015 estimate is more than 1.5 times the 2010 one.
us_pop.where(us_pop.column('2015') / us_pop.column('2010') > 1.5)

In [None]:
# Re-display the first three rows of trip as a reminder of its columns.
trip.show(3)

In [None]:
# The average duration of all trips
trip.column('Duration').mean()

In [None]:
# The average duration of trips that started and ended at the same station
# (the element-wise == of the two station columns selects those rows)
np.average(
    trip.where(trip.column('Start') == trip.column('End'))
        .column('Duration')
)

In [None]:
# The average duration of trips that started and ended at different stations
# (the element-wise != of the two station columns selects those rows)
np.average(
    trip.where(trip.column('Start') != trip.column('End'))
        .column('Duration')
)

## Census & ZIP Codes

In [None]:
# Read the gzipped GeoJSON file into a Map and display it.
# NOTE(review): presumably one region per California ZIP code, judging by the
# file name -- confirm against the data file.
zips = Map.read_geojson('ca_zips.geojson.gz')
zips

In [None]:
# Turn the map's features into a table so they can be filtered and joined
# like any other data; later cells use its 'ZIP' and 'feature' columns.
zips_table = Table.from_records(zips.features)
zips_table.show(3)

In [None]:
# Keep the ZIP codes in the 947xx range. The bounds are strings, so the
# comparison is on text; are.between includes the lower bound and excludes
# the upper one.
# NOTE(review): assumes the 'ZIP' column holds 5-character strings -- confirm.
berkeley = zips_table.where('ZIP', are.between('94700', '94799'))
berkeley.show(3)

In [None]:
# Draw a map containing only the selected features.
Map(berkeley.column('feature'))

In [None]:
# Read the income data with ZIP kept as text, then total every remaining
# column per ZIP code.
income_raw = Table.read_table('ca_income_by_zip.csv', dtype={'ZIP': str})
income_by_zipcode = (
    income_raw
    .drop('STATEFIPS', 'STATE', 'agi_stub')
    .group('ZIP', sum)
)

# Keep the three totals of interest under readable names and drop the
# '99999' row.
# NOTE(review): '99999' is presumably a catch-all code, not a real ZIP -- confirm.
income = (
    income_by_zipcode
    .select('ZIP', 'N02650 sum', 'A02650 sum', 'SCHF sum')
    .relabeled('N02650 sum', 'returns')
    .relabeled('A02650 sum', 'total income')
    .relabeled('SCHF sum', 'farmers')
    .where('ZIP', are.not_equal_to('99999'))
)

# set_format changes how the column is displayed on `income` itself.
income.set_format('total income', NumberFormatter(0))
income.show(5)

In [None]:
# ZIP codes ordered from the largest total income to the smallest.
income.sort('total income', descending=True)

In [None]:
# Map the single ZIP code 90210.
# NOTE(review): column(1) is presumably the 'feature' column used by the other
# map cells; using the label would be clearer -- confirm the column order.
Map(zips_table.where('ZIP', '90210').column(1))

In [None]:
# Attach the map feature to each ZIP's income row (ZIPs missing from either
# table drop out of the join), largest total income first.
(income.join('ZIP', zips_table)
       .sort('total income', descending=True))

In [None]:
# Map all of the ZIP codes that have more than $1B returns.
# NOTE(review): the 1e6 threshold equals $1B only if 'total income' is
# recorded in thousands of dollars -- confirm against the data documentation.
Map(
    income.where('total income', are.above(1e6))
          .join('ZIP', zips_table)
          .column('feature')
)

In [None]:
# Map all of the ZIP codes that have more than 5% farmers
# The comparison is strict so that it matches "more than": a ZIP at exactly
# 5%, or one with zero farmers and zero returns, is not selected.
f = income.where(income.column('farmers') > 0.05 * income.column('returns'))
Map(f.join('ZIP', zips_table).column('feature'))