In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Configure for presentation
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
mpl.rc('font', size=16)

## Table methods

In [None]:
# Menu table: one row per drink sold at a cafe, together with its price.
menu_rows = [
    ['Milk Tea', 'Panda Tea Lounge', 4],
    ['Espresso', 'Gimme', 2],
    ['Coffee', 'Gimme', 3],
    ['Espresso', 'Cafe Gola', 2],
]
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows(menu_rows)
drinks

In [None]:
# Ascending sort on price; distinct=True omits rows whose price repeats one already listed.
drinks.sort('Price', descending=False, distinct=True)

In [None]:
# Coupon table: each percentage-off value is paired with the cafe it applies to.
coupon_percents = make_array(25, 50, 5)
coupon_cafes = make_array('Panda Tea Lounge', 'Gimme', 'Gimme')
discounts = Table().with_columns(
    'Coupon % off', coupon_percents,
    'Location', coupon_cafes,
)
discounts

**Q:** Create a table with the discounted price of each drink at each cafe that offers discounts, step by step.

Step 1. Join drinks with discounts

In [None]:
# Match each drink to the coupons for its cafe: 'Cafe' in drinks is
# compared against 'Location' in discounts.
a = drinks.join('Cafe', discounts, other_label='Location')
a

Step 2. Compute the discounted prices and discard unnecessary columns

In [None]:
# Apply each coupon to the menu price, then keep only the columns still needed.
# Columns are looked up by label instead of by position so the arithmetic does
# not depend on the column order that the join happens to produce.
menu_price = a.column('Price')
fraction_off = a.column('Coupon % off') / 100
a = a.with_column('Discounted Price', menu_price * (1 - fraction_off))
a = a.drop('Price', 'Coupon % off')
a

Step 3. Find the cheapest drink at each cafe.

In [None]:
# Put the cheapest rows first, then keep a single row per cafe.
# This gives the correct answer: Espresso is the cheaper drink.
cheapest_first = a.sort('Discounted Price')
cheapest_first.sort('Cafe', distinct=True)

In [None]:
# Incorrect answer, shown on purpose: `min` is taken over each column on its
# own, so the drink reported for Gimme is Coffee (first alphabetically) even
# though Coffee is *not* the cheapest drink at Gimme.
a.group('Cafe', min)

## Advanced Where

In [None]:
# A comparison evaluates to a boolean; this one is True.
3 > 2

In [None]:
# This comparison does not hold, so it evaluates to False.
1 > 2

In [None]:
# `and` is True only when both sides are True; both comparisons hold here, so the result is True.
3 > 2 and 2 > 1

In [None]:
# The right-hand comparison is False, so the whole `and` expression is False.
3 > 2 and 1 > 2

In [None]:
# `or` needs only one side to be True; the left-hand comparison holds, so the result is True.
3 > 2 or 1 > 2

In [None]:
# `not` applies to the whole comparison, i.e. not (3 > 2), which is False.
not 3 > 2

In [None]:
# Comparing an array with a number compares every element:
# 0, 1, 2, 3, 4 -> False, False, False, True, True.
np.arange(5) > 2

In [None]:
# True converts to the integer 1.
int(True)

In [None]:
# False converts to the integer 0.
int(False)

In [None]:
# Summing booleans counts the True values: two elements of 0..4 exceed 2, so this is 2.
sum(np.arange(5) > 2)

## Advanced where for Census

In [None]:
# Data source (online as of Jan 2017):
# http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv
full_census_table = Table.read_table('nc-est2015-agesex-res.csv')

# Keep only sex, age and the two population estimates being compared.
partial = full_census_table.select(
    'SEX', 'AGE', 'POPESTIMATE2010', 'POPESTIMATE2015'
)

# Give the two estimate columns short labels.
us_pop = (
    partial
    .relabeled('POPESTIMATE2010', '2010')
    .relabeled('POPESTIMATE2015', '2015')
)
us_pop

In [None]:
# Rows whose AGE equals 70 (a bare value is shorthand for are.equal_to).
us_pop.where('AGE', are.equal_to(70))

In [None]:
# `where` also accepts one boolean per row: True keeps the row, False drops it.
# NOTE(review): the three-entry mask assumes exactly three age-70 rows — confirm.
age_70_rows = us_pop.where('AGE', are.equal_to(70))
age_70_rows.where([False, True, True])

In [None]:
# Age-70 rows, followed by one boolean per row telling whether the 2010
# estimate is below two million.
seventy = us_pop.where('AGE', are.equal_to(70))
under_two_million = seventy.column('2010') < 2000000
under_two_million

In [None]:
# Use the boolean array as a row filter: keep rows with a 2010 estimate below two million.
under_two_million = seventy.column('2010') < 2000000
seventy.where(under_two_million)

In [None]:
# Ratio of the 2015 estimate to the 2010 estimate, row by row, compared with 1.5.
growth_ratio = us_pop.column('2015') / us_pop.column('2010')
growth_ratio > 1.5

In [None]:
# Keep the rows where the 2015 estimate is more than 1.5 times the 2010 estimate.
growth_ratio = us_pop.column('2015') / us_pop.column('2010')
us_pop.where(growth_ratio > 1.5)

## Advanced where for bike sharing
Let's use advanced `where` to answer questions about bike sharing.

In [None]:
citi = Table.read_table('citibike_nyc_201707_500000.csv')

# 'tripduration' divided by 60 — presumably seconds converted to minutes; verify against the data.
duration_values = citi.column("tripduration") / 60

# Shorten the station labels, keep just the two stations, attach the duration,
# and retain only trips whose duration is below 46.
trip = (
    citi.relabeled("start station name", "start")
        .relabeled("end station name", "end")
        .select("start", "end")
        .with_column("duration", duration_values)
        .where("duration", are.below(46))
)
trip.show(3)

What was the average duration of all trips?

In [None]:
# Mean of the duration column over every trip in the table.
all_durations = trip.column('duration')
np.average(all_durations)

What was the average duration of trips that started and ended at the same station?

In [None]:
# True for trips whose start and end station names are equal.
same_station = trip.column('start') == trip.column('end')
np.average(trip.where(same_station).column('duration'))

What was the average duration of trips that started and ended at different stations?

In [None]:
# True for trips whose start and end station names differ.
different_station = trip.column('start') != trip.column('end')
np.average(trip.where(different_station).column('duration'))

## Example Prelim Question

Suppose you are given the following table of data:

In [None]:
# Preview the first three rows of the trip table.
trip.show(max_rows=3)

What is the name of the station where the most rentals ended? (Assume no ties.)

In [None]:
# Count trips per end station, order the counts largest-first, and read the
# station name from the top row.
rentals_per_end = trip.group('end')
busiest_first = rentals_per_end.sort('count', descending=True)
busiest_first.column('end').item(0)

For how many stations was the average duration of a trip ending at that station at least 10 minutes?

In [None]:
# Average every column per end station, then count the stations whose average
# duration is at least 10. The question says "at least", so the boundary value
# 10 itself must be included: are.above_or_equal_to rather than are.above.
average_by_end = trip.group('end', np.average)
average_by_end.where('duration average', are.above_or_equal_to(10)).num_rows