In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots drawn in this notebook.
plots.style.use('fivethirtyeight')

## Distinct

In [None]:
# Small example menu: each row is one drink offered at one cafe, together with its price.
# 'Espresso' appears at two cafes and 'Gimme' sells two drinks, which the later
# distinct/group/join examples rely on.
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Panda Tea Lounge', 4],
    ['Espresso', 'Gimme',  2],
    ['Latte',    'Gimme',  3],
    ['Espresso', "Cafe Gola",   2]
])
drinks

In [None]:
# Sort by drink name; distinct=True keeps only one row for each distinct drink name.
drinks.sort('Drink', distinct=True)

## Apply

In [None]:
def increase_price_by_percent(price, percent):
    """Return price raised by the given percentage (percent=5 means 5% more)."""
    growth_factor = 1 + (percent / 100)
    return price * growth_factor

def increase_price_by_five_percent(price):
    """Return price raised by 5%; a one-argument wrapper usable with Table.apply."""
    return increase_price_by_percent(price=price, percent=5)

In [None]:
# Call the 5% helper on every value of the 'Price' column and collect the results.
newprice = drinks.apply(increase_price_by_five_percent, 'Price')

In [None]:
# Display a table whose 'Price' column holds the raised prices. The result is only shown,
# not assigned, so the name `drinks` keeps referring to the table built earlier.
drinks.with_column('Price', newprice)

## Group

In [None]:
# Read the cones data from a CSV file given by a relative path.
all_cones = Table.read_table('cones.csv')
#all_cones  # uncomment to inspect the full table before it is trimmed
# Working copy without the 'Color' column and without the row at index 5.
cones = all_cones.drop('Color').exclude(5)
cones

In [None]:
# With no collect function, group counts the rows for each distinct 'Flavor'.
cones.group('Flavor')

In [None]:
# Group by 'Flavor' and combine the values of every other column within a group using sum.
cones.group('Flavor', sum)
#help(cones.group)  # uncomment to read the documentation of group

In [None]:
# Same grouping, but keep the minimum of each remaining column within a flavor.
cones.group('Flavor', min)

In [None]:
# Passing list as the collect function shows all the values gathered for each flavor.
cones.group('Flavor', list)

## Group by multiple columns

In [None]:
# Display the full table, including the 'Color' column that `cones` dropped.
all_cones

In [None]:
# Count the rows for each distinct (Flavor, Color) combination.
all_cones.group(['Flavor', 'Color'])

In [None]:
# For each (Flavor, Color) combination, take the minimum of the remaining columns.
all_cones.group(['Flavor', 'Color'], min)

## Pivot tables

In [None]:
# Cross-tabulate counts: one column per 'Flavor' value, one row per 'Color' value.
all_cones.pivot('Flavor', 'Color')

In [None]:
# Same layout, but each cell holds the minimum 'Price' for that Flavor/Color pair.
all_cones.pivot('Flavor', 'Color', values='Price', collect=min)

In [None]:
# The same minimums in long form (one row per Flavor/Color pair), for comparison with the pivot.
all_cones.group(['Flavor', 'Color'], min)

## Joins

In [None]:
# Rebuild the drinks example table for the join examples; the rows are the same as the
# ones used in the Distinct section at the top of the notebook.
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Panda Tea Lounge', 4],
    ['Espresso', 'Gimme',  2],
    ['Latte',    'Gimme',  3],
    ['Espresso', "Cafe Gola",   2]
])
drinks

In [None]:
# Coupons on offer: each row pairs a percentage off with the cafe ('Location') it applies to.
# 'Gimme' is listed twice, i.e. with two different coupon values.
discounts = Table().with_columns(
    'Coupon % off', make_array(25, 50, 5),
    'Location', make_array('Panda Tea Lounge', 'Gimme', 'Gimme')
)
discounts

**Q:** Create a table with the discounted price of each drink at each cafe that offers discounts, step by step.

Step 1. Join drinks with discounts

In [None]:
# Pair each drink with the coupons of its cafe by matching drinks' 'Cafe' against
# discounts' 'Location'. Cafes that have no coupon do not appear in the result.
t = drinks.join('Cafe', discounts, 'Location')
t

Step 2. Compute discounts and discard unnecessary columns

In [None]:
# Discounted price = original price scaled by (1 - percent off / 100).
# Columns are addressed by label rather than by position in the joined table.
t = t.with_column(
    'Discounted',
    t.column('Price') * (1 - t.column('Coupon % off') / 100)
)
# Only the discounted price is needed from here on.
t = t.drop('Price', 'Coupon % off')

Step 3. Find the cheapest drink at each cafe.

In [None]:
# Order the rows by discounted price, then keep the first row seen for each cafe: because
# the rows are already ordered by price, that first row is the cheapest drink at the cafe.
# Uses the joined table `t` and its 'Discounted' column created in Step 2.
t.sort('Discounted').sort('Cafe', distinct=True) # Correct, Espresso is cheaper

In [None]:
# Not a reliable way to find the cheapest drink: group applies min to every column
# separately, so the 'Drink' shown is the alphabetically first drink at each cafe, which
# need not be the drink that has the minimum discounted price.
t.group('Cafe', min)

## Bikes

From Citi Bike System Data website: https://www.citibikenyc.com/system-data.

In [None]:
# Load the Citi Bike trip data from a CSV file given by a relative path.
citi = Table.read_table('citibike_nyc_201707_250000.csv')
citi

In [None]:
# Keep only the start and end station names (under shorter labels) and add a 'duration'
# column equal to tripduration / 60 (minutes, assuming tripduration is in seconds).
trips = (
    citi.relabeled("start station name", "start")
        .relabeled("end station name", "end")
        .select("start", "end")
        .with_column("duration", citi.column("tripduration") / 60)
)
trips

In [None]:
# Keep only the trips whose duration is below 46.
commute = trips.where('duration', are.below(46))
# Histogram of those durations with 90 bins; the horizontal axis unit is labeled 'minute'.
commute.hist('duration', bins=90, unit='minute')

Where did people pick up bikes?

In [None]:
# Count the trips leaving each start station and list the busiest stations first.
starts = commute.group('start').sort('count', descending=True)
starts

* West St & Chambers St is near Stuyvesant High School, Borough of Manhattan Community College, and another ferry terminal, on the Hudson greenway bike trail
* 12 Ave & W 40 St is a ferry terminal on the Hudson greenway bike trail
* Pershing Square North is across from Grand Central Station

In [None]:
# Cross-tabulate trip counts: one column per start station, one row per end station.
commute.pivot('start', 'end')

In [None]:
# For every (start, end) pair of stations, keep the minimum trip duration.
shortest = commute.group(["start", "end"], min)
shortest

Which stations can you get to fastest from Grand Central Station ('Pershing Square North')?

In [None]:
# Rows whose start name contains 'Pershing Square North' (a substring match), ordered by
# the column at index 2, which is the minimum duration of the grouped table.
from_gc = shortest.where("start", are.containing('Pershing Square North')).sort(2)
from_gc

## Maps

In [None]:
# One row per trip, reduced to the start station's name and coordinates under short labels.
stations = (
    citi.relabeled("start station name", "name")
        .relabeled("start station latitude", "lat")
        .relabeled("start station longitude", "lon")
        .select("name", "lat", "lon")
)
stations

How many trips started at each station?

In [None]:
# One row per distinct (name, lat, lon) combination; the 'count' column added by group is
# the number of rows of `stations` (one per trip start) with that combination.
unique_stations = stations.group(["name", "lat", "lon"])
unique_stations

Map it!

In [None]:
# Draw one map marker per station from its latitude, longitude and name (in that column order).
Marker.map_table(unique_stations.select("lat", "lon", "name"))

In [None]:
# Same stations drawn as circles, all blue and all with radius 10.
Circle.map_table(unique_stations.select("lat", "lon", "name"), color='blue', radius=10)

In [None]:
# Attach per-station drawing attributes: a constant color and a radius that scales with
# the trip count (count / 100, rounded to the nearest whole number).
blue_stations = unique_stations.with_columns(
    "color", np.full(unique_stations.num_rows, "blue"),
    "radius", np.round(unique_stations.column("count")/100))
blue_stations

In [None]:
# Draw the stations again, this time taking color and radius from the table's own columns.
Circle.map_table(blue_stations.select("lat", "lon", "name", "color", "radius"))

How long does it take to get to any other station from Grand Central?

In [None]:
# Recall the table of shortest trips that start at Pershing Square North.
from_gc

In [None]:
# Attach each destination's coordinates and count by matching from_gc's 'end' against
# unique_stations' 'name'.
gc_dest = from_gc.join("end", unique_stations, "name")
gc_dest

In [None]:
# Drawing attributes for the destinations: a constant color and a radius proportional to
# the shortest trip duration (duration / 5), so stations that take longer to reach get
# larger circles.
color_from_gc = gc_dest.with_columns(
    "color", np.full(gc_dest.num_rows, "blue"),
    "radius", gc_dest.column("duration min")/5)
color_from_gc

In [None]:
# Map the destinations, labeling each circle with the end station's name.
Circle.map_table(color_from_gc.select("lat", "lon", "end", "color", "radius"))

About how long do people spend on trips from each station?

In [None]:
# Rebinds `stations`: the same per-trip start-station columns as in the Maps section, plus
# a 'duration' column equal to tripduration / 60.
stations = (
    citi.relabeled("start station name", "name")
        .relabeled("start station latitude", "lat")
        .relabeled("start station longitude", "lon")
        .select("name", "lat", "lon")
        .with_column("duration", citi.column("tripduration") / 60)
)
stations

In [None]:
# Median trip duration for each start station (grouped by name, lat and lon).
stations.group(["name", "lat", "lon"], np.median)

In [None]:
def round_median(duration, interval=5):
    """Return the median of duration rounded to the nearest multiple of interval.

    duration -- a sequence or array of numbers
    interval -- the bin size to round to (default 5)
    """
    midpoint = np.median(duration)
    # np.round works on the number of intervals, so exact halves go to the even count.
    whole_intervals = np.round(midpoint / interval, 0)
    return whole_intervals * interval

In [None]:
# The median is 10, already a multiple of 5.
round_median([10,15,5])

In [None]:
# The median is 11, which rounds down to 10.
round_median([11,15,5])

In [None]:
# The median is 14, which rounds up to 15.
round_median([11,14,16])

In [None]:
# The median is 21, which rounds down to 20.
round_median([21,22,5])

In [None]:
# Rounded median trip duration for each start station, computed with round_median.
duration_by_station = stations.group(["name", "lat", "lon"], round_median)
duration_by_station

In [None]:
# Give the aggregated column a shorter label.
# NOTE(review): this rebinds the same name, so running the cell a second time looks for a
# 'duration round_median' label that no longer exists; re-run the grouping cell first.
duration_by_station = duration_by_station.relabeled("duration round_median", "duration")
duration_by_station

In [None]:
# Count how many stations fall into each rounded-median duration.
duration_by_station.group('duration')

In [None]:
# Assign one color to each rounded duration below 30.
# NOTE(review): exactly five colors are supplied, so this assumes exactly five distinct
# durations remain after the filter; confirm against the data.
colors = (
    duration_by_station.group('duration')
                       .where('duration', are.below(30))
                       .with_column('color', make_array('blue', 'green', 'purple', 'red', 'orange'))
)
colors

In [None]:
# Give every station the color of its duration bin, keeping only the columns the map needs.
# Stations whose duration has no row in `colors` do not appear in the result.
colored = (
    duration_by_station.join('duration', colors)
                       .select('lat', 'lon', 'name', 'color')
)
colored

In [None]:
# Map the stations with markers colored by their rounded median trip duration.
Marker.map_table(colored)

## Booleans

In [None]:
# A comparison is an expression whose value is a boolean (True or False).
x = 3
y = 4
y > x


In [None]:
# != tests for inequality; 10/2 is the float 5.0, which is compared with y by value.
10/2 != y

In [None]:
# Strings are compared character by character, as in dictionary ordering.
'Dog' > 'Cat'


In [None]:
# A chained comparison: equivalent to ('Dog' > 'Catastrophe') and ('Catastrophe' > 'Cat').
'Dog' > 'Catastrophe' > 'Cat'

A. True  
B. False

In [None]:
# The integers 11 through 49 (the end value 50 is excluded).
a = np.arange(11, 50)
a

In [None]:
# Comparing an array with a number compares every element, giving an array of booleans.
a > 30

In [None]:
# Element-wise equality test: True only at the position holding 15.
a == 15

In [None]:
def teen(x):
    """Return True when x lies between 13 and 19, both ends included."""
    return x >= 13 and x <= 19

# One value below the range, one inside it, one above it.
[teen(10), teen(15), teen(20)]

In [None]:
# A one-column table holding the ages 11 through 49 from the array `a`.
ages = Table().with_column('Age', a)
ages

In [None]:
# Add a boolean 'Teenager' column by calling teen on every value of 'Age'.
ages = ages.with_column('Teenager', ages.apply(teen, 'Age'))
ages

## Combining Booleans

In [None]:
# Values used by the and / or / not examples that follow.
x = 3
y = 4

In [None]:
# `and` is True only when both comparisons are True.
x < 4 and y > 4

In [None]:
# `or` is True when at least one of the comparisons is True.
x < 4 or y > 4

In [None]:
# `not` flips the boolean value of the parenthesized expression.
not (x < 4 or y > 4)

In [None]:
# In arithmetic, True behaves like the integer 1.
True + 0

In [None]:
# In arithmetic, False behaves like the integer 0.
False + 0

In [None]:
# Explicit conversion of True to an integer.
int(True)

In [None]:
# Explicit conversion of False to an integer.
int(False)

In [None]:
# Summing booleans adds them up as 0s and 1s.
sum([False, True, True, False, True])

A. 0  
B. 1  
C. 2  
D. 3  
E. 4  

In [None]:
# Recall the table with the 'Age' and 'Teenager' columns.
ages

In [None]:
# The 'Teenager' column as an array of booleans.
ages.column('Teenager')

In [None]:
# Because True counts as 1, the sum is the number of rows flagged as teenagers.
sum(ages.column('Teenager'))

In [None]:
# The same count obtained with numpy: the number of entries that are not zero/False.
np.count_nonzero(ages.column('Teenager'))

## Advanced where for Census

In [None]:
# As of Jan 2017, this census file is online here: 
# http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv

# Read the census file from a relative path.
full_census_table = Table.read_table('nc-est2015-agesex-res.csv')
# Keep the sex and age codes plus the 2010 and 2015 population estimates.
partial = full_census_table.select('SEX', 'AGE', 'POPESTIMATE2010', 'POPESTIMATE2015')
# Columns at index 2 and 3 are the two estimates; give them short year labels.
us_pop = partial.relabeled(2, '2010').relabeled(3, '2015')
us_pop

In [None]:
# Rows whose 'AGE' value equals 70.
us_pop.where('AGE', 70)

In [None]:
# where also accepts a sequence of booleans with one entry per row: rows whose entry is
# True are kept. The three-entry list assumes the AGE 70 selection has exactly three rows.
us_pop.where('AGE', 70).where([False, True, True])

In [None]:
# Build the boolean array directly: True where the 2010 estimate is below 2,000,000.
seventy = us_pop.where('AGE', 70)
seventy.column('2010') < 2000000

In [None]:
# Pass that boolean array to where to keep only the rows for which it is True.
seventy.where(seventy.column('2010') < 2000000)

In [None]:
# Element-wise test: True for rows whose 2015 estimate is more than 1.01 times the 2010 one.
us_pop.column('2015') / us_pop.column('2010') > 1.01

In [None]:
# Keep the rows whose 2015 estimate is more than 1.5 times the 2010 estimate
# (a stricter threshold than the 1.01 used in the previous cell).
us_pop.where(us_pop.column('2015') / us_pop.column('2010') > 1.5)

## Advanced where for bike sharing
Let's use advanced `where` to answer questions about bike sharing.

In [None]:
# Reload the Citi Bike data and rebuild the start/end/duration table, keeping only the
# trips whose duration (tripduration / 60) is below 46.
citi = Table.read_table('citibike_nyc_201707_250000.csv')
trip = (
    citi.relabeled("start station name", "start")
        .relabeled("end station name", "end")
        .select("start", "end")
        .with_column("duration", citi.column("tripduration") / 60)
        .where("duration", are.below(46))
)
trip.show(3)

What was the average duration of all trips?

In [None]:
# Mean of the 'duration' column over every row of `trip`.
np.average(trip.column("duration"))

What was the average duration of trips that started and ended at the same station?

In [None]:
# Compare the 'start' and 'end' columns element-wise to get a boolean array, keep the rows
# where the names are equal, and average their durations.
np.average(trip.where(trip.column('start') == trip.column('end')).column('duration'))

What was the average duration of trips that started and ended at different stations?

In [None]:
# Same idea with != : keep the rows whose start and end names differ, then average.
np.average(trip.where(trip.column('start') != trip.column('end')).column('duration'))