New York City Taxi Data with Time Series
========================================

We read NYC Taxi data from Google Cloud Storage using the parquet format.

### Connect to a cluster on Google Container Engine

In [None]:
# Client connects this notebook to a Dask scheduler; progress is used by
# later cells to display the status of persisted collections.
from dask.distributed import Client, progress
# NOTE(review): no scheduler address is passed here, so the target cluster
# must come from Dask configuration/environment (otherwise a local cluster
# is presumably started) — confirm this reaches the intended cluster.
c = Client()
c  # Bare expression so the notebook renders the client summary.

### Load Parquet Data

In [None]:
# gcsfs is not referenced by name below; it is imported so that the
# 'gcs://' URL passed to read_parquet can be resolved.
# NOTE(review): newer fsspec-based stacks may not need the explicit import.
import gcsfs
import dask.dataframe as dd

# Build the lazy dataframe from the public parquet dataset and immediately
# ask the cluster to load and keep it, so later cells reuse the same data
# instead of re-reading it from storage.
df = dd.read_parquet(
    'gcs://anaconda-public-data/nyc-taxi/nyc.parquet'
).persist()

# persist() returns right away; show a progress bar while loading runs.
progress(df)

In [None]:
# Sum the passenger_count column over the whole dataframe and time the call.
%time df.passenger_count.sum().compute()

In [None]:
# Count the rows for each distinct passenger_count value, then order the
# computed result by passenger count.
%time df.groupby(df.passenger_count).size().compute().sort_index()

In [None]:
# Fetch the first few rows of the dataframe and time the round trip.
%time df.head()  # Fast roundtrip access. Faster than video frame-rate

In [None]:
# Label-based selection of the rows for 2015-05-05, then the first few of them.
# NOTE(review): presumably relies on a sorted datetime index with known
# divisions so only the matching partition(s) are touched — confirm.
%time df.loc['2015-05-05'].head()  # Fast random access based on time

### Datetime operations

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Daily time series of the passenger count: bucket rides into one-day bins
# on the datetime index, take the mean passenger count per bin, bring the
# (small) result back as a pandas Series and plot it with matplotlib.
# The daily mean is used rather than the variance, which would not match
# the plot title. The trailing semicolon suppresses the Axes repr.
(df.passenger_count
   .resample('1d')
   .mean()
   .compute()
   .plot(title='Passenger Rides Resampled by Day', figsize=(10, 4)));

### Tip Fraction, grouped by hour-of-day

In [None]:
# Keep only rides with a positive tip and a positive fare; the fare filter
# also guarantees the division below never divides by zero.
df2 = df[(df.tip_amount > 0) & (df.fare_amount > 0)]
df2 = df2.assign(tip_fraction=df2.tip_amount / df2.fare_amount)

# Mean tip fraction for each hour of the day, keyed by the hour taken from
# the dataframe's datetime index. The result is persisted on the cluster so
# the plotting cell can reuse it.
hour = (
    df2.groupby(df2.index.dt.hour)
       .tip_fraction
       .mean()
       .persist()
)

progress(hour)

### Plot results

In [None]:
from bokeh.plotting import figure, output_notebook, show
output_notebook()  # Send Bokeh output to the notebook instead of a file.

# Materialize the per-hour result once so that the x and y values come from
# the same pandas Series, rather than issuing two separate computations.
tip_by_hour = hour.compute()

fig = figure(title='Tip Fraction',
             x_axis_label='Hour of day',
             y_axis_label='Tip Fraction',
             height=300)
fig.line(x=tip_by_hour.index, y=tip_by_hour, line_width=3)
# Anchor the y axis at zero so the tip fraction is not visually exaggerated.
fig.y_range.start = 0

show(fig)