# BigQuery Public Data - Chicago Taxi Trips

### Authenticating with Google BigQuery service account key file

Before connecting to the Jupyter server, do the following in terminal window:

1. export GOOGLE_APPLICATION_CREDENTIALS="/your/file/path/[FILE_NAME].json"
2. pip install --upgrade google-cloud-bigquery

### Set up BigQuery client & load Google Cloud BigQuery extension

In [1]:
from google.cloud import bigquery
client = bigquery.Client()
%load_ext google.cloud.bigquery

### Preliminary Stuff

#### What tables do the chicago taxi trips dataset have?

In [2]:
data_ref = client.dataset("chicago_taxi_trips", project="bigquery-public-data")
data = client.get_dataset(data_ref)

tables = list(client.list_tables(data))
for table in tables:
    print(table.table_id)

taxi_trips


#### Take a look at the schema

In [3]:
client.get_table(data_ref.table("taxi_trips")).schema

[SchemaField('unique_key', 'STRING', 'REQUIRED', 'Unique identifier for the trip.', ()),
 SchemaField('taxi_id', 'STRING', 'REQUIRED', 'A unique identifier for the taxi.', ()),
 SchemaField('trip_start_timestamp', 'TIMESTAMP', 'NULLABLE', 'When the trip started, rounded to the nearest 15 minutes.', ()),
 SchemaField('trip_end_timestamp', 'TIMESTAMP', 'NULLABLE', 'When the trip ended, rounded to the nearest 15 minutes.', ()),
 SchemaField('trip_seconds', 'INTEGER', 'NULLABLE', 'Time of the trip in seconds.', ()),
 SchemaField('trip_miles', 'FLOAT', 'NULLABLE', 'Distance of the trip in miles.', ()),
 SchemaField('pickup_census_tract', 'INTEGER', 'NULLABLE', 'The Census Tract where the trip began. For privacy, this Census Tract is not shown for some trips.', ()),
 SchemaField('dropoff_census_tract', 'INTEGER', 'NULLABLE', 'The Census Tract where the trip ended. For privacy, this Census Tract is not shown for some trips.', ()),
 SchemaField('pickup_community_area', 'INTEGER', 'NULLABLE', '

#### Sneak peak of the taxi_trips table

In [4]:
table_ref = data_ref.table("taxi_trips")
table = client.get_table(table_ref)
client.list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,51e9ac7992f73fe611a658196e8bdedc309c6a9c,148b56c4e0be7f03cac1f44bfd98d7d7cd18b9935d644a...,2013-04-07 17:00:00+00:00,2013-04-07 17:00:00+00:00,,0.0,,,,,...,0.0,46.86,Credit Card,Chicago Elite Cab Corp.,,,,,,
1,cc1557aa4739cdd8fd2a835b6a5c721bb55387c0,b2b9295effcd71dadf837c7801fd3b8a285f5bdefffb18...,2013-04-07 17:00:00+00:00,2013-04-07 16:45:00+00:00,,0.0,,,,,...,0.0,44.22,Credit Card,Chicago Elite Cab Corp.,,,,,,
2,0777b61b1d52643224b21afb9bac3d6f0dbc26fd,92c7e4860654a8a87e459b0471572be4266569e48af141...,2013-04-07 14:45:00+00:00,2013-04-07 14:30:00+00:00,,0.0,,,,,...,0.0,11.34,Credit Card,Chicago Elite Cab Corp.,,,,,,
3,a0d5f5a0306f3ee3f68e18a88eefff8add909a3b,1ab1d8e0fd730376b1bfaf30fadc2901ba10f5ea75a225...,2013-04-20 18:45:00+00:00,2013-04-20 18:45:00+00:00,,0.0,,,,,...,0.0,10.5,Credit Card,Chicago Elite Cab Corp.,,,,,,
4,02fe736a5c738a58a88fc2fa7ec10ad3f7f979c0,bda6f18f96bd0d6b78f228e56d927386d5b2f5646088d3...,2013-03-30 02:15:00+00:00,2013-03-30 02:15:00+00:00,,0.0,,,,,...,0.0,20.09,Credit Card,Chicago Elite Cab Corp.,,,,,,


### BigQuery SQL

#### Average daily trips, calculated over a window including preceding 15 days and following 15 days (if fit) from 1/1/2016 to 12/31/2017

In [7]:
query = """
        WITH daily_trips AS
        (
            SELECT
                DATE(trip_start_timestamp) AS trip_date,
                COUNT(*) AS num_trips
            FROM
                `bigquery-public-data.chicago_taxi_trips.taxi_trips`
            WHERE
                trip_start_timestamp >= '2016-01-01'
                AND trip_start_timestamp <= '2018-01-01'
            GROUP BY
                trip_date
            ORDER BY
                trip_date
        )
        SELECT
            trip_date,
            AVG(num_trips) OVER(
                               ORDER BY trip_date
                               ROWS BETWEEN 15 PRECEDING AND 15 FOLLOWING
                               ) AS avg_num_trips
            
        FROM
            daily_trips
        """

avg_daily_trips = client.query(query).result().to_dataframe()
avg_daily_trips.head()

Unnamed: 0,trip_date,avg_num_trips
0,2016-01-01,80461.9375
1,2016-01-02,80150.647059
2,2016-01-03,79419.611111
3,2016-01-04,79810.421053
4,2016-01-05,80293.9


#### Separate and order trips by community area
Obtain a table showing pick up area, trip start time, trip end time, and trip sequence on May 4th, 2017.

In [4]:
trip_query = """
             SELECT
                 pickup_community_area AS pick_up_area,
                 trip_start_timestamp AS trip_start_time,
                 trip_end_timestamp AS trip_end_timestamp,
                 RANK() OVER(
                            PARTITION BY pickup_community_area
                            ORDER BY trip_start_timestamp
                            ) AS trip_sequence
             FROM
                 `bigquery-public-data.chicago_taxi_trips.taxi_trips`
             WHERE
                 DATE(trip_start_timestamp) = '2017-05-04'
             """

trips = client.query(trip_query).result().to_dataframe()
trips.head()

Unnamed: 0,pick_up_area,trip_start_time,trip_end_timestamp,trip_sequence
0,27.0,2017-05-04 00:15:00+00:00,2017-05-04 00:30:00+00:00,1
1,27.0,2017-05-04 00:30:00+00:00,2017-05-04 00:30:00+00:00,2
2,27.0,2017-05-04 00:30:00+00:00,2017-05-04 00:30:00+00:00,2
3,27.0,2017-05-04 00:30:00+00:00,2017-05-04 00:30:00+00:00,2
4,27.0,2017-05-04 03:45:00+00:00,2017-05-04 04:15:00+00:00,5


#### How much time elapses between trips?
Obtain a table showing taxi id, trip start/end time, and (break) time **in between** trips on May 4th, 2017.

In [8]:
break_time_query = """
                   SELECT
                       taxi_id,
                       trip_start_timestamp AS start_time,
                       trip_end_timestamp AS end_time,
                       TIMESTAMP_DIFF(
                                     trip_start_timestamp, 
                                     LAG(trip_end_timestamp, 1) 
                                     OVER (PARTITION BY taxi_id ORDER BY trip_start_timestamp),
                                     MINUTE) AS break_time
                    FROM
                       `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                    WHERE
                       Date(trip_start_timestamp) = '2017-05-04'
                   """

break_time = client.query(break_time_query).result().to_dataframe()
break_time.head()

Unnamed: 0,taxi_id,start_time,end_time,break_time
0,0bc9da7719b9f0e78f651c60a77dcaf1129986cf224ba9...,2017-05-04 15:30:00+00:00,2017-05-04 16:45:00+00:00,
1,0bc9da7719b9f0e78f651c60a77dcaf1129986cf224ba9...,2017-05-04 17:00:00+00:00,2017-05-04 17:00:00+00:00,15.0
2,0bc9da7719b9f0e78f651c60a77dcaf1129986cf224ba9...,2017-05-04 17:15:00+00:00,2017-05-04 17:30:00+00:00,15.0
3,0bc9da7719b9f0e78f651c60a77dcaf1129986cf224ba9...,2017-05-04 17:45:00+00:00,2017-05-04 17:45:00+00:00,15.0
4,0bc9da7719b9f0e78f651c60a77dcaf1129986cf224ba9...,2017-05-04 18:00:00+00:00,2017-05-04 18:15:00+00:00,15.0


# End of Session