In [1]:
# basic packages
import pandas as pd
import numpy as np

# bigquery connection
from google.cloud import bigquery
from google.oauth2 import service_account

In [2]:
# BigQuery connection: load a local service-account key file and build the
# client that the query cells below share.
file = './My First Project-5abe6994fdf1.json'
project_id = 'keen-bucksaw-273300'

credentials = service_account.Credentials.from_service_account_file(file)
client = bigquery.Client(
    project=project_id,
    credentials=credentials,
)

# Plotly mapbox public token (not referenced by the query cells visible here).
mapbox_access_token = "pk.eyJ1IjoicGxvdGx5bWFwYm94IiwiYSI6ImNqdnBvNDMyaTAxYzkzeW5ubWdpZ2VjbmMifQ.TXcBE-xg9BFdV2ocecc_7g"

In [6]:
# Querying downsampled data.
#
# For each calendar year in `years`, pull a random ~10% sample of rows from the
# public Citi Bike trips table and write it to ./data/initial/<year>.csv.
# The sample is drawn with an unseeded rand(), so the exact rows differ on
# every run.

years = [2013, 2014, 2015, 2016, 2017, 2018]

for year in years:
    # Half-open range [Jan 1 of `year`, Jan 1 of the following year).
    # A date-only literal compared against a date-time column is treated as
    # midnight at the start of that day, so the previous
    # "between 'YYYY-01-01' and 'YYYY-12-31'" filter silently dropped almost
    # every trip that started on Dec 31.
    start = "'" + str(year) + "-01-01'"
    end = "'" + str(year + 1) + "-01-01'"

    query = f"""

    SELECT *
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE rand() < 1/10
    AND starttime >= {start} AND starttime < {end}

    """

    # Run the query, wait for completion, and materialise the rows locally.
    query_job = client.query(query)
    results = query_job.result()
    df = results.to_dataframe()

    # One CSV per year; the ./data/initial/ directory must already exist.
    df.to_csv(f'./data/initial/{str(year)}.csv', index = False)

In [3]:
# query for bikerides
#
# Self-joins the public Citi Bike trips table to pair up trips that:
#   * share the same start station and the same end station,
#   * were made on different bikes (a.bikeid <> b.bikeid),
#   * started at most 30 seconds apart, with trip `a` starting strictly after
#     trip `b` (so each pair appears once, not twice),
#   * ended within 30 seconds of each other (in either order).
# Both sides only keep rows with non-null station ids and birth_year.
# Station coordinates are taken from side `a` only; rider birth year and gender
# are returned for both sides.
# NOTE(review): presumably this is meant to find people riding together —
# confirm against the downstream analysis.

query = """
with a as (

    select start_station_id
    , end_station_id
    , start_station_latitude
    , start_station_longitude
    , end_station_latitude
    , end_station_longitude
    , tripduration
    , bikeid
    , starttime
    , stoptime
    , birth_year
    , gender
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    where start_station_id is not null
    and end_station_id is not null
    and birth_year is not null  
    
), b as (

    select start_station_id
    , end_station_id
    , tripduration
    , bikeid
    , starttime
    , stoptime
    , birth_year
    , gender
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    where start_station_id is not null
    and end_station_id is not null
    and birth_year is not null
)

select a.start_station_id
, a.end_station_id
, a.start_station_latitude
, a.start_station_longitude
, a.end_station_latitude
, a.end_station_longitude
, a.starttime as a_start
, b.starttime as b_start
, a.stoptime as a_stop
, b.stoptime as b_stop
, a.tripduration as a_duration
, b.tripduration as b_duration
, a.birth_year as a_yob
, a.gender as a_gender
, b.birth_year as b_yob
, b.gender as b_gender
from a join b on a.start_station_id = b.start_station_id
		and a.end_station_id = b.end_station_id
		and a.bikeid <> b.bikeid
        and a.starttime > b.starttime
		and timestamp_diff(timestamp(a.starttime), timestamp(b.starttime), second) <= 30 
		and abs(timestamp_diff(timestamp(a.stoptime), timestamp(b.stoptime), second)) <= 30  
"""

# Submit the query, block until it finishes, then pull every result row into a
# DataFrame. Note that this runs over the whole table (no date filter).
query_job = client.query(query)
results = query_job.result() 
df = results.to_dataframe()
# Persist the paired trips; overwrites the notebook-level `df` from earlier cells.
df.to_csv('./data/initial/bike_rides.csv', index = False)

In [4]:
# query for commuters
#
# Self-joins the public Citi Bike trips table to pair an outbound trip `a`
# with a return trip `b` such that:
#   * `b` goes the opposite way (a.start = b.end and a.end = b.start) and the
#     two stations differ,
#   * both trips start on the same calendar day (day truncation done in UTC),
#   * `b` starts after `a` starts, and at least 6 whole hours after `a` ends,
#   * birth_year and gender match on both trips.
# Both sides only keep rows with non-null station ids and birth_year and with
# tripduration > 120 (presumably seconds — TODO confirm the column's unit).
# The result labels `a`'s start station "home" and its end station "work".
# NOTE(review): matching birth_year + gender looks like a stand-in for "same
# rider"; the query has no rider id, so unrelated riders can also match.

query = """

with a as(

    select start_station_id
    , start_station_latitude
    , start_station_longitude
    , end_station_id
    , end_station_latitude
    , end_station_longitude
    , starttime
    , stoptime
    , tripduration
    , birth_year
    , gender
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    where start_station_id is not null
    and end_station_id is not null
    and birth_year is not null
    and tripduration > 120
), b as (

    select start_station_id
    , end_station_id
    , starttime
    , stoptime
    , tripduration
    , birth_year
    , gender
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    where start_station_id is not null
    and end_station_id is not null
    and birth_year is not null
    and tripduration > 120
)

select a.start_station_id as home
, a.start_station_latitude
, a.start_station_longitude
, a.end_station_id as work
, a.end_station_latitude
, a.end_station_longitude
, a.starttime as depart_home
, a.stoptime as arrive_work
, a.tripduration as commute_in
, b.starttime as depart_work
, b.stoptime as arrive_home
, b.tripduration as commute_out
from a join b on a.start_station_id = b.end_station_id
and a.end_station_id = b.start_station_id
and timestamp_trunc(timestamp(a.starttime), day, 'UTC') = timestamp_trunc(timestamp(b.starttime), day, 'UTC')
and a.starttime < b.starttime
and a.birth_year = b.birth_year
and a.gender = b.gender
and a.start_station_id <> b.start_station_id
and timestamp_diff(timestamp(b.starttime), timestamp(a.stoptime), hour) >= 6
"""

# Submit the query, block until it finishes, then pull every result row into a
# DataFrame. Note that this runs over the whole table (no date filter).
query_job = client.query(query)
results = query_job.result() 
df = results.to_dataframe()
# Persist the paired trips; overwrites the notebook-level `df` from earlier cells.
df.to_csv('./data/initial/commuters.csv', index = False)