In [9]:
# dataviz
import holoviews as hv
from colorcet import fire
from holoviews import opts
from holoviews.element.tiles import EsriImagery
from holoviews.operation.datashader import datashade

# config holoviews
hv.extension('bokeh')
pw = 450
ph = 450
opts.defaults(opts.Scatter(width=pw, height=ph, color='blue'),
              opts.RGB(width=pw, height=ph),
              opts.Curve(width=pw))

In [1]:
import numpy as np
# taxi data
def latlng_to_meters(table, 
                     lat_pickup='pickup_latitude', lng_pickup='pickup_longitude', 
                     lat_dropoff='dropoff_latitude', lng_dropoff='dropoff_longitude'):
    # tag default shift
    origin_shift = 2 * np.pi * 6378137 / 2.0
    # remember columns we're not doing anything with 
    base_columns = '''VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, 
                      trip_distance, RateCodeID, store_and_fwd_flag, payment_type, fare_amount, 
                      extra, mta_tax,  tip_amount, tolls_amount, improvement_surcharge, total_amount'''
    # convert lat/lng coords within focused area to meters and return w/ base columns as pickup/dropoff x/y 
    # origin_shift = 2 * np.pi * 6378137 / 2.0
    # mx = lng * origin_shift / 180.0
    # my = np.log(np.tan((90 + lat) * np.pi / 360.0)) / (np.pi / 180.0)
    # my = my * origin_shift / 180.0
    latlng_to_meters = f'''
                       select
                           {lng_pickup} * {origin_shift} / 180.0 AS pickup_x,
                           {lng_dropoff} * {origin_shift} / 180.0 AS dropoff_x,
                           
                           (90 + {lat_pickup}) * {np.pi} / 360.0 AS pickup_y,
                           (90 + {lat_dropoff}) * {np.pi} / 360.0 AS dropoff_y
                       from 
                           {table}
                           where
                                   {lng_pickup} < -73.75
                               and {lng_pickup} > -74.15

                               and {lng_dropoff} < -73.75
                               and {lng_dropoff} > -74.15

                               and {lat_pickup} > 40.68
                               and {lat_pickup} < 40.84

                               and {lat_dropoff} > 40.68
                               and {lat_dropoff} < 40.84
                               '''  # bottom half focuses coords, top half converts to meters & renames
    # run query & output results
    gdf = bc.sql(latlng_to_meters)
    # convert y columns w/ log
    for col in ['pickup_y', 'dropoff_y']:
        gdf[col] = np.log(np.tan(gdf[col])) / (np.pi / 180.0)
    return gdf

In [2]:
%%time
from blazingsql import BlazingContext
# start up BlazingSQL w/o memory pool
bc = BlazingContext(pool=False)

BlazingContext ready
CPU times: user 2.32 s, sys: 532 ms, total: 2.85 s
Wall time: 2.55 s


In [3]:
# %%time
# # copy csv download to new name (to use while downloading)
# !cp yellow_tripdata_2015-01.csv test_taxi.csv

In [4]:
%%time
# create nyc taxi table
bc.create_table('taxi_2015', '/home/jupyter-winston/turbo-telegram/test_taxi.csv', header=0)

CPU times: user 5.72 ms, sys: 6.65 ms, total: 12.4 ms
Wall time: 11.6 ms


<pyblazing.apiv2.context.BlazingTable at 0x7f1bbc7bfd68>

In [5]:
%%time
# how many rows we lookin at?
print((len(bc.sql('select * from taxi_2015'))))
# how's they look?
bc.sql('select * from taxi_2015').tail(2)

12748986
CPU times: user 9.9 s, sys: 5.61 s, total: 15.5 s
Wall time: 15 s


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
12748984,1,2015-01-10 19:01:44,2015-01-10 19:17:03,1,1.3,-73.999565,40.738483,1,N,-73.981819,40.737652,1,10.5,0.0,0.5,2.25,0.0,0.3,13.55
12748985,1,2015-01-10 19:01:45,2015-01-10 19:07:33,1,0.7,-73.96035,40.766399,1,N,-73.968643,40.760777,2,5.5,0.0,0.5,0.0,0.0,0.3,6.3


In [6]:
# gdf = bc.sql('select * from taxi_2015')
# type(gdf['pickup_latitude'][1])

In [7]:
gdf = latlng_to_meters('taxi_2015')

In [8]:
# # save the top 100k for testing
# gdf.head(100000).to_csv('data/test_bsql_converted_taxi.csv')

In [13]:
# read in the csv that was just created 
bc.create_table('test_taxi', '/home/jupyter-winston/turbo-telegram/data/test_bsql_converted_taxi.csv', header=0)
# tag location columns as dataframe
gdf = bc.sql('select pickup_x, pickup_y, dropoff_x, dropoff_y from test_taxi')

In [14]:
try:
    gdf = gdf.to_pandas()
except:
    print('already pandas')
# focus pickup or dropoff location 
loc = 'Pickup Location'
if loc == 'Pickup Location':
    points = hv.Points(gdf, ['pickup_x', 'pickup_y'])
else:
    points = hv.Points(gdf, ['dropoff_x', 'dropoff_y'])
# graph it
taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)
# overlay NYC map & display the graph
map_tiles = EsriImagery().opts(alpha=0.5, width=900, height=480, bgcolor='black')
display(map_tiles * taxi_trips)

already pandas


# these work

In [15]:
# these work
bc.create_table('working_taxi', '/home/jupyter-winston/turbo-telegram/data/nyc_taxi_jan15.csv', header=0)
gdf = bc.sql('select pickup_x, pickup_y, dropoff_x, dropoff_y from working_taxi')
gdf.head()

Unnamed: 0,pickup_x,pickup_y,dropoff_x,dropoff_y
0,-8236963.0,4975553.0,-8234835.0,4975627.0
1,-8237032.0,4974966.0,-8232400.0,4978957.0
2,-8235830.0,4977330.0,-8239167.0,4968526.0
3,-8236971.0,4971216.0,-8239151.0,4969390.0
4,-8236335.0,4973690.0,-8236813.0,4977527.0


In [16]:
try:
    gdf = gdf.to_pandas()
except:
    print('already pandas')
points = hv.Points(gdf, ['pickup_x', 'pickup_y'])
taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)
map_tiles = EsriImagery().opts(alpha=0.5, width=900, height=480, bgcolor='black')
display(map_tiles * taxi_trips)

# these do not work

In [17]:
bc.create_table('test_taxi', '/home/jupyter-winston/turbo-telegram/data/test_bsql_converted_taxi.csv', header=0)
gdf = bc.sql('select pickup_x, pickup_y, dropoff_x, dropoff_y from test_taxi')
gdf.head()

Unnamed: 0,pickup_x,pickup_y,dropoff_x,dropoff_y
0,-8236962.878,44.69615,-8234835.381,44.696819
1,-8237825.768,44.662011,-8237020.631,44.708029
2,-8233561.431,44.765712,-8232278.987,44.794286
3,-8238653.835,44.648255,-8238123.872,44.656393
4,-8234433.662,44.712411,-8238107.735,44.686306


In [18]:
try:
    gdf = gdf.to_pandas()
except:
    print('already pandas')
points = hv.Points(gdf, ['pickup_x', 'pickup_y'])
taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)
map_tiles = EsriImagery().opts(alpha=0.5, width=900, height=480, bgcolor='black')
display(map_tiles * taxi_trips)

# origional conversion 

- not working on set i downloaded 

In [20]:
import pandas as pd
import numpy as np

# Taxi data
def latlng_to_meters(df, lat_name, lng_name):
    lat = df[lat_name]
    lng = df[lng_name]
    origin_shift = 2 * np.pi * 6378137 / 2.0
    mx = lng * origin_shift / 180.0
    my = np.log(np.tan((90 + lat) * np.pi / 360.0)) / (np.pi / 180.0)
    my = my * origin_shift / 180.0
    df.loc[:, lng_name] = mx
    df.loc[:, lat_name] = my

# read in test data
df = pd.read_csv('test_taxi.csv')

# filter lat/lng coordinates
print('Filtering Taxi Data')
df = df.loc[(df.pickup_longitude < -73.75) &
            (df.pickup_longitude > -74.15) &
            (df.dropoff_longitude < -73.75) &
            (df.dropoff_longitude > -74.15) &
            (df.pickup_latitude > 40.68) &
            (df.pickup_latitude < 40.84) &
            (df.dropoff_latitude > 40.68) &
            (df.dropoff_latitude < 40.84)].copy()

# convert to meters
print('Reprojecting Taxi Data')
latlng_to_meters(df, 'pickup_latitude', 'pickup_longitude')
latlng_to_meters(df, 'dropoff_latitude', 'dropoff_longitude')
# rename columns
df.rename(columns={'pickup_longitude': 'pickup_x', 
                   'dropoff_longitude': 'dropoff_x',
                   'pickup_latitude': 'pickup_y', 
                   'dropoff_latitude': 'dropoff_y'},
          inplace=True)

# adding this extra
if len(df) > 100000:
    print(f'len(df) = {len(df)}\n')
    df = df.head(100000)
    print(f'len(df) = {len(df)}\n')

Filtering Taxi Data
Reprojecting Taxi Data
len(df) = 11842094

len(df) = 100000



In [21]:
try:
    gdf = gdf.to_pandas()
except:
    print('already pandas')
points = hv.Points(gdf, ['pickup_x', 'pickup_y'])
taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)
map_tiles = EsriImagery().opts(alpha=0.5, width=900, height=480, bgcolor='black')
display(map_tiles * taxi_trips)

already pandas


In [None]:
# 