<a href="https://colab.research.google.com/github/gumdropsteve/silent-disco/blob/master/bsql_table_from_pandas_cuDF_and_BlazingSQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Data Check / Download
The cell below checks whether you already have the data for this demo and downloads it if you don't.

In [8]:
import os
# import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded
import urllib.request

# check for existence of data directory
if not os.path.exists('data/'):
    # it does not, so let us know & create it
    print('creating data directory')
    # os.makedirs avoids shelling out (portable) and exist_ok keeps re-runs safe
    os.makedirs('data', exist_ok=True)

# base url to raw data on GitHub
base_url = 'https://raw.githubusercontent.com/gumdropsteve/turbo-telegram/master/data/'

# download jan-mar 2015 taxi data, skipping any file already on disk
for month in ['jan', 'feb', 'march']:
    fn = f'nyc_taxi_{month}15.csv'
    # check if we already have the file
    if not os.path.isfile(f'data/{fn}'):
        # we don't, so let us know we're downloading it now
        print(f'Downloading {base_url + fn} to data/{fn}')
        # download file
        urllib.request.urlretrieve(base_url + fn, f'data/{fn}')
    # we already have data
    else:
        # let us know
        print(f'{fn} already downloaded')

# identify current working directory & wildcard path to data
# (data_path matches all three monthly CSV files at once)
cwd = os.getcwd()
data_path = f'{cwd}/data/nyc_taxi_*.csv'

creating data directory
Downloading https://raw.githubusercontent.com/gumdropsteve/turbo-telegram/master/data/nyc_taxi_jan15.csv to data/nyc_taxi_jan15.csv
Downloading https://raw.githubusercontent.com/gumdropsteve/turbo-telegram/master/data/nyc_taxi_feb15.csv to data/nyc_taxi_feb15.csv
Downloading https://raw.githubusercontent.com/gumdropsteve/turbo-telegram/master/data/nyc_taxi_march15.csv to data/nyc_taxi_march15.csv


## Imports

In [0]:
import cudf
import pandas as pd
from blazingsql import BlazingContext
# connect to BlazingSQL: `bc` is the BlazingContext instance that every later
# cell uses to create tables (bc.create_table) and run queries (bc.sql)
bc = BlazingContext()

### pandas -> BlazingSQL

In [0]:
%%time
# create pandas DataFrame
df = pd.read_csv('data/nyc_taxi_jan15.csv')

# create BlazingSQL table from pandas DataFrame
bc.create_table('pd_taxi', df)

# query BlazingSQL table & display last 3 rows
bc.sql('SELECT * FROM pd_taxi').tail(3)

### cuDF -> BlazingSQL

In [0]:
%%time
# create pandas DataFrame
df = pd.read_csv('data/nyc_taxi_jan15.csv')

# create BlazingSQL table from pandas DataFrame
bc.create_table('pd_taxi', df)

# query BlazingSQL table & display last 3 rows
bc.sql('SELECT * FROM pd_taxi').tail(3)

### BlazingSQL -> BlazingSQL

In [0]:
%%time
# create BlazingSQL table from 3 CSV files jan-mar 2015
bc.create_table('q1_2015', '/home/winston@blazingdb.com/turbo-telegram/data/nyc_taxi_*.csv', header=0)

# create BlazingSQL table from BlazingSQL query results
bc.create_table('single_10mile', bc.sql('SELECT * FROM q1_2015 WHERE trip_distance > 10 AND passenger_count = 1'))

# pull payment info from january query results table
bc.sql('SELECT payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, total_amount FROM single_10mile')