## Synapse Spark NYC Taxi queries
This notebook demonstrates how Spark SQL can be used to interact with a data lake. It is the recommended starting point for many data scientists and analysts who want to work with a data lake.

This notebook and other examples are available at https://github.com/datakickstart/synapse_examples

In [0]:
-- Remove any previous definition of the CSV-backed table so the CREATE
-- statement for synapse_trips_external_csv can be re-run from a clean state.
DROP TABLE IF EXISTS
    synapse_trips_external_csv

In [1]:
-- External table over the comma-delimited yellow-taxi trip files stored under
-- the LOCATION below. Only the schema is registered here; the files themselves
-- are read in place.
CREATE EXTERNAL TABLE synapse_trips_external_csv
(
VendorID int,
-- Pickup/dropoff timestamps are kept as strings; later queries in this
-- notebook rely on string functions and string comparison against them.
tpep_pickup_datetime string,
tpep_dropoff_datetime string,
passenger_count int,
trip_distance float,
RatecodeID int,
store_and_fwd_flag string,
PULocationID int,
DOLocationID int,
payment_type int,
fare_amount float,
extra float,
mta_tax float,
-- tip_amount and tolls_amount were declared int, unlike every other monetary
-- column. Fractional amounts in the text files would not be read correctly as
-- int, which would distort aggregates such as avg(tip_amount).
tip_amount float,
tolls_amount float,
improvement_surcharge float,
total_amount float,
-- NOTE(review): left as string to avoid changing the type seen by existing
-- consumers; it looks like a monetary amount and may deserve a numeric type.
congestion_surcharge string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
-- Ask the reader to skip the header line of each file.
-- NOTE(review): confirm the Spark runtime in use honors this property.
TBLPROPERTIES ("skip.header.line.count"="1")
LOCATION "abfss://demo@dvtrainingadls.dfs.core.windows.net/nyctaxi/tripdata/yellow/2019"


In [2]:
-- Row count across every file behind the external CSV table.
SELECT
    count(1)
FROM synapse_trips_external_csv

In [3]:
-- Preview up to 10 trips picked up on 2019-12-01.
-- tpep_pickup_datetime is a string column, so these comparisons are
-- lexicographic (assumes values start with 'yyyy-MM-dd' -- TODO confirm).
-- A half-open range states the one-day window explicitly instead of relying
-- on BETWEEN's inclusive upper bound.
SELECT *
FROM synapse_trips_external_csv
WHERE tpep_pickup_datetime >= '2019-12-01'
  AND tpep_pickup_datetime < '2019-12-02'
LIMIT 10

In [4]:
-- Show the column names and data types registered for the external CSV table.
DESCRIBE TABLE synapse_trips_external_csv

In [5]:
-- Remove any previous copy of the Delta table so the CREATE TABLE statement
-- for synapse_yellow_trips_delta can be re-run.
DROP TABLE IF EXISTS
    synapse_yellow_trips_delta

In [6]:
-- Materialize the CSV-backed trips as a Delta table partitioned by pickup month.
-- year_month is the first 7 characters of the pickup timestamp string with
-- '-' replaced by '_' (e.g. '2019-12...' becomes '2019_12').
CREATE TABLE synapse_yellow_trips_delta
USING DELTA
PARTITIONED BY (year_month)
AS
SELECT
    replace(left(tpep_pickup_datetime, 7), '-', '_') AS year_month,
    *
FROM synapse_trips_external_csv

In [7]:
-- Trip count and average tip per payment type for a single month partition,
-- most frequent payment type first.
-- The partition filter previously read '2018_12', but the source table points
-- at the 2019 trip-data folder and the sample query in this notebook looks at
-- December 2019, so the filter now targets '2019_12'.
-- NOTE(review): restore '2018_12' if that month was intentional.
SELECT
    payment_type,
    count(1) AS record_count,
    avg(tip_amount) AS avg_tip
FROM synapse_yellow_trips_delta
WHERE payment_type IS NOT NULL
  AND year_month = '2019_12'
GROUP BY payment_type
ORDER BY record_count DESC