In [7]:
import sys
from pyspark.sql import SparkSession

In [8]:
# Build the notebook's SparkSession (getOrCreate returns an existing session if there is one).
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("spark-app-version-x")
    .getOrCreate()
)

# local[1] - run Spark locally with 1 worker thread (i.e. no parallelism at all).
# local[*] - run Spark locally with as many worker threads as logical cores on your machine.

In [9]:
# Load the yellow-taxi trip records from the local Parquet file into a DataFrame.
local_file = 'data/yellow_tripdata_2023-01.parquet'
df = spark.read.format('parquet').load(local_file)

In [10]:
# Inspect the schema: prints each column's name, data type, and nullability as a tree.
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [13]:
# Sample DataFrame-API query: keep VendorID and total_amount for trips with
# total_amount > 1, and display the first 5 rows.
(
    df.select('VendorID', 'total_amount')
    .filter('total_amount > 1')
    .show(5)
)

+--------+------------+
|VendorID|total_amount|
+--------+------------+
|       2|        14.3|
|       2|        16.9|
|       2|        34.9|
|       1|       20.85|
|       2|       19.68|
+--------+------------+
only showing top 5 rows



In [14]:
# Simple query using Spark SQL: first register the DataFrame as a temporary view
# (replacing any existing view with the same name) so it can be referenced by
# name from spark.sql().
df.createOrReplaceTempView("tbl_raw_yellow_taxis")

In [17]:
# SQL statement against the temp view: all columns for trips with
# total_amount > 1 and passenger_count > 2; display the first 5 rows.
spark.sql(
    'select * from tbl_raw_yellow_taxis where total_amount > 1 and passenger_count > 2'
).show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-01-01 00:43:37|  2023-01-01 01:17:18|            4.0|          7.3|       1.0|                 N|          79|         264|           1|       33.8|  3.5|    0.5|      7.7

In [18]:
# Stop the Spark session created at the top of the notebook; run this cell last.
spark.stop()