In [1]:
!pip install pyspark



In [2]:
import pyspark
# Confirm which PySpark release the kernel actually picked up.
print(pyspark.__version__)

3.5.5


In [3]:
import os

from google.colab import files

# files.upload() opens an interactive file picker and blocks until a file is chosen.
# Skip it when the trip file is already in the working directory so the notebook
# can be re-run top to bottom without manual interaction.
if os.path.exists("yellow_tripdata_2024-10.parquet"):
    uploaded = {}  # nothing new uploaded; keep the name bound for later cells
else:
    uploaded = files.upload()

Saving yellow_tripdata_2024-10.parquet to yellow_tripdata_2024-10.parquet


In [4]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell accesses through `spark`.
spark = (
    SparkSession.builder
    .appName("YellowTaxiOctober2024")
    .getOrCreate()
)

In [5]:
# Load the uploaded trip file as a DataFrame.
df = spark.read.format("parquet").load("yellow_tripdata_2024-10.parquet")

In [6]:
# Redistribute the rows across 4 partitions before writing them out.
df_repartitioned = df.repartition(numPartitions=4)

In [7]:
# Persist the repartitioned data as Parquet, replacing any output left by a previous run.
df_repartitioned.write.format("parquet").mode("overwrite").save("yellow_taxi_oct_2024.parquet")

In [9]:
# Read the written dataset back and preview a few rows as a sanity check.
df_out = spark.read.format("parquet").load("yellow_taxi_oct_2024.parquet")
df_out.show(n=5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2024-10-12 04:16:59|  2024-10-12 04:24:37|              2|         2.07|         1|                 N|         125|         100|           1|       10.7|  1.0|    0.5|       2.

In [10]:
import os

# Path to the directory where Parquet files are stored
parquet_dir = "yellow_taxi_oct_2024.parquet"


def average_parquet_file_size_mb(directory):
    """Return the mean size, in MB, of the ``*.parquet`` files directly inside ``directory``.

    Only names ending in ``.parquet`` are counted, so other entries in the
    directory are ignored.

    Returns ``None`` when the directory does not exist or contains no such
    files, instead of raising ``ZeroDivisionError`` on an empty listing.
    """
    if not os.path.isdir(directory):
        return None

    sizes_bytes = [
        os.path.getsize(os.path.join(directory, name))
        for name in os.listdir(directory)
        if name.endswith(".parquet")
    ]
    if not sizes_bytes:
        return None

    # bytes -> MB first, then average over the number of files
    total_mb = sum(sizes_bytes) / (1024 * 1024)
    return total_mb / len(sizes_bytes)


average_size_mb = average_parquet_file_size_mb(parquet_dir)

if average_size_mb is None:
    print(f"No Parquet files found in {parquet_dir!r}")
else:
    print(f"Average Parquet file size: {average_size_mb:.2f} MB")


Average Parquet file size: 23.04 MB


In [11]:
# Inspect the column names and types of the trip data before querying it.
df.printSchema()


root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [12]:
from pyspark.sql.functions import to_date

# Keep only the trips whose pickup timestamp falls on October 15, 2024
pickup_day = to_date("tpep_pickup_datetime")
df_filtered = df.where(pickup_day == "2024-10-15")

# Number of matching trips
trip_count = df_filtered.count()

print(f"Number of taxi trips on October 15, 2024: {trip_count}")


Number of taxi trips on October 15, 2024: 128893


In [14]:
from pyspark.sql.functions import unix_timestamp

# Trip length = dropoff minus pickup, both taken as epoch seconds, scaled to hours
pickup_seconds = unix_timestamp("tpep_pickup_datetime")
dropoff_seconds = unix_timestamp("tpep_dropoff_datetime")
df_with_duration = df.withColumn(
    "trip_duration_hours",
    (dropoff_seconds - pickup_seconds) / 3600,
)

# A global aggregate yields exactly one row; take its single value
longest_trip = (
    df_with_duration
    .selectExpr("MAX(trip_duration_hours) AS longest_trip_hours")
    .first()[0]
)

print(f"Longest trip duration: {longest_trip:.2f} hours")


Longest trip duration: 162.62 hours


In [16]:
!wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv

--2025-03-15 22:54:25--  https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 52.85.39.117, 52.85.39.97, 52.85.39.65, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|52.85.39.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12331 (12K) [text/csv]
Saving to: ‘taxi_zone_lookup.csv’


2025-03-15 22:54:26 (208 MB/s) - ‘taxi_zone_lookup.csv’ saved [12331/12331]



In [17]:
# Load the zone lookup CSV: the first row is the header and column types are inferred.
zone_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("taxi_zone_lookup.csv")
)
zone_df.show(n=5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [18]:
# NOTE(review): this rebinds `df` from the uploaded file to the repartitioned copy
# written earlier; every cell from here on queries that copy.
df = spark.read.parquet("yellow_taxi_oct_2024.parquet")
df.printSchema()


root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [19]:
# Register both DataFrames as temporary views so they can be referenced by name in spark.sql().
df.createOrReplaceTempView("yellow_taxi")
zone_df.createOrReplaceTempView("zones")

In [20]:
# Pickups per zone name, sorted ascending so the first row is the least frequent zone.
# The inner join means only zones with at least one pickup appear in the result, and
# the ordering is by count alone, so zones tied at the minimum come back in no fixed order.
query = """
SELECT z.Zone, COUNT(y.PULocationID) AS pickup_count
FROM yellow_taxi y
JOIN zones z ON y.PULocationID = z.LocationID
GROUP BY z.Zone
ORDER BY pickup_count ASC
LIMIT 1
"""

least_frequent_rows = spark.sql(query).collect()
least_frequent_zone = least_frequent_rows[0]["Zone"]
print(f"Least frequent pickup location zone: {least_frequent_zone}")


Least frequent pickup location zone: Governor's Island/Ellis Island/Liberty Island
