# Module 6 Homework — Apache Spark
Data Engineering Zoomcamp 2026

Dataset: Yellow Taxi November 2025

## Setup

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('homework6')
    .getOrCreate()
)

# Have DataFrames evaluated eagerly when they are the last expression in a cell.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

## Question 1 — Spark version

In [None]:
# Print the version string reported by the active Spark session.
print(spark.version)

## Question 2 — Average parquet file size
Read the November 2025 Yellow Taxi data, repartition it into 4 partitions, and save it as Parquet.

In [None]:
# Load the raw trip records, then report the row count and column types.
df = spark.read.format('parquet').load('data/yellow_tripdata_2025-11.parquet')
print('Total rows: {:,}'.format(df.count()))
df.printSchema()

In [None]:
# Shuffle the data into 4 partitions and write them out, replacing any previous output.
(
    df.repartition(4)
    .write
    .mode('overwrite')
    .parquet('data/yellow_2025_11_repartitioned')
)

In [None]:
import os

# Summarise the sizes of the Parquet files in the repartitioned output directory.
output_dir = 'data/yellow_2025_11_repartitioned'

# Keep only entries ending in '.parquet'; sort them because os.listdir returns
# names in arbitrary order, which would make the printed size list unstable.
parquet_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.parquet'))

# File sizes converted from bytes to mebibytes.
sizes_mb = [os.path.getsize(os.path.join(output_dir, f)) / (1024 * 1024) for f in parquet_files]

print(f'Number of parquet files: {len(parquet_files)}')
print(f'File sizes (MB): {[round(s, 2) for s in sizes_mb]}')

# Guard the average: an empty directory would otherwise raise ZeroDivisionError.
if sizes_mb:
    print(f'Average size: {sum(sizes_mb)/len(sizes_mb):.2f} MB')
else:
    print(f'Average size: n/a (no .parquet files found in {output_dir})')

## Question 3 — Trips on November 15th

In [None]:
# Count trips whose pickup timestamp falls on the calendar date 2025-11-15.
count_nov15 = df.where(F.to_date('tpep_pickup_datetime') == '2025-11-15').count()

print('Trips on November 15th: {:,}'.format(count_nov15))

## Question 4 — Longest trip in hours

In [None]:
# Trip duration in hours: difference of the dropoff and pickup epoch seconds, divided by 3600.
df_duration = df.withColumn(
    'duration_hours',
    (
        F.unix_timestamp(F.col('tpep_dropoff_datetime'))
        - F.unix_timestamp(F.col('tpep_pickup_datetime'))
    ) / 3600,
)

# The global aggregate yields a single row; take its only value.
max_duration = df_duration.agg(F.max('duration_hours')).first()[0]
print('Longest trip: {:.1f} hours'.format(max_duration))

## Question 5 — Spark UI port

By default, the Spark UI runs on port **4040** (if that port is already in use, Spark tries 4041, 4042, and so on).

Access it at: http://localhost:4040

## Question 6 — Least frequent pickup zone

In [None]:
# Load zone lookup
zones = spark.read.option('header', 'true').csv('data/taxi_zone_lookup.csv')
zones.show(5)

In [None]:
# Count pickups per zone and join with zone names
pickup_counts = df.groupBy('PULocationID').count()

result = pickup_counts \
    .join(zones, pickup_counts['PULocationID'] == zones['LocationID'], 'left') \
    .select('Zone', 'count') \
    .orderBy('count') \
    .limit(10)

result.show(truncate=False)

In [None]:
# Shut down the Spark session created in the Setup cell.
spark.stop()