## PROJECT 1 - NYC Taxi Data Analysis

This notebook demonstrates how to analyze a sample of NYC Taxi trip data using PySpark and Shapely.  
We will:

1. Load the CSV data
2. Filter out outliers (invalid or overly long trips)  
3. Enrich the data with borough names using GeoJSON and Shapely  
4. Compute several queries:
   - **Query 1**: Taxi utilization  
   - **Query 2**: Average time to find the next fare (per destination borough)  
   - **Query 3**: Number of trips starting and ending in the same borough  
   - **Query 4**: Number of trips that start in one borough and end in another  


In [1]:
%pip install kafka-python

Note: you may need to restart the kernel to use updated packages.


## 1. Imports and Spark Session

In [63]:
import json
# from shapely.geometry import shape, Point

# PySpark imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [64]:
columns = [
    "medallion",
    "hack_license",
    "pickup_datetime",
    "dropoff_datetime",
    "trip_time_in_secs",
    "trip_distance",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "payment_type",
    "fare_amount",
    "surcharge",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "total"  # The 17th column
]

df = pd.read_csv(
    "data/sample.csv",
    header=None,
    names=columns,
    nrows=10_000
)

print(df.head())

                          medallion                      hack_license  \
0  5EE2C4D3BF57BDB455E74B03B89E43A7  E96EF8F6E6122591F9465376043B946D   
1  42730E78D8BE872B52598742914DECFF  6016A71F1D29D678E87D36856ED918A7   
2  CA6CD9BAED6A85E430F7BFC0BC84ABD0  77FFDF38272A6006517D53EDA14333E2   
3  15162141EA7436635C696F5BC023D2D6  CDCB7729DE07243726FF7BB0BD5D06BF   
4  025B98A22ED771118FC0EB44A0D3BD9D  7D89374F8E98F30A19F2381EC71A16BA   

       pickup_datetime     dropoff_datetime  trip_time_in_secs  trip_distance  \
0  2013-01-01 00:00:09  2013-01-01 00:00:36                 26           0.10   
1  2013-01-01 00:01:00  2013-01-01 00:01:00                  0           0.01   
2  2013-01-01 00:00:20  2013-01-01 00:01:22                 61           2.20   
3  2013-01-01 00:00:14  2013-01-01 00:01:37                 83           0.20   
4  2013-01-01 00:00:40  2013-01-01 00:01:40                 60           0.30   

   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude 

In [65]:
# Initialize Kafka producer
producer = KafkaProducer(bootstrap_servers='kafka:9092')  # or "localhost:9092" if not in Docker

# Convert each row to JSON and send to Kafka topic "nyc-taxi-raw"
for _, row in df.iterrows():
    row_dict = row.to_dict()  # e.g. {"medallion": "5EE2C4...", "hack_license": "E96EF8F6E6...", ...}
    producer.send("nyc-taxi-clean", json.dumps(row_dict).encode())

producer.flush()
print("✅ Finished sending data to Kafka!")

✅ Finished sending data to Kafka!


In [66]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, from_json, to_timestamp, year, month, dayofmonth

# 1. Define the schema
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", StringType(), True),
    StructField("dropoff_datetime", StringType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", StringType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", StringType(), True),
    StructField("total", DoubleType(), True)
])


In [67]:

# 2. Start Spark

spark = SparkSession.builder \
    .appName("debs_grand_challenge") \
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
        "org.apache.spark:spark-token-provider-kafka-0-10_2.12:3.5.1"
    ) \
    .getOrCreate()

In [68]:

kafka_df = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "nyc-taxi-clean") \
    .load()

kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [69]:
kafka_df.head()

Row(key=None, value=bytearray(b'{"medallion": "5EE2C4D3BF57BDB455E74B03B89E43A7", "hack_license": "E96EF8F6E6122591F9465376043B946D", "pickup_datetime": "2013-01-01 00:00:09", "dropoff_datetime": "2013-01-01 00:00:36", "trip_time_in_secs": 26, "trip_distance": 0.1, "pickup_longitude": -73.99221, "pickup_latitude": 40.725124, "dropoff_longitude": -73.991646, "dropoff_latitude": 40.726658, "payment_type": "CSH", "fare_amount": 2.5, "surcharge": 0.5, "mta_tax": "0.50.1", "tip_amount": 0.0, "tolls_amount": "0.00.1", "total": 3.5}'), topic='nyc-taxi-clean', partition=0, offset=0, timestamp=datetime.datetime(2025, 3, 22, 14, 19, 54, 398000), timestampType=0)

In [70]:
raw_str_df = kafka_df.selectExpr("CAST(value AS STRING) AS raw_string")

raw_str_df.show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|raw_string                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [71]:
from pyspark.sql.functions import from_json, col

parsed_df = raw_str_df.select(
    from_json(col("raw_string"), schema).alias("data")
).select("data.*")

parsed_df.show(5, truncate=False)
parsed_df.printSchema()

+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+-----+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total|
+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+-----+
|5EE2C4D3BF57BDB455E74B03B89E43A7|E96EF8F6E6122591F9465376043B946D|2013-01-01 00:00:09|2013-01-01 00:00:36|26               |0.1          |-73.99221       |4

In [72]:
df_clean = parsed_df.dropna()

df_clean = df_clean.filter(
    (col("trip_time_in_secs") > 0) &
    (col("trip_distance") > 0) &
    (col("pickup_longitude") != 0.0) &
    (col("pickup_latitude") != 0.0) &
    (col("dropoff_longitude") != 0.0) &
    (col("dropoff_latitude") != 0.0)
)

In [73]:
df_clean.show(5)

+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+-----+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total|
+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+-----+
|5EE2C4D3BF57BDB45...|E96EF8F6E6122591F...|2013-01-01 00:00:09|2013-01-01 00:00:36|               26|          0.1|       -73.99221|      40.725124|       -73.991646|       40.726658|         CSH|        2.5|      0.5| 0.50.1|   

In [74]:
df_clean.count()

29175

In [75]:
kafka_df.count()

30000

In [76]:


# 6. Add time columns
df_time = df_clean.withColumn(
    "pickup_ts", to_timestamp("pickup_datetime", "yyyy-MM-dd HH:mm:ss")
).withColumn("year", year("pickup_ts")) \
 .withColumn("month", month("pickup_ts")) \
 .withColumn("day", dayofmonth("pickup_ts"))


In [78]:

# 7. Partitioned Parquet Output
df_time.write \
    .partitionBy("year", "month", "day") \
    .mode("overwrite") \
    .parquet("data/kafka_cleaned_partitioned")

spark.stop()
print("✅ Done reading from Kafka, cleaning, and writing partitioned data!")

✅ Done reading from Kafka, cleaning, and writing partitioned data!


In [81]:
df_check = spark.read.parquet("data/kafka_cleaned_partitioned/year=2013/month=1/day=1/part-00000-3b5227ff-fe95-41b2-8463-905a3756fde3.c000.snappy.parquet")
df_check.show(10, truncate=False)
df_check.printSchema()

Py4JJavaError: An error occurred while calling o371.parquet.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:840)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:122)
	at org.apache.spark.SparkContext.defaultParallelism(SparkContext.scala:2702)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.mergeSchemasInParallel(SchemaMergeUtils.scala:63)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:497)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$.inferSchema(ParquetUtils.scala:132)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:79)
	at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:208)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:205)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:407)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:563)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
