In [None]:
!pip install pyspark kafka-python delta-spark

In [None]:
#Importing all the necessary libraries
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_timestamp, year, month, dayofmonth, lit, udf, floor, concat, lit, window, desc, expr, current_timestamp
from datetime import timedelta, datetime
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from datetime import datetime

import os
import math
import time

In [None]:
# Build the Spark session with the Delta Lake SQL extension and catalog enabled.
# configure_spark_with_delta_pip() sets "spark.jars.packages" itself, using the
# Delta artifact that matches the pip-installed delta-spark version, so the
# Maven coordinate is deliberately not hard-coded here (a hard-coded value
# would be overwritten anyway and could disagree with the installed version).
builder = (
    SparkSession.builder.appName("project2_debs_grand_challenge")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Column names for the header-less trip CSV, in file order.
columns = [
    "medallion", "hack_license", "pickup_datetime", "dropoff_datetime",
    "trip_time_in_secs", "trip_distance", "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude", "payment_type", "fare_amount",
    "surcharge", "mta_tax", "tip_amount", "tolls_amount", "total_amount"
]

# No schema is supplied, so every column is read as a string at this point.
df = spark.read.option("header", "false").csv("data/sorted_data.csv")
df = df.toDF(*columns)
df.printSchema()

# Work on roughly 10% of the rows (the fraction is a per-row probability, not
# an exact size). The seed is fixed so that a kernel restart followed by
# "Run All" reproduces the same sample and therefore the same results.
sample_seed = 42
df_sample = df.sample(withReplacement=False, fraction=0.1, seed=sample_seed)

In [None]:
# Parse the pickup and drop-off strings into timestamps.
df_sample = (
    df_sample
    .withColumn("pickup_datetime", to_timestamp(col("pickup_datetime"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("dropoff_datetime", to_timestamp(col("dropoff_datetime"), "yyyy-MM-dd HH:mm:ss"))
)

# Row filter for usable trips.
# All columns are still strings here, so every numeric check casts explicitly.
# Comparing a string column with an integer literal (e.g. trip_distance != 0)
# leaves the conversion to Spark's implicit coercion, which can truncate
# fractional values such as "0.50" to 0 and silently drop short trips.
# A NULL or unparseable value makes its comparison NULL, which the filter
# treats as "not kept", so separate isNotNull() checks are only needed for the
# columns that are not compared numerically.
valid_trip = (
    col("pickup_datetime").isNotNull()
    & col("dropoff_datetime").isNotNull()
    & (col("trip_time_in_secs").cast("int") != 0)
    & (col("trip_distance").cast("float") > 0)
    & (col("fare_amount").cast("float") > 0)
)

# Taxi and driver identifiers must be present and not a placeholder value.
for id_column in ("medallion", "hack_license"):
    valid_trip = (
        valid_trip
        & col(id_column).isNotNull()
        & (col(id_column) != "0")
        & (col(id_column) != "UNKNOWN")
    )

# A coordinate of exactly 0.0 is treated as missing.
for coordinate_column in ("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"):
    valid_trip = valid_trip & (col(coordinate_column).cast("double") != 0.0)

df_clean = df_sample.filter(valid_trip)

# Convert relevant columns to appropriate data types for numerical computations
# (same target types as before, so the written schema is unchanged).
df_clean = (
    df_clean
    .withColumn("trip_time_in_secs", col("trip_time_in_secs").cast("int"))
    .withColumn("trip_distance", col("trip_distance").cast("float"))
    .withColumn("fare_amount", col("fare_amount").cast("float"))
    .withColumn("surcharge", col("surcharge").cast("float"))
    .withColumn("mta_tax", col("mta_tax").cast("float"))
    .withColumn("tip_amount", col("tip_amount").cast("float"))
    .withColumn("tolls_amount", col("tolls_amount").cast("float"))
)

df_clean.show(5)

In [None]:
# Number of rows that remain after cleansing (this is a Spark action, so it runs a job).
df_clean.count()

In [None]:
# Time model: derive calendar fields (year, month, day of month) from the
# pickup timestamp so they can be used as columns of the cleansed data.
pickup_ts = col("pickup_datetime")
df_clean = (
    df_clean
    .withColumn("pickup_year", year(pickup_ts))
    .withColumn("pickup_month", month(pickup_ts))
    .withColumn("pickup_day", dayofmonth(pickup_ts))
)

df_clean.show(5)

In [None]:
# Persist the cleansed trips as Parquet, one directory level per
# pickup year / month / day; an existing output is replaced.
output_path = "output/cleansed_taxi_data"
partition_columns = ["pickup_year", "pickup_month", "pickup_day"]
(
    df_clean.write
    .mode("overwrite")
    .format("parquet")
    .partitionBy(*partition_columns)
    .save(output_path)
)

In [None]:
# The latest drop-off in the cleansed data anchors the reporting window.
max_dropoff = df_clean.agg(F.max("dropoff_datetime")).collect()[0][0]
print("Max dropoff datetime:", max_dropoff)

# Window start: 30 minutes before the latest drop-off.
ref_time = max_dropoff - timedelta(minutes=30)

# Side of a grid cell in degrees (0.01 degrees is used as an approximation).
cell_size = 0.01

# Cell identifiers have the form "<lat index>_<lon index>".
pickup_cell_id = concat(
    floor(col("pickup_latitude") / lit(cell_size)),
    lit("_"),
    floor(col("pickup_longitude") / lit(cell_size)),
)
dropoff_cell_id = concat(
    floor(col("dropoff_latitude") / lit(cell_size)),
    lit("_"),
    floor(col("dropoff_longitude") / lit(cell_size)),
)
df_routes = (
    df_clean
    .withColumn("start_cell", pickup_cell_id)
    .withColumn("end_cell", dropoff_cell_id)
)

# Keep only the trips that ended inside the 30-minute window.
df_last30 = df_routes.where(col("dropoff_datetime") >= lit(ref_time))

# Rides per (start cell, end cell) route.
df_frequent_routes = (
    df_last30
    .groupBy("start_cell", "end_cell")
    .count()
    .withColumnRenamed("count", "Number_of_Rides")
)

# The ten routes with the most rides, busiest first.
top10_routes = df_frequent_routes.sort(col("Number_of_Rides").desc()).limit(10)

top10_routes.show(truncate=False)

#### Part 2: Query results must be updated whenever any of the 10 most frequent routes changes.


In [None]:
# Store the cleansed trips as a Delta table (replacing any previous contents)
# and open the same location as a streaming source.
delta_table_path = "/tmp/delta/taxi_data"
df_clean.write.mode("overwrite").format("delta").save(delta_table_path)
df_stream = spark.readStream.format("delta").load(delta_table_path)

In [None]:
# Sanity check: read the Delta table back as a regular (batch) DataFrame
# and look at a few rows.
check_df = spark.read.load("/tmp/delta/taxi_data", format="delta")
check_df.show(5)

In [None]:
# Grid constants for the 500m x 500m grid.
# (grid_origin_lat, grid_origin_lon) locates cell 1.1 of the grid.
grid_origin_lat = 41.474937
grid_origin_lon = -74.913585
# Degrees per 500 m step, as given in the DEBS 2015 Grand Challenge grid
# definition. The rounded values 0.0045 / 0.0060 accumulate an error of more
# than half a cell across the 300 cells of the grid.
delta_lat = 0.004491556   # 500 m towards the south, in degrees of latitude
delta_lon = 0.005986      # 500 m towards the east, in degrees of longitude

# This is your foreachBatch function to process each micro-batch
def process_batch(batch_df, batch_id):
    """Compute the 10 most frequent routes of one micro-batch and append them to ``frequent_routes``.

    Intended as a ``foreachBatch`` callback. Every trip is mapped onto the
    500 m grid; trips whose pickup or drop-off falls outside cells 1..300 are
    ignored. Routes (start cell -> end cell) are counted over the in-grid trips
    whose drop-off lies in the 30 minutes ending at the latest in-grid drop-off
    of the batch. One wide row (trigger pickup/drop-off time, ten start/end
    cell pairs, delay in seconds) is appended to the ``frequent_routes`` table.

    Args:
        batch_df: Micro-batch DataFrame with the pickup/drop-off timestamp and
            coordinate columns. ``ingest_time`` is used for the delay when the
            column is present.
        batch_id: Micro-batch id; only used in log output.

    Reads the module-level ``grid_origin_lat``, ``grid_origin_lon``,
    ``delta_lat``, ``delta_lon`` and ``spark``.
    """
    # Skip empty batches
    if batch_df.rdd.isEmpty():
        return

    def cell_index(offset, step):
        # The grid origin is the centre of cell 1.1 (DEBS 2015 grid
        # definition), so the cell edge lies half a step before it: shift by
        # 0.5 before flooring, otherwise every index is off by half a cell.
        return floor(offset / lit(step) + lit(0.5)) + 1

    def east_index(lon_column):
        # Explicit cast: the coordinate columns may still be strings.
        return cell_index(col(lon_column).cast("double") - lit(grid_origin_lon), delta_lon)

    def south_index(lat_column):
        return cell_index(lit(grid_origin_lat) - col(lat_column).cast("double"), delta_lat)

    # Grid cell IDs ("<east>.<south>") for pickup and drop-off
    gridded = (
        batch_df
        .withColumn("pickup_cell_east", east_index("pickup_longitude"))
        .withColumn("pickup_cell_south", south_index("pickup_latitude"))
        .withColumn(
            "start_cell",
            concat(col("pickup_cell_east").cast("int"), lit("."), col("pickup_cell_south").cast("int")),
        )
        .withColumn("dropoff_cell_east", east_index("dropoff_longitude"))
        .withColumn("dropoff_cell_south", south_index("dropoff_latitude"))
        .withColumn(
            "end_cell",
            concat(col("dropoff_cell_east").cast("int"), lit("."), col("dropoff_cell_south").cast("int")),
        )
    )

    # Filter out trips that are out-of-bounds (only consider cells 1 to 300)
    in_grid = gridded.filter(
        col("pickup_cell_east").between(1, 300)
        & col("pickup_cell_south").between(1, 300)
        & col("dropoff_cell_east").between(1, 300)
        & col("dropoff_cell_south").between(1, 300)
    )

    # The trigger event is the in-grid trip with the latest drop-off. It also
    # anchors the window, so the reported trigger and the window always refer
    # to the same event. The batch may contain no in-grid trip at all, hence
    # the explicit emptiness check instead of indexing blindly.
    latest_rows = in_grid.orderBy(col("dropoff_datetime").desc()).limit(1).collect()
    if not latest_rows or latest_rows[0]["dropoff_datetime"] is None:
        print(f"Batch {batch_id}: no in-grid trip with a dropoff_datetime.")
        return
    trigger_row = latest_rows[0]
    trigger_pickup = trigger_row["pickup_datetime"]
    trigger_dropoff = trigger_row["dropoff_datetime"]
    ref_time = trigger_dropoff - timedelta(minutes=30)

    # Filter for trips with dropoff_datetime >= ref_time (last 30 minutes)
    df_last30 = in_grid.filter(col("dropoff_datetime") >= F.lit(ref_time))
    print(f"Window filter: dropoff_datetime >= {ref_time}")
    print("df_last30 count =", df_last30.count())
    df_last30.show(5)

    # Aggregate routes and get the top 10 most frequent. The cell ids act as
    # tie-breakers so that equal counts always come out in the same order.
    df_frequent_routes = (
        df_last30.groupBy("start_cell", "end_cell")
        .count()
        .withColumnRenamed("count", "Number_of_Rides")
    )
    top10_routes = df_frequent_routes.orderBy(
        col("Number_of_Rides").desc(), col("start_cell"), col("end_cell")
    ).limit(10)
    top10_list = top10_routes.collect()

    # Delay between ingestion of the trigger event and now; None when the
    # batch carries no ingest_time.
    ingest_time = trigger_row.asDict().get("ingest_time")
    delay = (datetime.now() - ingest_time).total_seconds() if ingest_time is not None else None

    # Build the output row; ranks without a route are filled with None
    output_row = {
        "pickup_datetime": trigger_pickup,
        "dropoff_datetime": trigger_dropoff,
        "delay": delay
    }
    for rank in range(1, 11):
        route = top10_list[rank - 1] if rank <= len(top10_list) else None
        output_row[f"start_cell_id_{rank}"] = route["start_cell"] if route is not None else None
        output_row[f"end_cell_id_{rank}"] = route["end_cell"] if route is not None else None

    print(f"Update for batch {batch_id} :", output_row)

    # Explicit output schema: timestamps, ten start/end pairs, then delay
    fields = [
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True)
    ]
    for rank in range(1, 11):
        fields.append(StructField(f"start_cell_id_{rank}", StringType(), True))
        fields.append(StructField(f"end_cell_id_{rank}", StringType(), True))
    fields.append(StructField("delay", DoubleType(), True))
    output_schema = StructType(fields)

    # Create the result DataFrame using the explicit schema
    result_df = spark.createDataFrame([output_row], schema=output_schema)

    # Append to the result table
    result_df.write.mode("append").saveAsTable("frequent_routes")

    result_df.show(truncate=False)

# Give the streaming DataFrame timestamp-typed pickup/drop-off columns and an
# ingest_time column holding the current timestamp.
timestamp_format = "yyyy-MM-dd HH:mm:ss"
for ts_column in ("pickup_datetime", "dropoff_datetime"):
    df_stream = df_stream.withColumn(ts_column, to_timestamp(col(ts_column), timestamp_format))
df_stream = df_stream.withColumn("ingest_time", current_timestamp())

# trigger(once=True): run the query a single time, handing each micro-batch
# to process_batch, then block until it has finished.
stream_writer = (
    df_stream.writeStream
    .trigger(once=True)
    .foreachBatch(process_batch)
    .outputMode("append")
)
query = stream_writer.start()

query.awaitTermination()

In [None]:
# Show the rows appended to the frequent_routes table.
frequent_routes_df = spark.sql("SELECT * FROM frequent_routes")
frequent_routes_df.show()

### Query 2: Profitable Areas

#### Part 1: Report only the 10 most profitable areas

In [None]:
# Grid constants for the 500m x 500m grid.
# (grid_origin_lat, grid_origin_lon) locates cell 1.1 of the grid.
grid_origin_lat = 41.474937
grid_origin_lon = -74.913585
# Degrees per 500 m step, as given in the DEBS 2015 Grand Challenge grid
# definition. The rounded values 0.0045 / 0.0060 accumulate an error of more
# than half a cell across the 300 cells of the grid.
delta_lat = 0.004491556   # 500 m towards the south, in degrees of latitude
delta_lon = 0.005986      # 500 m towards the east, in degrees of longitude

def process_batch_query2(batch_df, batch_id):
    """Compute the 10 most profitable 500 m areas of one micro-batch and append them to ``most_profitable_areas_result``.

    Intended as a ``foreachBatch`` callback. For each grid cell:

    * ``median_profit`` is the approximate median of ``fare_amount + tip_amount``
      over trips picked up in the cell whose drop-off lies in the last
      15 minutes of the batch;
    * ``empty_taxis`` is the number of medallions whose most recent drop-off in
      the batch happened in the cell within the last 30 minutes;
    * ``profitability`` is ``median_profit / empty_taxis``.

    Both windows end at the latest ``dropoff_datetime`` of the batch. Cells
    outside the 1..300 grid range are ignored. One wide row (trigger
    pickup/drop-off time, ten areas, delay in seconds) is appended to the
    result table.

    Args:
        batch_df: Micro-batch DataFrame with trip timestamps, coordinates,
            ``medallion``, ``fare_amount`` and ``tip_amount``. ``ingest_time``
            is used for the delay when the column is present.
        batch_id: Micro-batch id; only used in log output.

    Reads the module-level ``grid_origin_lat``, ``grid_origin_lon``,
    ``delta_lat``, ``delta_lon`` and ``spark``.
    """
    # Skip empty batches
    if batch_df.rdd.isEmpty():
        print(f"Batch {batch_id}: Empty batch.")
        return

    # Reference times are based on the batch's maximum dropoff_datetime
    max_dropoff = batch_df.agg({"dropoff_datetime": "max"}).collect()[0][0]
    if max_dropoff is None:
        print(f"Batch {batch_id}: max_dropoff is None.")
        return
    # For profit: consider trips ending in the last 15 minutes
    ref_time_profit = max_dropoff - timedelta(minutes=15)
    # For empty taxis: consider taxis whose last dropoff was within the last 30 minutes
    ref_time_empty = max_dropoff - timedelta(minutes=30)
    print(f"Batch {batch_id}: ref_time_profit = {ref_time_profit}, ref_time_empty = {ref_time_empty}")

    # Same 1..300 cell range that process_batch applies to this 500 m grid.
    grid_cells = 300

    def cell_index(offset, step):
        # The grid origin is the centre of cell 1.1 (DEBS 2015 grid
        # definition), so the cell edge lies half a step before it: shift by
        # 0.5 before flooring, otherwise every index is off by half a cell.
        return floor(offset / lit(step) + lit(0.5)) + 1

    def east_index(lon_column):
        # Explicit cast: the coordinate columns may still be strings.
        return cell_index(col(lon_column).cast("double") - lit(grid_origin_lon), delta_lon)

    def south_index(lat_column):
        return cell_index(lit(grid_origin_lat) - col(lat_column).cast("double"), delta_lat)

    # Profit aggregate per area (using pickup location)
    profit_df = (
        batch_df.filter(col("dropoff_datetime") >= F.lit(ref_time_profit))
        .withColumn("profit", col("fare_amount") + col("tip_amount"))
        .withColumn("pickup_cell_east", east_index("pickup_longitude"))
        .withColumn("pickup_cell_south", south_index("pickup_latitude"))
        .filter(
            col("pickup_cell_east").between(1, grid_cells)
            & col("pickup_cell_south").between(1, grid_cells)
        )
        .withColumn(
            "pickup_cell",
            concat(col("pickup_cell_east").cast("int"), lit("."), col("pickup_cell_south").cast("int")),
        )
    )
    profit_agg = profit_df.groupBy("pickup_cell") \
        .agg(F.expr("approx_percentile(profit, 0.5) as median_profit"))

    # Empty taxi aggregate per area (using dropoff location): keep each
    # medallion's most recent drop-off; pickup time breaks ties so the chosen
    # row does not depend on partition order.
    w = Window.partitionBy("medallion").orderBy(
        col("dropoff_datetime").desc(), col("pickup_datetime").desc()
    )
    last_dropoff_df = batch_df.withColumn("rn", F.row_number().over(w)) \
        .filter(col("rn") == 1)
    empty_df = (
        last_dropoff_df.filter(col("dropoff_datetime") >= F.lit(ref_time_empty))
        .withColumn("dropoff_cell_east", east_index("dropoff_longitude"))
        .withColumn("dropoff_cell_south", south_index("dropoff_latitude"))
        .filter(
            col("dropoff_cell_east").between(1, grid_cells)
            & col("dropoff_cell_south").between(1, grid_cells)
        )
        .withColumn(
            "dropoff_cell",
            concat(col("dropoff_cell_east").cast("int"), lit("."), col("dropoff_cell_south").cast("int")),
        )
    )
    empty_agg = empty_df.groupBy("dropoff_cell") \
        .agg(F.countDistinct("medallion").alias("empty_taxis"))

    # Join the two aggregates on the cell identifier.
    area_df = profit_agg.join(empty_agg, profit_agg.pickup_cell == empty_agg.dropoff_cell, "inner") \
        .select(profit_agg.pickup_cell.alias("cell_id"), "median_profit", "empty_taxis") \
        .filter(col("empty_taxis") > 0) \
        .withColumn("profitability", col("median_profit") / col("empty_taxis"))

    # Top 10 by profitability; cell_id breaks ties deterministically.
    top10_areas = area_df.orderBy(col("profitability").desc(), col("cell_id")).limit(10)
    top10_list = top10_areas.collect()

    # Trigger event: the trip with the latest drop-off (the batch is known to
    # be non-empty at this point, so index 0 exists).
    trigger_row = batch_df.orderBy(col("dropoff_datetime").desc()).limit(1).collect()[0]
    trigger_pickup = trigger_row["pickup_datetime"]
    trigger_dropoff = trigger_row["dropoff_datetime"]
    # Without an ingest_time there is nothing meaningful to measure against
    # (the trip's own drop-off time would yield a bogus delay), so report None.
    ingest_time = trigger_row.asDict().get("ingest_time")
    delay = (datetime.now() - ingest_time).total_seconds() if ingest_time is not None else None

    # Build the output row; ranks without an area are filled with None.
    output_row = {
        "pickup_datetime": trigger_pickup,
        "dropoff_datetime": trigger_dropoff,
        "delay": delay
    }
    for rank in range(1, 11):
        area = top10_list[rank - 1] if rank <= len(top10_list) else None
        has_area = area is not None
        output_row[f"profitable_cell_id_{rank}"] = area["cell_id"] if has_area else None
        # Stored as a string to match the StringType column of the result table.
        output_row[f"empty_taxies_in_cell_id_{rank}"] = str(area["empty_taxis"]) if has_area else None
        output_row[f"median_profit_in_cell_id_{rank}"] = area["median_profit"] if has_area else None
        output_row[f"profitability_of_cell_{rank}"] = area["profitability"] if has_area else None

    print(f"Update for batch {batch_id}:", output_row)

    fields = [
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True)
    ]
    for rank in range(1, 11):
        fields.extend([
            StructField(f"profitable_cell_id_{rank}", StringType(), True),
            StructField(f"empty_taxies_in_cell_id_{rank}", StringType(), True),
            StructField(f"median_profit_in_cell_id_{rank}", DoubleType(), True),
            StructField(f"profitability_of_cell_{rank}", DoubleType(), True)
        ])
    fields.append(StructField("delay", DoubleType(), True))
    output_schema = StructType(fields)

    result_df = spark.createDataFrame([output_row], schema=output_schema)
    # Append to the result table
    result_df.write.mode("append").saveAsTable("most_profitable_areas_result")

    result_df.show(truncate=False)

# (Re)stamp the streaming DataFrame with an ingest_time column.
df_stream = df_stream.withColumn("ingest_time", current_timestamp())

# Run the query a single time, handing each micro-batch to
# process_batch_query2, then block until it has finished.
query2_writer = (
    df_stream.writeStream
    .trigger(once=True)
    .foreachBatch(process_batch_query2)
    .outputMode("append")
)
query2 = query2_writer.start()

query2.awaitTermination()


In [None]:
# Show the rows appended to the most_profitable_areas_result table.
most_profitable_areas_df = spark.sql("SELECT * FROM most_profitable_areas_result")
most_profitable_areas_df.show()

#### Part 2: The resulting stream of the query provides the 10 most profitable areas

In [None]:
# Grid constants for the 250m x 250m grid.
# The grid's center of cell 1.1 remains at (41.474937, -74.913585).
# For 250m resolution the step is half of the 500 m step. The 500 m steps used
# here are the values from the DEBS 2015 Grand Challenge grid definition
# (0.004491556 deg latitude, 0.005986 deg longitude) rather than the rounded
# 0.0045 / 0.0060, whose error accumulates across the grid.
new_delta_lat = 0.004491556 / 2   # ≈0.002245778
new_delta_lon = 0.005986 / 2      # ≈0.002993

# Define the foreachBatch function for Query 2 Part 2
def process_batch_query2_part2(batch_df, batch_id):
    """Compute the 10 most profitable 250 m areas of one micro-batch and append them to ``profitable_areas_streaming_result``.

    Intended as a ``foreachBatch`` callback. For each grid cell:

    * ``median_profit`` is the approximate median of ``fare_amount + tip_amount``
      over trips picked up in the cell whose drop-off lies in the last
      15 minutes of the batch;
    * ``empty_taxis`` is the number of medallions whose most recent drop-off in
      the batch happened in the cell within the last 30 minutes;
    * ``profitability`` is ``median_profit / empty_taxis``.

    Both windows end at the latest ``dropoff_datetime`` of the batch. Cells
    outside the 1..600 grid range are ignored. One wide row (trigger
    pickup/drop-off time, ten areas, delay in seconds) is appended to the
    result table.

    Args:
        batch_df: Micro-batch DataFrame with trip timestamps, coordinates,
            ``medallion``, ``fare_amount`` and ``tip_amount``. ``ingest_time``
            is used for the delay when the column is present.
        batch_id: Micro-batch id; only used in log output.

    Reads the module-level ``grid_origin_lat``, ``grid_origin_lon``,
    ``new_delta_lat``, ``new_delta_lon`` and ``spark``.
    """
    # Skip empty batches
    if batch_df.rdd.isEmpty():
        print(f"Batch {batch_id}: Empty batch.")
        return

    # Reference times are based on the batch's maximum dropoff_datetime.
    max_dropoff = batch_df.agg({"dropoff_datetime": "max"}).collect()[0][0]
    if max_dropoff is None:
        print(f"Batch {batch_id}: max_dropoff is None.")
        return
    # For profit computation, consider trips that ended in the last 15 minutes.
    ref_time_profit = max_dropoff - timedelta(minutes=15)
    # For empty taxis, consider taxis whose last dropoff was within the last 30 minutes.
    ref_time_empty = max_dropoff - timedelta(minutes=30)
    print(f"Batch {batch_id}: ref_time_profit = {ref_time_profit}, ref_time_empty = {ref_time_empty}")

    # process_batch limits the 500 m grid to cells 1..300; with half the step
    # the same extent corresponds to cells 1..600.
    grid_cells = 600

    def cell_index(offset, step):
        # The grid origin is the centre of cell 1.1, so the cell edge lies
        # half a step before it: shift by 0.5 before flooring, otherwise
        # every index is off by half a cell.
        return floor(offset / lit(step) + lit(0.5)) + 1

    def east_index(lon_column):
        # Explicit cast: the coordinate columns may still be strings.
        return cell_index(col(lon_column).cast("double") - lit(grid_origin_lon), new_delta_lon)

    def south_index(lat_column):
        return cell_index(lit(grid_origin_lat) - col(lat_column).cast("double"), new_delta_lat)

    # Profit aggregate per area (using pickup location).
    # Only consider trips with dropoff_datetime >= ref_time_profit.
    profit_df = (
        batch_df.filter(col("dropoff_datetime") >= F.lit(ref_time_profit))
        .withColumn("profit", col("fare_amount") + col("tip_amount"))
        .withColumn("pickup_cell_east", east_index("pickup_longitude"))
        .withColumn("pickup_cell_south", south_index("pickup_latitude"))
        .filter(
            col("pickup_cell_east").between(1, grid_cells)
            & col("pickup_cell_south").between(1, grid_cells)
        )
        .withColumn(
            "pickup_cell",
            concat(col("pickup_cell_east").cast("int"), lit("."), col("pickup_cell_south").cast("int")),
        )
    )
    profit_agg = profit_df.groupBy("pickup_cell") \
        .agg(F.expr("approx_percentile(profit, 0.5) as median_profit"))

    # Empty taxi aggregate per area (using dropoff location): keep each
    # medallion's most recent drop-off; pickup time breaks ties so the chosen
    # row does not depend on partition order.
    w = Window.partitionBy("medallion").orderBy(
        col("dropoff_datetime").desc(), col("pickup_datetime").desc()
    )
    last_dropoff_df = batch_df.withColumn("rn", F.row_number().over(w)) \
        .filter(col("rn") == 1)
    empty_df = (
        last_dropoff_df.filter(col("dropoff_datetime") >= F.lit(ref_time_empty))
        .withColumn("dropoff_cell_east", east_index("dropoff_longitude"))
        .withColumn("dropoff_cell_south", south_index("dropoff_latitude"))
        .filter(
            col("dropoff_cell_east").between(1, grid_cells)
            & col("dropoff_cell_south").between(1, grid_cells)
        )
        .withColumn(
            "dropoff_cell",
            concat(col("dropoff_cell_east").cast("int"), lit("."), col("dropoff_cell_south").cast("int")),
        )
    )
    empty_agg = empty_df.groupBy("dropoff_cell") \
        .agg(F.countDistinct("medallion").alias("empty_taxis"))

    # Join the aggregates on the cell identifier.
    # (The area is defined by the same grid cell for pickup and dropoff.)
    area_df = profit_agg.join(empty_agg, profit_agg.pickup_cell == empty_agg.dropoff_cell, "inner") \
        .select(profit_agg.pickup_cell.alias("cell_id"), "median_profit", "empty_taxis") \
        .filter(col("empty_taxis") > 0) \
        .withColumn("profitability", col("median_profit") / col("empty_taxis"))

    # Top 10 areas by profitability; cell_id breaks ties deterministically.
    top10_areas = area_df.orderBy(col("profitability").desc(), col("cell_id")).limit(10)
    top10_list = top10_areas.collect()

    # Trigger event: the trip with the latest drop-off (the batch is known to
    # be non-empty at this point, so index 0 exists).
    trigger_row = batch_df.orderBy(col("dropoff_datetime").desc()).limit(1).collect()[0]
    trigger_pickup = trigger_row["pickup_datetime"]
    trigger_dropoff = trigger_row["dropoff_datetime"]
    # Without an ingest_time there is nothing meaningful to measure against
    # (the trip's own drop-off time would yield a bogus delay), so report None.
    ingest_time = trigger_row.asDict().get("ingest_time")
    delay = (datetime.now() - ingest_time).total_seconds() if ingest_time is not None else None

    # Build the output row.
    # Columns: pickup_datetime, dropoff_datetime, then for each of the 10 areas
    # profitable_cell_id_i, empty_taxies_in_cell_id_i, median_profit_in_cell_id_i,
    # profitability_of_cell_i, and finally delay. Ranks without an area are None.
    output_row = {
        "pickup_datetime": trigger_pickup,
        "dropoff_datetime": trigger_dropoff,
        "delay": delay
    }
    for rank in range(1, 11):
        area = top10_list[rank - 1] if rank <= len(top10_list) else None
        has_area = area is not None
        output_row[f"profitable_cell_id_{rank}"] = area["cell_id"] if has_area else None
        output_row[f"empty_taxies_in_cell_id_{rank}"] = area["empty_taxis"] if has_area else None  # as integer
        output_row[f"median_profit_in_cell_id_{rank}"] = area["median_profit"] if has_area else None
        output_row[f"profitability_of_cell_{rank}"] = area["profitability"] if has_area else None

    print(f"Update for batch {batch_id}:", output_row)

    # Define the output schema.
    out_fields = [
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True)
    ]
    for rank in range(1, 11):
        out_fields.extend([
            StructField(f"profitable_cell_id_{rank}", StringType(), True),
            StructField(f"empty_taxies_in_cell_id_{rank}", IntegerType(), True),
            StructField(f"median_profit_in_cell_id_{rank}", DoubleType(), True),
            StructField(f"profitability_of_cell_{rank}", DoubleType(), True)
        ])
    out_fields.append(StructField("delay", DoubleType(), True))
    output_schema = StructType(out_fields)

    result_df = spark.createDataFrame([output_row], schema=output_schema)
    # Append to the result table
    result_df.write.mode("append").saveAsTable("profitable_areas_streaming_result")

    result_df.show(truncate=False)

# (Re)stamp the streaming DataFrame (df_stream) with an ingest_time column.
df_stream = df_stream.withColumn("ingest_time", current_timestamp())

# Run the query a single time, handing each micro-batch to
# process_batch_query2_part2, then block until it has finished.
query2_part2_writer = (
    df_stream.writeStream
    .trigger(once=True)
    .foreachBatch(process_batch_query2_part2)
    .outputMode("append")
)
query2_part2 = query2_part2_writer.start()

query2_part2.awaitTermination()


In [None]:
# Show the rows appended to the profitable_areas_streaming_result table.
profitable_areas_streaming_df = spark.sql("SELECT * FROM profitable_areas_streaming_result")
profitable_areas_streaming_df.show()