In [3]:
%%configure -f
{
  "conf": {
    "spark.executor.instances": "2",
    "spark.executor.cores": "1",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "2g"
  }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
322,application_1764662801237_0324,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
260,application_1764662801237_0263,pyspark,idle,Link,Link,,
275,application_1764662801237_0278,pyspark,idle,Link,Link,,
277,application_1764662801237_0280,pyspark,idle,Link,Link,,
278,application_1764662801237_0281,pyspark,idle,Link,Link,,
279,application_1764662801237_0282,pyspark,idle,Link,Link,,
280,application_1764662801237_0283,pyspark,idle,Link,Link,,
281,application_1764662801237_0284,pyspark,idle,Link,Link,,
288,application_1764662801237_0291,pyspark,idle,Link,Link,,
300,application_1764662801237_0302,pyspark,idle,Link,Link,,
301,application_1764662801237_0303,pyspark,idle,Link,Link,,


In [4]:
def run_query_4():
    """Build the Query 4 DataFrame: per police division, the number of crimes
    whose nearest station belongs to that division and their average distance.

    Steps:
      1. Load the police stations CSV (X -> longitude, Y -> latitude).
      2. Load and union the two crime CSVs, keeping DR_NO and coordinates,
         dropping rows with missing coordinates or the (0, 0) placeholder.
      3. Cross join crimes with stations and compute the haversine distance.
      4. Keep, for every DR_NO, the single closest station.
      5. Aggregate per division and order by crime count, descending.

    Only transformations are chained here; the returned DataFrame is computed
    when the caller runs an action on it. The printed "build time" therefore
    covers constructing the plan, not executing the query.

    Returns:
        DataFrame with columns ``division``, ``average_distance`` (rounded to
        3 decimals, in kilometres given the 6371.0 Earth radius) and ``#``
        (number of crimes), ordered by ``#`` descending.
    """
    import time
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import (
        col, count, avg, round as spark_round,
        radians, sin, cos, sqrt, asin, struct
    )
    from pyspark.sql.functions import min as spark_min

    # Mean Earth radius used by the haversine formula (kilometres).
    EARTH_RADIUS_KM = 6371.0

    query_start_time = time.time()

    spark = SparkSession.builder.getOrCreate()

    stations = spark.read.option("header", True).csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv"
    ).withColumnRenamed("DIVISION", "division") \
     .withColumnRenamed("X", "station_lon") \
     .withColumnRenamed("Y", "station_lat") \
     .withColumn("station_lon", col("station_lon").cast("double")) \
     .withColumn("station_lat", col("station_lat").cast("double")) \
     .filter(
         # A station without usable coordinates would yield a NULL distance,
         # and NULL sorts first inside the struct-based minimum below.
         col("station_lon").isNotNull() & col("station_lat").isNotNull()
     )

    crime_2010_2019 = spark.read.option("header", True).csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
    )
    crime_2020_2025 = spark.read.option("header", True).csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
    )
    combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

    crime_coords = combined_crime.select(
        col("DR_NO"),
        col("LAT").cast("double").alias("crime_lat"),
        col("LON").cast("double").alias("crime_lon")
    ).filter(
        # DR_NO is the grouping key for the nearest-station step; rows without
        # it cannot be attributed to a single crime.
        (col("DR_NO").isNotNull()) &
        (col("crime_lat").isNotNull()) &
        (col("crime_lon").isNotNull()) &
        # (0, 0) is treated as a "no location" placeholder, not a real point.
        ~((col("crime_lat") == 0) & (col("crime_lon") == 0))
    )

    def haversine_expr(lat1, lon1, lat2, lon2):
        """Return a Column with the great-circle distance between two points
        given as latitude/longitude Columns in degrees."""
        return 2 * EARTH_RADIUS_KM * asin(
            sqrt(
                sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
                cos(radians(lat1)) * cos(radians(lat2)) *
                sin((radians(lon2) - radians(lon1)) / 2) ** 2
            )
        )

    crime_station = crime_coords.crossJoin(stations) \
        .withColumn(
            "distance",
            haversine_expr(
                col("crime_lat"), col("crime_lon"),
                col("station_lat"), col("station_lon")
            )
        )

    # Pick the closest station in ONE aggregation. Structs are ordered field by
    # field, so min(struct(distance, division)) carries the division of the
    # smallest distance along with it. This replaces "min(distance) + self-join
    # on distance equality", which evaluated the cross join twice and counted a
    # crime once per station whenever two stations were exactly equidistant.
    nearest_per_crime = crime_station.groupBy("DR_NO") \
        .agg(
            spark_min(struct(col("distance"), col("division"))).alias("nearest")
        )

    crime_nearest = nearest_per_crime.select(
        col("nearest.division").alias("division"),
        col("nearest.distance").alias("distance")
    )

    division_stats = crime_nearest.groupBy("division") \
        .agg(
            spark_round(avg("distance"), 3).alias("average_distance"),
            count("*").alias("#")
        ).select("division", "average_distance", "#") \
         .orderBy(col("#").desc())

    query_end_time = time.time()
    print(f"Query 4 build time: {query_end_time - query_start_time:.2f} seconds")

    return division_stats

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
import time

# Build the Query 4 DataFrame; the work is triggered by the action below.
df1 = run_query_4()

# Time only the action that materialises and prints the result.
start_time = time.time()
df1.show(n=50, truncate=False)
end_time = time.time()
query_seconds = end_time - start_time
print(f"Query 4 query time: {query_seconds:.2f} seconds")

# Dump the logical and physical plans for inspection.
print("Plan:")
df1.explain(mode="extended")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…