In [1]:
%%configure -f
{
  "conf": {
    "spark.executor.instances": "4",
    "spark.executor.cores": "1",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "2g"
  }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
383,application_1764662801237_0385,pyspark,idle,Link,Link,,
393,application_1764662801237_0395,pyspark,idle,Link,Link,,
405,application_1764662801237_0407,pyspark,idle,Link,Link,,
412,application_1764662801237_0414,pyspark,idle,Link,Link,,
416,application_1764662801237_0418,pyspark,idle,Link,Link,,
447,,pyspark,starting,,,,


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count
import time

# ---------------------------------------------------------------------------
# MO-code frequency ranking over the LA crime data.
#
# Stages: load both crime CSVs and the MO-code lookup file, explode the
# space-separated "Mocodes" column into one row per code, join each code to
# its description, count occurrences per (code, description), print the
# ranking, then print the extended execution plan of the query.
# ---------------------------------------------------------------------------

# Wall-clock start; the elapsed time reported below covers everything from
# here through the printing of the result table.
start_time = time.time()

spark = SparkSession.builder.getOrCreate()

# All three inputs live under the same S3 prefix; keep it in one place so the
# paths cannot drift apart.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

# --- Load -------------------------------------------------------------------
# No schema is supplied and inferSchema is not enabled, so the CSV columns are
# read with the header row as column names and default types.
crime_2010_2019 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
crime_2020_2025 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)
# unionByName matches columns by name rather than by position.
combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

# The lookup file is read as raw text lines. Splitting on the first space only
# (limit=2) yields the code as element 0 and the rest of the line, spaces
# included, as the description in element 1.
mo_df = (
    spark.read.text(f"{DATA_ROOT}/MO_codes.txt")
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# --- Transform --------------------------------------------------------------
# "Mocodes" holds several codes separated by spaces; explode emits one output
# row per code, and trim removes any surrounding whitespace from each token.
combined_crime_exploded = combined_crime.withColumn(
    "MO_Code",
    explode(split(col("Mocodes"), " "))
).withColumn("MO_Code", trim(col("MO_Code")))

# No join hint is given, so the physical join strategy is left to Spark.
joined_df = combined_crime_exploded.join(
    mo_df,
    on="MO_Code",
    how="left"
)

# --- Aggregate --------------------------------------------------------------
# The filter discards blank/NULL codes and blank/NULL descriptions. Because
# the join above is a left join, codes with no match in the lookup file carry
# a NULL MO_Desc and are therefore excluded from the ranking here.
mo_summary = (
    joined_df
    .groupBy("MO_Code", "MO_Desc")
    .agg(count("*").alias("Frequency"))
    .filter(
        (col("MO_Code").isNotNull()) & (trim(col("MO_Code")) != "") &
        (col("MO_Desc").isNotNull()) & (trim(col("MO_Desc")) != "")
    )
    .orderBy(col("Frequency").desc())
)

# collect() is the only action in this cell: it runs the whole pipeline and
# returns every summary row to the driver as a list of Row objects.
results = mo_summary.collect()

# --- Report -----------------------------------------------------------------
print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

# Every row is printed in ranking order. (The earlier 50-row chunking printed
# exactly the same lines, so a single pass over the list is equivalent.)
for row in results:
    print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<50} | {row['Frequency']:<10}")

end_time = time.time()

print(f"\nExecution time: {end_time - start_time:.2f} sec\n")

# explain() prints the query plans for mo_summary; it is called after the
# timing so it does not contribute to the reported execution time.
print("\n=== Catalyst default join (Execution Plan) ===")
mo_summary.explain(mode="extended")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
448,application_1764662801237_0450,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…