In [7]:
%%configure -f
{
    "conf": {
        "spark.driver.memory": "2g",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1",
        "spark.executor.instances": "4"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
758,application_1761923966900_0770,pyspark,idle,Link,Link,,
761,application_1761923966900_0773,pyspark,idle,Link,Link,,
803,application_1761923966900_0815,pyspark,idle,Link,Link,,
814,application_1761923966900_0826,pyspark,idle,Link,Link,,
825,application_1761923966900_0837,pyspark,idle,Link,Link,,
837,application_1761923966900_0849,pyspark,idle,Link,Link,,
839,application_1761923966900_0851,pyspark,idle,Link,Link,,


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count
import time

# Benchmark cell: MO-code frequency table where the join strategy is left to
# Catalyst (no hints). Output: the extended plan, the frequency table sorted by
# descending count, and the elapsed time up to the end of collect().
spark = SparkSession.builder.getOrCreate()

# Clear Spark's in-memory cache before the timed section starts.
spark.catalog.clearCache()

# Common S3 prefix shared by every input file of this cell.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

start_time = time.time()

# Load datasets: both CSVs carry a header row; stack them by column name.
crime_2010_2019 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
crime_2020_2025 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)
combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

# Load MO codes: every text line is split at its first space into
# MO_Code (before the space) and MO_Desc (the remainder).
mo_df = (
    spark.read.text(f"{DATA_ROOT}/MO_codes.txt")
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# Explode multiple MO codes: "Mocodes" is a space-separated list, so each crime
# row becomes one row per code, with surrounding whitespace trimmed.
combined_crime_exploded = combined_crime.withColumn(
    "MO_Code",
    explode(split(col("Mocodes"), " "))
).withColumn("MO_Code", trim(col("MO_Code")))

# Join without hints
joined_df = combined_crime_exploded.join(
    mo_df,
    on="MO_Code",
    how="left"
)

# Summary: count rows per (code, description) and drop groups whose code or
# description is null/blank, i.e. codes that found no description in the join.
mo_summary = (
    joined_df
    .groupBy("MO_Code", "MO_Desc")
    .agg(count("*").alias("Frequency"))
    .filter(
        (col("MO_Code").isNotNull()) & (trim(col("MO_Code")) != "") &
        (col("MO_Desc").isNotNull()) & (trim(col("MO_Desc")) != "")
    )
    .orderBy(col("Frequency").desc())
)

print("=== Catalyst default join ===")
mo_summary.explain(mode="extended")

# Execute and collect. The clock is stopped as soon as collect() returns so
# that printing the table below is not part of the measured time (this matches
# how the RDD cells of this notebook take their end timestamp).
results = mo_summary.collect()
elapsed = time.time() - start_time

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

# The former fixed-size chunk loop printed every row without any separator,
# so a single pass over the collected rows yields the same output.
for row in results:
    print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<50} | {row['Frequency']:<10}")

print(f"Execution time: {elapsed:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== Catalyst default join ===
== Parsed Logical Plan ==
'Sort ['Frequency DESC NULLS LAST], true
+- Filter (((isnotnull(MO_Code#1092) AND NOT (trim(MO_Code#1092, None) = )) AND isnotnull(MO_Desc#1054)) AND NOT (trim(MO_Desc#1054, None) = ))
   +- Aggregate [MO_Code#1092, MO_Desc#1054], [MO_Code#1092, MO_Desc#1054, count(1) AS Frequency#1183L]
      +- Project [MO_Code#1092, DR_NO#885, Date Rptd#886, DATE OCC#887, TIME OCC#888, AREA#889, AREA NAME#890, Rpt Dist No#891, Part 1-2#892, Crm Cd#893, Crm Cd Desc#894, Mocodes#895, Vict Age#896, Vict Sex#897, Vict Descent#898, Premis Cd#899, Premis Desc#900, Weapon Used Cd#901, Weapon Desc#902, Status#903, Status Desc#904, Crm Cd 1#905, Crm Cd 2#906, Crm Cd 3#907, ... 6 more fields]
         +- Join LeftOuter, (MO_Code#1092 = MO_Code#1050)
            :- Project [DR_NO#885, Date Rptd#886, DATE OCC#887, TIME OCC#888, AREA#889, AREA NAME#890, Rpt Dist No#891, Part 1-2#892, Crm Cd#893, Crm Cd Desc#894, Mocodes#895, Vict Age#896, Vict Sex#897, Vict

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count, broadcast
import time

# Benchmark cell: MO-code frequency table with the MO lookup DataFrame wrapped
# in broadcast() for the join. Output: extended plan, paged frequency table,
# and the elapsed time up to the end of collect().
spark = SparkSession.builder.getOrCreate()
spark.catalog.clearCache()

# Common S3 prefix shared by every input file of this cell.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

start_time = time.time()

# ---- Load datasets ----
# Both CSVs carry a header row; stack them by column name.
crime_2010_2019 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
crime_2020_2025 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)
combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

# ---- Load MO codes ----
# Each text line is split at its first space into MO_Code / MO_Desc.
mo_df = (
    spark.read.text(f"{DATA_ROOT}/MO_codes.txt")
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# ---- Explode multiple MO codes ----
# "Mocodes" is a space-separated list: one output row per code, trimmed.
combined_crime_exploded = combined_crime.withColumn(
    "MO_Code",
    explode(split(col("Mocodes"), " "))
).withColumn("MO_Code", trim(col("MO_Code")))

# ---- Broadcast join ----
joined_df = combined_crime_exploded.join(
    broadcast(mo_df),
    on="MO_Code",
    how="left"
)

# ---- Summary ----
# Count rows per (code, description); groups with a null/blank code or
# description (codes without a match in mo_df) are dropped.
mo_summary = (
    joined_df
    .groupBy("MO_Code", "MO_Desc")
    .agg(count("*").alias("Frequency"))
    .filter(
        (col("MO_Code").isNotNull()) & (trim(col("MO_Code")) != "") &
        (col("MO_Desc").isNotNull()) & (trim(col("MO_Desc")) != "")
    )
    .orderBy(col("Frequency").desc())
)

print("=== Broadcast Join ===")
mo_summary.explain(mode="extended")

# ---- Execute ----
# Stop the clock as soon as collect() returns so that printing the table is
# not part of the measured time (the RDD cells of this notebook do the same).
results = mo_summary.collect()
elapsed = time.time() - start_time

# ---- Paged print ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)
for start_idx in range(0, total_rows, page_size):
    for row in results[start_idx:start_idx + page_size]:
        print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<50} | {row['Frequency']:<10}")
    # A separator follows every page except the last one.
    if start_idx + page_size < total_rows:
        print(f"-- Showing rows {start_idx+1}-{min(start_idx+page_size, total_rows)} of {total_rows} --\n")
print(f"Execution time: {elapsed:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== Broadcast Join ===
== Parsed Logical Plan ==
'Sort ['Frequency DESC NULLS LAST], true
+- Filter (((isnotnull(MO_Code#2438) AND NOT (trim(MO_Code#2438, None) = )) AND isnotnull(MO_Desc#2400)) AND NOT (trim(MO_Desc#2400, None) = ))
   +- Aggregate [MO_Code#2438, MO_Desc#2400], [MO_Code#2438, MO_Desc#2400, count(1) AS Frequency#2529L]
      +- Project [MO_Code#2438, DR_NO#2231, Date Rptd#2232, DATE OCC#2233, TIME OCC#2234, AREA#2235, AREA NAME#2236, Rpt Dist No#2237, Part 1-2#2238, Crm Cd#2239, Crm Cd Desc#2240, Mocodes#2241, Vict Age#2242, Vict Sex#2243, Vict Descent#2244, Premis Cd#2245, Premis Desc#2246, Weapon Used Cd#2247, Weapon Desc#2248, Status#2249, Status Desc#2250, Crm Cd 1#2251, Crm Cd 2#2252, Crm Cd 3#2253, ... 6 more fields]
         +- Join LeftOuter, (MO_Code#2438 = MO_Code#2396)
            :- Project [DR_NO#2231, Date Rptd#2232, DATE OCC#2233, TIME OCC#2234, AREA#2235, AREA NAME#2236, Rpt Dist No#2237, Part 1-2#2238, Crm Cd#2239, Crm Cd Desc#2240, Mocodes#2241, Vict 

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count
import time

# Benchmark cell: MO-code frequency table with a MERGE join hint on both join
# inputs. Output: extended plan, paged frequency table, and the elapsed time
# up to the end of collect().
spark = SparkSession.builder.getOrCreate()
spark.catalog.clearCache()

# Common S3 prefix shared by every input file of this cell.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

start_time = time.time()

# ---- Load datasets ----
# Both CSVs carry a header row; stack them by column name.
crime_2010_2019 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
crime_2020_2025 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)
combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

# ---- Load MO codes ----
# Each text line is split at its first space into MO_Code / MO_Desc.
mo_df = (
    spark.read.text(f"{DATA_ROOT}/MO_codes.txt")
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# ---- Explode multiple MO codes ----
# "Mocodes" is a space-separated list: one output row per code, trimmed.
combined_crime_exploded = combined_crime.withColumn(
    "MO_Code",
    explode(split(col("Mocodes"), " "))
).withColumn("MO_Code", trim(col("MO_Code")))

# ---- Merge join hint ----
joined_df = combined_crime_exploded.hint("MERGE").join(
    mo_df.hint("MERGE"),
    on="MO_Code",
    how="left"
)

# ---- Summary ----
# Count rows per (code, description); groups with a null/blank code or
# description (codes without a match in mo_df) are dropped.
mo_summary = (
    joined_df
    .groupBy("MO_Code", "MO_Desc")
    .agg(count("*").alias("Frequency"))
    .filter(
        (col("MO_Code").isNotNull()) & (trim(col("MO_Code")) != "") &
        (col("MO_Desc").isNotNull()) & (trim(col("MO_Desc")) != "")
    )
    .orderBy(col("Frequency").desc())
)

print("=== Merge Join Hint ===")
mo_summary.explain(mode="extended")

# ---- Execute ----
# Stop the clock as soon as collect() returns so that printing the table is
# not part of the measured time (the RDD cells of this notebook do the same).
results = mo_summary.collect()
elapsed = time.time() - start_time

# ---- Paged print ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)
for start_idx in range(0, total_rows, page_size):
    for row in results[start_idx:start_idx + page_size]:
        print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<50} | {row['Frequency']:<10}")
    # A separator follows every page except the last one.
    if start_idx + page_size < total_rows:
        print(f"-- Showing rows {start_idx+1}-{min(start_idx+page_size, total_rows)} of {total_rows} --\n")
print(f"Execution time: {elapsed:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== Merge Join Hint ===
== Parsed Logical Plan ==
'Sort ['Frequency DESC NULLS LAST], true
+- Filter (((isnotnull(MO_Code#2764) AND NOT (trim(MO_Code#2764, None) = )) AND isnotnull(MO_Desc#2726)) AND NOT (trim(MO_Desc#2726, None) = ))
   +- Aggregate [MO_Code#2764, MO_Desc#2726], [MO_Code#2764, MO_Desc#2726, count(1) AS Frequency#2855L]
      +- Project [MO_Code#2764, DR_NO#2557, Date Rptd#2558, DATE OCC#2559, TIME OCC#2560, AREA#2561, AREA NAME#2562, Rpt Dist No#2563, Part 1-2#2564, Crm Cd#2565, Crm Cd Desc#2566, Mocodes#2567, Vict Age#2568, Vict Sex#2569, Vict Descent#2570, Premis Cd#2571, Premis Desc#2572, Weapon Used Cd#2573, Weapon Desc#2574, Status#2575, Status Desc#2576, Crm Cd 1#2577, Crm Cd 2#2578, Crm Cd 3#2579, ... 6 more fields]
         +- Join LeftOuter, (MO_Code#2764 = MO_Code#2722)
            :- ResolvedHint (strategy=merge)
            :  +- Project [DR_NO#2557, Date Rptd#2558, DATE OCC#2559, TIME OCC#2560, AREA#2561, AREA NAME#2562, Rpt Dist No#2563, Part 1-2#2564, C

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count
import time

# Benchmark cell: MO-code frequency table with a SHUFFLE_HASH join hint on both
# join inputs. Output: extended plan, paged frequency table, and the elapsed
# time up to the end of collect().
spark = SparkSession.builder.getOrCreate()
spark.catalog.clearCache()

# Common S3 prefix shared by every input file of this cell.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

start_time = time.time()

# ---- Load datasets ----
# Both CSVs carry a header row; stack them by column name.
crime_2010_2019 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
crime_2020_2025 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)
combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

# ---- Load MO codes ----
# Each text line is split at its first space into MO_Code / MO_Desc.
mo_df = (
    spark.read.text(f"{DATA_ROOT}/MO_codes.txt")
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# ---- Explode multiple MO codes ----
# "Mocodes" is a space-separated list: one output row per code, trimmed.
combined_crime_exploded = combined_crime.withColumn(
    "MO_Code",
    explode(split(col("Mocodes"), " "))
).withColumn("MO_Code", trim(col("MO_Code")))

# ---- Shuffle hash join hint ----
joined_df = combined_crime_exploded.hint("SHUFFLE_HASH").join(
    mo_df.hint("SHUFFLE_HASH"),
    on="MO_Code",
    how="left"
)

# ---- Summary ----
# Count rows per (code, description); groups with a null/blank code or
# description (codes without a match in mo_df) are dropped.
mo_summary = (
    joined_df
    .groupBy("MO_Code", "MO_Desc")
    .agg(count("*").alias("Frequency"))
    .filter(
        (col("MO_Code").isNotNull()) & (trim(col("MO_Code")) != "") &
        (col("MO_Desc").isNotNull()) & (trim(col("MO_Desc")) != "")
    )
    .orderBy(col("Frequency").desc())
)

print("=== Shuffle Hash Join ===")
mo_summary.explain(mode="extended")

# ---- Execute ----
# Stop the clock as soon as collect() returns so that printing the table is
# not part of the measured time (the RDD cells of this notebook do the same).
results = mo_summary.collect()
elapsed = time.time() - start_time

# ---- Paged print ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)
for start_idx in range(0, total_rows, page_size):
    for row in results[start_idx:start_idx + page_size]:
        print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<50} | {row['Frequency']:<10}")
    # A separator follows every page except the last one.
    if start_idx + page_size < total_rows:
        print(f"-- Showing rows {start_idx+1}-{min(start_idx+page_size, total_rows)} of {total_rows} --\n")
print(f"Execution time: {elapsed:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== Shuffle Hash Join ===
== Parsed Logical Plan ==
'Sort ['Frequency DESC NULLS LAST], true
+- Filter (((isnotnull(MO_Code#3090) AND NOT (trim(MO_Code#3090, None) = )) AND isnotnull(MO_Desc#3052)) AND NOT (trim(MO_Desc#3052, None) = ))
   +- Aggregate [MO_Code#3090, MO_Desc#3052], [MO_Code#3090, MO_Desc#3052, count(1) AS Frequency#3181L]
      +- Project [MO_Code#3090, DR_NO#2883, Date Rptd#2884, DATE OCC#2885, TIME OCC#2886, AREA#2887, AREA NAME#2888, Rpt Dist No#2889, Part 1-2#2890, Crm Cd#2891, Crm Cd Desc#2892, Mocodes#2893, Vict Age#2894, Vict Sex#2895, Vict Descent#2896, Premis Cd#2897, Premis Desc#2898, Weapon Used Cd#2899, Weapon Desc#2900, Status#2901, Status Desc#2902, Crm Cd 1#2903, Crm Cd 2#2904, Crm Cd 3#2905, ... 6 more fields]
         +- Join LeftOuter, (MO_Code#3090 = MO_Code#3048)
            :- ResolvedHint (strategy=shuffle_hash)
            :  +- Project [DR_NO#2883, Date Rptd#2884, DATE OCC#2885, TIME OCC#2886, AREA#2887, AREA NAME#2888, Rpt Dist No#2889, Part 1-

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count
import time

# Benchmark cell: MO-code frequency table with a SHUFFLE_REPLICATE_NL join hint
# on both join inputs. Output: extended plan, paged frequency table, and the
# elapsed time up to the end of collect().
spark = SparkSession.builder.getOrCreate()
spark.catalog.clearCache()

# Common S3 prefix shared by every input file of this cell.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

start_time = time.time()

# ---- Load datasets ----
# Both CSVs carry a header row; stack them by column name.
crime_2010_2019 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
crime_2020_2025 = spark.read.option("header", "true").csv(
    f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)
combined_crime = crime_2010_2019.unionByName(crime_2020_2025)

# ---- Load MO codes ----
# Each text line is split at its first space into MO_Code / MO_Desc.
mo_df = (
    spark.read.text(f"{DATA_ROOT}/MO_codes.txt")
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# ---- Explode multiple MO codes ----
# "Mocodes" is a space-separated list: one output row per code, trimmed.
combined_crime_exploded = combined_crime.withColumn(
    "MO_Code",
    explode(split(col("Mocodes"), " "))
).withColumn("MO_Code", trim(col("MO_Code")))

# ---- Shuffle Replicate NL join hint ----
# NOTE(review): shuffle-and-replicate nested loop is, as far as I know, only
# selectable for inner-like joins. Confirm in the physical plan printed below
# that the hint is actually honored for this left join.
joined_df = combined_crime_exploded.hint("SHUFFLE_REPLICATE_NL").join(
    mo_df.hint("SHUFFLE_REPLICATE_NL"),
    on="MO_Code",
    how="left"
)

# ---- Summary ----
# Count rows per (code, description); groups with a null/blank code or
# description (codes without a match in mo_df) are dropped.
mo_summary = (
    joined_df
    .groupBy("MO_Code", "MO_Desc")
    .agg(count("*").alias("Frequency"))
    .filter(
        (col("MO_Code").isNotNull()) & (trim(col("MO_Code")) != "") &
        (col("MO_Desc").isNotNull()) & (trim(col("MO_Desc")) != "")
    )
    .orderBy(col("Frequency").desc())
)

print("=== Shuffle Replicate NL Join ===")
mo_summary.explain(mode="extended")

# ---- Execute ----
# Stop the clock as soon as collect() returns so that printing the table is
# not part of the measured time (the RDD cells of this notebook do the same).
results = mo_summary.collect()
elapsed = time.time() - start_time

# ---- Paged print ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)
for start_idx in range(0, total_rows, page_size):
    for row in results[start_idx:start_idx + page_size]:
        print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<50} | {row['Frequency']:<10}")
    # A separator follows every page except the last one.
    if start_idx + page_size < total_rows:
        print(f"-- Showing rows {start_idx+1}-{min(start_idx+page_size, total_rows)} of {total_rows} --\n")
print(f"Execution time: {elapsed:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== Shuffle Replicate NL Join ===
== Parsed Logical Plan ==
'Sort ['Frequency DESC NULLS LAST], true
+- Filter (((isnotnull(MO_Code#3416) AND NOT (trim(MO_Code#3416, None) = )) AND isnotnull(MO_Desc#3378)) AND NOT (trim(MO_Desc#3378, None) = ))
   +- Aggregate [MO_Code#3416, MO_Desc#3378], [MO_Code#3416, MO_Desc#3378, count(1) AS Frequency#3507L]
      +- Project [MO_Code#3416, DR_NO#3209, Date Rptd#3210, DATE OCC#3211, TIME OCC#3212, AREA#3213, AREA NAME#3214, Rpt Dist No#3215, Part 1-2#3216, Crm Cd#3217, Crm Cd Desc#3218, Mocodes#3219, Vict Age#3220, Vict Sex#3221, Vict Descent#3222, Premis Cd#3223, Premis Desc#3224, Weapon Used Cd#3225, Weapon Desc#3226, Status#3227, Status Desc#3228, Crm Cd 1#3229, Crm Cd 2#3230, Crm Cd 3#3231, ... 6 more fields]
         +- Join LeftOuter, (MO_Code#3416 = MO_Code#3374)
            :- ResolvedHint (strategy=shuffle_replicate_nl)
            :  +- Project [DR_NO#3209, Date Rptd#3210, DATE OCC#3211, TIME OCC#3212, AREA#3213, AREA NAME#3214, Rpt Dist 

In [13]:
from pyspark.sql import SparkSession
import time

# Benchmark cell (RDD API): MO-code frequency table built with
# RDD.leftOuterJoin. Output: frequency table sorted by descending count and the
# elapsed time up to the end of collect().
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

spark.catalog.clearCache()

# Common S3 prefix shared by every input file of this cell.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

# ============================================
# Load CSV files as RDDs
# ============================================
start_time = time.time()
crime_2010_2019_rdd = (
    spark.read.option("header", "true")
        .csv(f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
        .rdd
)

crime_2020_2025_rdd = (
    spark.read.option("header", "true")
        .csv(f"{DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
        .rdd
)

# Concatenate both periods and turn every Row into a plain dict.
combined_rdd = (
    crime_2010_2019_rdd
    .union(crime_2020_2025_rdd)
    .map(lambda row: row.asDict())
)

# ============================================
# Load MO codes (TXT file)
# ============================================

# Each line is split at its first space; lines without both a non-blank code
# and a non-blank description are skipped. Result: (code, description) pairs.
mo_rdd = (
    sc.textFile(f"{DATA_ROOT}/MO_codes.txt")
        .map(lambda line: line.strip().split(" ", 1))
        .filter(lambda parts: len(parts) == 2 and parts[0].strip() != "" and parts[1].strip() != "")
        .map(lambda parts: (parts[0].strip(), parts[1].strip()))
)

# ============================================
# Explode Mocodes (RDD version)
# ============================================

# One (code, row) pair per non-blank token of the space-separated "Mocodes"
# field; a missing field yields no pairs. The row dict rides along as the
# value, although only the key and the joined description are read afterwards.
exploded_rdd = (
    combined_rdd
    .flatMap(lambda row: [
        (m.strip(), row) for m in (row.get("Mocodes") or "").split(" ")
        if m.strip() != ""
    ])
)

# ============================================
# Join with MO descriptions using RDD join()
# ============================================

# Elements become (code, (row, description_or_None)).
joined_rdd = exploded_rdd.leftOuterJoin(mo_rdd)

# Filter out unknown MO_Code or MO_Desc
filtered_rdd = joined_rdd.filter(
    lambda x: x[0] is not None and x[0].strip() != ""
              and x[1][1] is not None and x[1][1].strip() != ""
)

# Map to ((MO_Code, MO_Desc), 1) and reduce
summary_rdd = (
    filtered_rdd
    .map(lambda x: ((x[0], x[1][1]), 1))
    .reduceByKey(lambda a, b: a + b)
)

# Sort by frequency descending
sorted_rdd = summary_rdd.sortBy(lambda x: -x[1])

# Stop the clock as soon as collect() returns so that printing the table is
# not part of the measured time (the other RDD cells of this notebook take
# their end timestamp at the same point).
results = sorted_rdd.collect()
elapsed = time.time() - start_time

# ============================================
# Print the table
# ============================================

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

# The former fixed-size chunk loop printed every row without any separator,
# so a single pass over the collected pairs yields the same output.
for ((mo_code, desc), freq) in results:
    print(f"{mo_code:<10} | {desc:<50} | {freq:<10}")

print(f"Execution time: {elapsed:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code    | Description                                        | Frequency 
--------------------------------------------------------------------------------
0344       | Removes vict property                              | 1002900   
1822       | Stranger                                           | 548422    
0416       | Hit-Hit w/ weapon                                  | 404773    
0329       | Vandalized                                         | 377536    
0913       | Victim knew Suspect                                | 278618    
2000       | Domestic violence                                  | 256188    
1300       | Vehicle involved                                   | 219082    
0400       | Force used                                         | 213165    
1402       | Evidence Booked (any crime)                        | 177470    
1609       | Smashed                                            | 131229    
1309       | Susp uses vehicle                                  | 122108

In [14]:
from pyspark.sql import SparkSession
import time

# Benchmark cell (RDD API): MO-code frequency table where the code lookup is a
# driver-side dict shipped to the executors as a broadcast variable.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
start = time.time()

# ---- Crime data: read each CSV (with header) and continue on its RDD ----
crime_csv_paths = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
)
crime_2010_2019_rdd, crime_2020_2025_rdd = [
    spark.read.option("header", "true").csv(path).rdd
    for path in crime_csv_paths
]
# Concatenate both periods and turn every Row into a plain dict.
combined_rdd = (
    crime_2010_2019_rdd
    .union(crime_2020_2025_rdd)
    .map(lambda record: record.asDict())
)

# ---- MO codes: (code, description) pairs, split at the first space ----
mo_lines = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt")
mo_parts = mo_lines.map(lambda text: text.strip().split(" ", 1))
# Keep only lines that produced both a non-blank code and a non-blank description.
mo_valid = mo_parts.filter(
    lambda fields: len(fields) == 2 and fields[0].strip() != "" and fields[1].strip() != ""
)
mo_rdd = mo_valid.map(lambda fields: (fields[0].strip(), fields[1].strip()))

# ---- Broadcast the lookup table ----
mo_dict = dict(mo_rdd.collect())
bc_mo = sc.broadcast(mo_dict)

# ---- Explode the space-separated "Mocodes" field into (code, row) pairs ----
exploded = combined_rdd.flatMap(
    lambda record: [
        (token.strip(), record)
        for token in (record.get("Mocodes") or "").split(" ")
        if token.strip() != ""
    ]
)

# ---- Map-side join: look each code up in the broadcast dict ----
looked_up = exploded.map(lambda pair: (pair[0], bc_mo.value.get(pair[0])))
# Codes missing from the dict come back as None and are discarded.
joined = looked_up.filter(lambda pair: pair[1] is not None)

# ---- Frequency per (code, description), highest first ----
counted = joined.map(lambda pair: (pair, 1)).reduceByKey(lambda left, right: left + right)
summary = counted.sortBy(lambda item: -item[1])
results = summary.collect()
end_time = time.time()

# ---- Print the table, walking the result in slices of page_size ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

for offset in range(0, total_rows, page_size):
    for (code, desc), freq in results[offset:offset + page_size]:
        print(f"{code:<10} | {desc:<50} | {freq}")

print(f"Execution time: {end_time - start:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code    | Description                                        | Frequency 
--------------------------------------------------------------------------------
0344       | Removes vict property                              | 1002900
1822       | Stranger                                           | 548422
0416       | Hit-Hit w/ weapon                                  | 404773
0329       | Vandalized                                         | 377536
0913       | Victim knew Suspect                                | 278618
2000       | Domestic violence                                  | 256188
1300       | Vehicle involved                                   | 219082
0400       | Force used                                         | 213165
1402       | Evidence Booked (any crime)                        | 177470
1609       | Smashed                                            | 131229
1309       | Susp uses vehicle                                  | 122108
1202       | Victim was aged (60 & ove

In [15]:
from pyspark.sql import SparkSession
import time

# Benchmark cell (RDD API): MO-code frequency table where both inputs are
# sorted by code before being joined with RDD.join.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
start = time.time()

# ---- Crime data: read each CSV (with header) and continue on its RDD ----
crime_csv_paths = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
)
crime_2010_2019_rdd, crime_2020_2025_rdd = [
    spark.read.option("header", "true").csv(path).rdd
    for path in crime_csv_paths
]
# Concatenate both periods and turn every Row into a plain dict.
combined_rdd = (
    crime_2010_2019_rdd
    .union(crime_2020_2025_rdd)
    .map(lambda record: record.asDict())
)


def _mo_pairs(line):
    """Yield one (code, description) pair for a well-formed line, else nothing."""
    parts = line.strip().split(" ", 1)
    if len(parts) == 2 and parts[0].strip() != "" and parts[1].strip() != "":
        yield (parts[0].strip(), parts[1].strip())


def _code_row_pairs(record):
    """Yield (code, record) for every non-blank token of the "Mocodes" field."""
    for token in (record.get("Mocodes") or "").split(" "):
        if token.strip() != "":
            yield (token.strip(), record)


# ---- MO codes ----
mo_rdd = sc.textFile(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
).flatMap(_mo_pairs)

# ---- Explode Mocodes ----
exploded = combined_rdd.flatMap(_code_row_pairs)

# ---- Order both sides by MO code ----
# NOTE(review): RDD.join is not told about this ordering; as far as I can tell
# it groups by key with its own partitioning, so these two sorts may only add
# shuffles rather than turn the join into a merge join. Confirm before
# presenting this cell as a sort-merge equivalent.
mo_sorted = mo_rdd.sortBy(lambda pair: pair[0])
exploded_sorted = exploded.sortBy(lambda pair: pair[0])

# ---- Inner join on the code: elements are (code, (record, description)) ----
joined = exploded_sorted.join(mo_sorted)

# ---- Frequency per (code, description), highest first ----
keyed_ones = joined.map(lambda item: ((item[0], item[1][1]), 1))
summary = keyed_ones.reduceByKey(lambda left, right: left + right)
results = summary.sortBy(lambda item: -item[1]).collect()
end_time = time.time()

# ---- Print the table, walking the result in slices of page_size ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

for offset in range(0, total_rows, page_size):
    for (code, desc), freq in results[offset:offset + page_size]:
        print(f"{code:<10} | {desc:<50} | {freq}")

print(f"Execution time: {end_time - start:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code    | Description                                        | Frequency 
--------------------------------------------------------------------------------
0344       | Removes vict property                              | 1002900
1822       | Stranger                                           | 548422
0416       | Hit-Hit w/ weapon                                  | 404773
0329       | Vandalized                                         | 377536
0913       | Victim knew Suspect                                | 278618
2000       | Domestic violence                                  | 256188
1300       | Vehicle involved                                   | 219082
0400       | Force used                                         | 213165
1402       | Evidence Booked (any crime)                        | 177470
1609       | Smashed                                            | 131229
1309       | Susp uses vehicle                                  | 122108
1202       | Victim was aged (60 & ove

In [16]:
from pyspark.sql import SparkSession
import time

# Benchmark cell (RDD API): MO-code frequency table where both inputs are
# explicitly partitioned by key into 200 partitions before RDD.join.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
start = time.time()

# ---- Crime data: read each CSV (with header) and continue on its RDD ----
crime_csv_paths = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
)
crime_2010_2019_rdd, crime_2020_2025_rdd = [
    spark.read.option("header", "true").csv(path).rdd
    for path in crime_csv_paths
]
# Concatenate both periods and turn every Row into a plain dict.
combined_rdd = (
    crime_2010_2019_rdd
    .union(crime_2020_2025_rdd)
    .map(lambda record: record.asDict())
)

# ---- MO codes: (code, description) pairs, split at the first space ----
mo_lines = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt")
mo_parts = mo_lines.map(lambda text: text.strip().split(" ", 1))
# Keep only lines that produced both a non-blank code and a non-blank description.
mo_valid = mo_parts.filter(
    lambda fields: len(fields) == 2 and fields[0].strip() != "" and fields[1].strip() != ""
)
mo_rdd = mo_valid.map(lambda fields: (fields[0].strip(), fields[1].strip()))

# ---- Explode the space-separated "Mocodes" field into (code, row) pairs ----
exploded = combined_rdd.flatMap(
    lambda record: [
        (token.strip(), record)
        for token in (record.get("Mocodes") or "").split(" ")
        if token.strip() != ""
    ]
)

# ---- Partition both sides by key, same partition count on each ----
mo_partitioned = mo_rdd.partitionBy(200)
exploded_partitioned = exploded.partitionBy(200)

# ---- Inner join on the code: elements are (code, (record, description)) ----
joined = exploded_partitioned.join(mo_partitioned)

# ---- Frequency per (code, description), highest first ----
keyed_ones = joined.map(lambda item: ((item[0], item[1][1]), 1))
summary = keyed_ones.reduceByKey(lambda left, right: left + right)
results = summary.sortBy(lambda item: -item[1]).collect()
end_time = time.time()

# ---- Print the table, walking the result in slices of page_size ----
page_size = 50
total_rows = len(results)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

for offset in range(0, total_rows, page_size):
    for (code, desc), freq in results[offset:offset + page_size]:
        print(f"{code:<10} | {desc:<50} | {freq}")

print(f"Execution time: {end_time - start:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code    | Description                                        | Frequency 
--------------------------------------------------------------------------------
0344       | Removes vict property                              | 1002900
1822       | Stranger                                           | 548422
0416       | Hit-Hit w/ weapon                                  | 404773
0329       | Vandalized                                         | 377536
0913       | Victim knew Suspect                                | 278618
2000       | Domestic violence                                  | 256188
1300       | Vehicle involved                                   | 219082
0400       | Force used                                         | 213165
1402       | Evidence Booked (any crime)                        | 177470
1609       | Smashed                                            | 131229
1309       | Susp uses vehicle                                  | 122108
1202       | Victim was aged (60 & ove

In [17]:
from pyspark.sql import SparkSession
import time

# Benchmark cell (RDD API): MO-code frequency table computed as a nested-loop
# join. The full MO code list is collected on the driver, captured by the
# flatMap closure, and every code token of every crime row is compared against
# every list entry.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
start = time.time()
# ---- Load Crime Data as RDD ----
crime_2010_2019_rdd = (
    spark.read.option("header", "true")
        .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
        .rdd
)
crime_2020_2025_rdd = (
    spark.read.option("header", "true")
        .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
        .rdd
)
# Concatenate both periods and turn every Row into a plain dict.
combined_rdd = crime_2010_2019_rdd.union(crime_2020_2025_rdd)
combined_rdd = combined_rdd.map(lambda row: row.asDict())

# ---- Load MO codes and collect for replicate ----
# Each line is split at its first space; lines without both a non-blank code
# and a non-blank description are skipped. Result: list of (code, description).
mo_list = (
    sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt")
        .map(lambda line: line.strip().split(" ", 1))
        .filter(lambda parts: len(parts) == 2 and parts[0].strip() != "" and parts[1].strip() != "")
        .map(lambda parts: (parts[0].strip(), parts[1].strip()))
        .collect()
)


def _row_codes(row):
    """Return the non-blank, stripped tokens of the row's "Mocodes" field."""
    return [m.strip() for m in (row.get("Mocodes") or "").split(" ") if m.strip() != ""]


# ---- Explode Mocodes and replicate ----
# Nested loop with an equality test per (token, code) pair. The previous
# `code in row["Mocodes"]` check was a substring test on the raw string: it
# counted a code at most once per row even when the row listed it twice, and
# it could match text spanning or embedded in other tokens. Comparing whole
# tokens gives the same "one match per exploded code" semantics as the other
# join cells in this notebook.
results_rdd = combined_rdd.flatMap(
    lambda row: [
        (code, desc)
        for token in _row_codes(row)
        for (code, desc) in mo_list
        if code == token
    ]
)

# ---- Count frequencies ----
summary = results_rdd.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
final = summary.sortBy(lambda x: -x[1]).collect()
end_time = time.time()

# ---- Print the table, walking the result in slices of page_size ----
page_size = 50
total_rows = len(final)

print(f"{'MO Code':<10} | {'Description':<50} | {'Frequency':<10}")
print("-" * 80)

for start_idx in range(0, total_rows, page_size):
    chunk = final[start_idx:start_idx + page_size]
    for (key, freq) in chunk:
        print(f"{key[0]:<10} | {key[1]:<50} | {freq}")

print(f"Execution time: {end_time - start:.2f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code    | Description                                        | Frequency 
--------------------------------------------------------------------------------
0344       | Removes vict property                              | 1002900
1822       | Stranger                                           | 548422
0416       | Hit-Hit w/ weapon                                  | 404773
0329       | Vandalized                                         | 377536
0913       | Victim knew Suspect                                | 278618
2000       | Domestic violence                                  | 256188
1300       | Vehicle involved                                   | 219082
0400       | Force used                                         | 213165
1402       | Evidence Booked (any crime)                        | 177470
1609       | Smashed                                            | 131229
1309       | Susp uses vehicle                                  | 122108
1202       | Victim was aged (60 & ove