In [1]:
%%configure -f
{
  "conf": {
    "spark.executor.instances": "4",
    "spark.executor.cores": "1",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "2g"
  }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
17,application_1765289937462_0018,pyspark,busy,Link,Link,,
20,application_1765289937462_0021,pyspark,idle,Link,Link,,
22,application_1765289937462_0023,pyspark,idle,Link,Link,,
23,application_1765289937462_0024,pyspark,idle,Link,Link,,
24,application_1765289937462_0025,pyspark,idle,Link,Link,,
31,application_1765289937462_0032,pyspark,idle,Link,Link,,
40,application_1765289937462_0041,pyspark,idle,Link,Link,,
42,application_1765289937462_0043,pyspark,idle,Link,Link,,
43,application_1765289937462_0044,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, trim, count, udf
from pyspark.sql.types import BooleanType
import time

# Wall-clock start; the elapsed time is printed after the results table.
start_time = time.time()

spark = SparkSession.builder.getOrCreate()

# 1. Force Cartesian / Disable Optimizations
# Applied in this order: no sort-merge preference, automatic broadcast joins
# switched off (-1), cross joins allowed.
join_settings = {
    "spark.sql.join.preferSortMergeJoin": "false",
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.sql.crossJoin.enabled": "true",
}
for setting_name, setting_value in join_settings.items():
    spark.conf.set(setting_name, setting_value)

# Input locations: two CSV extracts of the crime data (read with a header row)
# and a plain-text file of MO codes.
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
crime_2020_2025_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
mo_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"

crime_2010_2019 = spark.read.option("header", "true").csv(crime_2010_2019_path)
crime_2020_2025 = spark.read.option("header", "true").csv(crime_2020_2025_path)

# --- SAMPLING ADDED HERE ---
# Keep only 0.1% of the combined rows (fixed seed) so the Cartesian join stays small.
all_crime = crime_2010_2019.unionByName(crime_2020_2025)
combined_crime = all_crime.sample(withReplacement=False, fraction=0.001, seed=42)

# Each text line is split at the first space into at most two pieces:
# the code (MO_Code_Right) and the remainder of the line (MO_Desc).
mo_lines = spark.read.text(mo_codes_path)
mo_df = (
    mo_lines
    .withColumn("split_cols", split(col("value"), " ", 2))
    .withColumn("MO_Code_Right", col("split_cols").getItem(0))
    .withColumn("MO_Desc", col("split_cols").getItem(1))
    .drop("value", "split_cols")
)

# One output row per space-separated token of "Mocodes", whitespace-trimmed.
mo_tokens = explode(split(col("Mocodes"), " "))
combined_crime_exploded = (
    combined_crime
    .withColumn("MO_Code", mo_tokens)
    .withColumn("MO_Code", trim(col("MO_Code")))
)

# The "Blind" UDF to force the join
@udf(returnType=BooleanType())
def blind_equality(x, y):
    """Return True only when both values are non-null and equal.

    The comparison is deliberately wrapped in a Python UDF so that it is used
    as a post-join filter instead of a plain column equality.

    Args:
        x: Value from the left side of the join (None when the column is NULL).
        y: Value from the right side of the join (None when the column is NULL).

    Returns:
        bool: False if either value is None, otherwise ``x == y``.
    """
    # Plain Python equality would treat None == None as a match; SQL equality
    # never matches NULL keys, so reject them explicitly.
    if x is None or y is None:
        return False
    return x == y

# Pair every exploded crime row with every MO-code row, then keep only the
# pairs that the Python UDF reports as matching.
codes_match = blind_equality(col("MO_Code"), col("MO_Code_Right"))
joined_df_replicate = combined_crime_exploded.crossJoin(mo_df).filter(codes_match)

# Groups are kept only when both the code and the description are present and
# non-blank. The predicates are AND-ed left to right in the same order as before.
code_col = col("MO_Code")
desc_col = col("MO_Desc")
keep_group = (
    code_col.isNotNull()
    & (trim(code_col) != "")
    & desc_col.isNotNull()
    & (trim(desc_col) != "")
)

# Count rows per (code, description), filter, and sort by descending frequency.
frequency_by_code = joined_df_replicate.groupBy("MO_Code", "MO_Desc").agg(
    count("*").alias("Frequency")
)
mo_summary_replicate = frequency_by_code.filter(keep_group).orderBy(
    col("Frequency").desc()
)

# Triggers the Spark job and brings all summary rows to the driver.
results_replicate = mo_summary_replicate.collect()

# Print results
# NOTE(review): page_size and total_rows are no longer needed for printing
# (the old chunked loop printed every row without pausing); they are kept in
# case other cells read them -- confirm and remove if unused.
page_size = 50
total_rows = len(results_replicate)

# Description column width: at least 50 characters (the original layout),
# widened to the longest description so long entries no longer push the
# Frequency column out of alignment.
desc_width = max([50] + [len(row["MO_Desc"]) for row in results_replicate])

print(f"{'MO Code':<10} | {'Description':<{desc_width}} | {'Frequency':<10}")
# 80 dashes when desc_width is 50, growing with the description column.
print("-" * (desc_width + 30))

for row in results_replicate:
    print(f"{row['MO_Code']:<10} | {row['MO_Desc']:<{desc_width}} | {row['Frequency']:<10}")

# Elapsed time is measured from start_time and therefore also covers session
# setup, reading the inputs and printing the table above.
end_time = time.time()

print(f"\nShuffle Replicate NL Join Execution Time: {end_time - start_time:.2f} sec\n")

# CHECK THIS OUTPUT FOR 'CartesianProduct'
mo_summary_replicate.explain(mode="extended")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
45,application_1765289937462_0046,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code    | Description                                        | Frequency 
--------------------------------------------------------------------------------
0344       | Removes vict property                              | 1045      
1822       | Stranger                                           | 574       
0416       | Hit-Hit w/ weapon                                  | 397       
0329       | Vandalized                                         | 383       
0913       | Victim knew Suspect                                | 282       
2000       | Domestic violence                                  | 220       
0400       | Force used                                         | 209       
1300       | Vehicle involved                                   | 204       
1402       | Evidence Booked (any crime)                        | 184       
1202       | Victim was aged (60 & over) or blind/physically disabled/unable to care for self | 146       
0444       | Pushed                       