In [1]:
%%configure -f
{
  "conf": {
    "spark.executor.instances": "4",
    "spark.executor.cores": "1",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "8"
  }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
772,application_1765289937462_0765,pyspark,idle,Link,Link,,
774,application_1765289937462_0767,pyspark,idle,Link,Link,,
789,application_1765289937462_0782,pyspark,idle,Link,Link,,
791,application_1765289937462_0784,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DoubleType
from pyspark.sql.functions import (
    col, substring, sum as F_sum, round as F_round, row_number
)
from pyspark.sql.window import Window
import time

# S3 locations of the inputs: the two crime CSVs (2010-2019 and 2020-2025)
# and the RE_codes CSV.
crime_2010_path, crime_2020_path, re_codes_path = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/RE_codes.csv",
)

# Explicit schema for the crime CSVs (same as in Query 1).
# Columns are listed in file order; every column is typed as a string except
# the victim age (integer) and the LAT/LON coordinates (double).
_CRIME_COLUMNS = [
    ("DR_NO",          StringType),
    ("Date Rptd",      StringType),
    ("DATE OCC",       StringType),
    ("TIME OCC",       StringType),
    ("AREA",           StringType),
    ("AREA NAME",      StringType),
    ("Rpt Dist No",    StringType),
    ("Part 1-2",       StringType),
    ("Crm Cd",         StringType),
    ("Crm Cd Desc",    StringType),
    ("Mocodes",        StringType),
    ("Vict Age",       IntegerType),
    ("Vict Sex",       StringType),
    ("Vict Descent",   StringType),
    ("Premis Cd",      StringType),
    ("Premis Desc",    StringType),
    ("Weapon Used Cd", StringType),
    ("Weapon Desc",    StringType),
    ("Status",         StringType),
    ("Status Desc",    StringType),
    ("Crm Cd 1",       StringType),
    ("Crm Cd 2",       StringType),
    ("Crm Cd 3",       StringType),
    ("Crm Cd 4",       StringType),
    ("LOCATION",       StringType),
    ("Cross Street",   StringType),
    ("LAT",            DoubleType),
    ("LON",            DoubleType),
]

crime_schema = StructType(
    [StructField(column, dtype()) for column, dtype in _CRIME_COLUMNS]
)

# Schema for RE_codes: the descent code plus its full description, both strings.
re_codes_schema = StructType(
    [
        StructField(column, StringType())
        for column in ("Vict Descent", "Vict Descent Full")
    ]
)

# Implementation 1: DataFrame API for Query 2

spark = SparkSession.builder.appName("DF query 2 execution").getOrCreate()


def _read_csv_with_schema(path, schema):
    """Return a DataFrame over the CSV at `path`, read with a header row and `schema`."""
    reader = spark.read.format("csv").options(header="true").schema(schema)
    return reader.load(path)


print("====================================================================")
print("Loading crime data & RE codes as DataFrames...")

t0 = time.time()

# Crime data: both files are read with the same explicit schema and then
# stacked by column name into a single DataFrame.
crime_2010_df = _read_csv_with_schema(crime_2010_path, crime_schema)
crime_2020_df = _read_csv_with_schema(crime_2020_path, crime_schema)
crime_df = crime_2010_df.unionByName(crime_2020_df)

# RE_codes: lookup table for the victim descent groups.
re_codes_df = _read_csv_with_schema(re_codes_path, re_codes_schema)

t1 = time.time()
# NOTE(review): no Spark action (show/count/collect) runs between t0 and t1, so
# this figure presumably reflects plan/session setup rather than a scan of the
# CSV data - confirm before comparing it against other implementations.
print(f"[INFO] Data loading + union: {t1 - t0:.2f} sec")
# Optional sanity checks (each one triggers a Spark job):
#crime_df.show(3, truncate=False)
#re_codes_df.show(5, truncate=False)

print("====================================================================")
print("DF pipeline for Query 2 (per year, top-3 Vict Descent)...")

t2 = time.time()

# 1) Derive the year from the first four characters of DATE OCC and keep only
#    rows with a parseable year and a non-empty Vict Descent.
crime_with_year_df = crime_df \
    .withColumn("year", substring(col("DATE OCC"), 1, 4).cast("int")) \
    .filter(col("year").isNotNull()) \
    .filter(col("Vict Descent").isNotNull() & (col("Vict Descent") != ""))

# 2) Count victims per (year, Vict Descent).
year_descent_counts_df = crime_with_year_df \
    .groupBy("year", "Vict Descent") \
    .count()

# 3) Total victims per year (sum of the per-descent counts from step 2, so it
#    only covers rows that survived the filters in step 1).
year_totals_df = year_descent_counts_df \
    .groupBy("year") \
    .agg(F_sum("count").alias("total_victims"))

# 4) Attach the yearly total to every (year, descent) row and derive the
#    percentage share of that descent within its year.
with_totals_df = year_descent_counts_df.join(year_totals_df, on="year")

with_percent_df = with_totals_df \
    .withColumn("percentage", (col("count") / col("total_victims")) * 100.0)

# 5) Left join with RE_codes to get the full description (Vict Descent Full);
#    a left join keeps descent codes that have no entry in the lookup table.
with_labels_df = with_percent_df.join(
    re_codes_df,
    on="Vict Descent",
    how="left"
)

# 6) Top-3 descent groups per year (Window + row_number).
#    The descent code is a secondary sort key: without it, groups with equal
#    counts would receive an arbitrary rank and the top-3 cut could change
#    between runs.
w = Window.partitionBy("year").orderBy(
    col("count").desc(),
    col("Vict Descent").asc()
)

ranked_df = with_labels_df \
    .withColumn("rn", row_number().over(w))

# Sort by (year, rank) before projecting so the displayed order follows the
# deterministic rank even when two groups have the same count.
top3_per_year_df = ranked_df \
    .filter(col("rn") <= 3) \
    .orderBy(col("year"), col("rn")) \
    .select(
        col("year"),
        col("Vict Descent Full").alias("Victim_Descent"),
        col("count").alias("num_victims"),
        F_round(col("percentage"), 2).alias("percentage")
    )

# show() is the action that actually executes the pipeline, so the timing
# below includes reading the inputs.
top3_per_year_df.show(60, truncate=False)

t3 = time.time()
print(f"[TIMING] DF Query 2 pipeline: {t3 - t2:.2f} sec")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
801,application_1765289937462_0794,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Loading crime data & RE codes as DataFrames...
[INFO] Data loading + union: 3.35 sec
DF pipeline for Query 2 (per year, top-3 Vict Descent)...
+----+----------------------+-----------+----------+
|year|Victim_Descent        |num_victims|percentage|
+----+----------------------+-----------+----------+
|2010|Hispanic/Latin/Mexican|73558      |38.93     |
|2010|White                 |53835      |28.49     |
|2010|Black                 |33937      |17.96     |
|2011|Hispanic/Latin/Mexican|70845      |38.8      |
|2011|White                 |51219      |28.05     |
|2011|Black                 |32579      |17.84     |
|2012|Hispanic/Latin/Mexican|70338      |38.25     |
|2012|White                 |51839      |28.19     |
|2012|Black                 |33572      |18.26     |
|2013|Hispanic/Latin/Mexican|66741      |37.97     |
|2013|White                 |48453      |27.57     |
|2013|Black                 |31975      |18.19     |
|2014|Hispanic/Latin/Mexican|68763      |38.42     |
|2014|Whi