In [1]:
%%configure -f
{
    "conf" : { 
        "spark.executor.instances" : "4" ,
        "spark.executor.cores" : "1" ,
        "spark.executor.memory" : "2g" ,
        "spark.driver.memory" : "4g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
664,application_1765289937462_0657,pyspark,idle,Link,Link,,
666,application_1765289937462_0659,pyspark,idle,Link,Link,,
667,application_1765289937462_0660,pyspark,idle,Link,Link,,
668,application_1765289937462_0661,pyspark,idle,Link,Link,,
669,application_1765289937462_0662,pyspark,idle,Link,Link,,
671,application_1765289937462_0664,pyspark,idle,Link,Link,,
672,application_1765289937462_0665,pyspark,idle,Link,Link,,
673,application_1765289937462_0666,pyspark,idle,Link,Link,,
687,application_1765289937462_0680,pyspark,idle,Link,Link,,
690,application_1765289937462_0683,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, lower, when
import time

# Paths για τα crime data
crime_2010_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
crime_2020_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"

# Schema – ΟΛΕΣ οι στήλες όπως στο sample
crime_schema = StructType([
    StructField("DR_NO",          StringType()),
    StructField("Date Rptd",      StringType()),
    StructField("DATE OCC",       StringType()),
    StructField("TIME OCC",       StringType()),
    StructField("AREA",           StringType()),
    StructField("AREA NAME",      StringType()),
    StructField("Rpt Dist No",    StringType()),
    StructField("Part 1-2",       StringType()),
    StructField("Crm Cd",         StringType()),
    StructField("Crm Cd Desc",    StringType()),
    StructField("Mocodes",        StringType()),
    StructField("Vict Age",       IntegerType()),
    StructField("Vict Sex",       StringType()),
    StructField("Vict Descent",   StringType()),
    StructField("Premis Cd",      StringType()),
    StructField("Premis Desc",    StringType()),
    StructField("Weapon Used Cd", StringType()),
    StructField("Weapon Desc",    StringType()),
    StructField("Status",         StringType()),
    StructField("Status Desc",    StringType()),
    StructField("Crm Cd 1",       StringType()),
    StructField("Crm Cd 2",       StringType()),
    StructField("Crm Cd 3",       StringType()),
    StructField("Crm Cd 4",       StringType()),
    StructField("LOCATION",       StringType()),
    StructField("Cross Street",   StringType()),
    StructField("LAT",            DoubleType()),
    StructField("LON",            DoubleType())
])

# Implementation 1: DataFrame API (χωρίς UDF)

spark = SparkSession \
    .builder \
    .appName("DF Query 1 execution (no UDF)") \
    .getOrCreate()

print("====================================================================")
print("Loading crime data as DataFrames with explicit schema...")

t0 = time.time()

crime_2010_df = spark.read.format("csv") \
    .options(header="true") \
    .schema(crime_schema) \
    .load(crime_2010_path)

crime_2020_df = spark.read.format("csv") \
    .options(header="true") \
    .schema(crime_schema) \
    .load(crime_2020_path)

crime_df = crime_2010_df.unionByName(crime_2020_df)

t1 = time.time()
print(f"[INFO] Data loading + union: {t1 - t0:.2f} sec")
#crime_df.printSchema()
#crime_df.show(5, truncate=False)

print("====================================================================")
print("Filtering aggravated assaults and computing age groups (no UDF)...")

t2 = time.time()

# Φιλτράρουμε μόνο aggravated assault (case-insensitive)
aggravated_df = crime_df.filter(
    col("Crm Cd Desc").isNotNull() &
    lower(col("Crm Cd Desc")).contains("aggravated assault")
)

# Φιλτράρουμε ηλικίες (Vict Age > 0)
aggr_with_age_df = aggravated_df.filter(
    col("Vict Age").isNotNull() & (col("Vict Age") > 0)
)

# Ορισμός buckets μόνο με built-in expressions
age_grouped_df = aggr_with_age_df \
    .withColumn(
        "age_group",
        when(col("Vict Age") < 18, "Children (<18)") \
        .when((col("Vict Age") >= 18) & (col("Vict Age") <= 24), "Young adults (18-24)") \
        .when((col("Vict Age") >= 25) & (col("Vict Age") <= 64), "Adults (25-64)") \
        .otherwise("Seniors (>64)")
    ) \
    .groupBy("age_group") \
    .count() \
    .orderBy(col("count").desc())

t3 = time.time()

age_grouped_df.show(truncate=False)
print(f"[TIMING] DF (no UDF) pipeline: {t3 - t2:.2f} sec")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
692,application_1765289937462_0685,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Loading crime data as DataFrames with explicit schema...
[INFO] Data loading + union: 2.88 sec
Filtering aggravated assaults and computing age groups (no UDF)...
+--------------------+------+
|age_group           |count |
+--------------------+------+
|Adults (25-64)      |121660|
|Young adults (18-24)|33758 |
|Children (<18)      |10904 |
|Seniors (>64)       |6011  |
+--------------------+------+

[TIMING] DF (no UDF) pipeline: 0.15 sec