In [1]:
%%configure -f
{
    "conf" : { 
        "spark.executor.instances" : "4" ,
        "spark.executor.cores" : "1" ,
        "spark.executor.memory" : "2g" ,
        "spark.driver.memory" : "4g"
    }
}


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
655,application_1765289937462_0648,pyspark,idle,Link,Link,,
656,application_1765289937462_0649,pyspark,idle,Link,Link,,
664,application_1765289937462_0657,pyspark,idle,Link,Link,,
665,application_1765289937462_0658,pyspark,idle,Link,Link,,
666,application_1765289937462_0659,pyspark,idle,Link,Link,,
667,application_1765289937462_0660,pyspark,idle,Link,Link,,
668,application_1765289937462_0661,pyspark,idle,Link,Link,,
669,application_1765289937462_0662,pyspark,idle,Link,Link,,
670,application_1765289937462_0663,pyspark,idle,Link,Link,,
671,application_1765289937462_0664,pyspark,idle,Link,Link,,


In [3]:
from pyspark.sql import SparkSession
import time
import csv

# Paths for the crime data (two CSV extracts covering different year ranges)
crime_2010_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
crime_2020_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"

# Implementation 3: RDD API

# Get (or create) the Spark session and keep only its SparkContext,
# which is the entry point used for the RDD calls below.
sc = (
    SparkSession.builder
    .appName("RDD query 1 execution")
    .getOrCreate()
    .sparkContext
)

# ========================================
# Load and process data
# ========================================

print("====================================================================")
# The year ranges use ASCII hyphens on purpose: the en dashes that were used
# here before were rendered as "?" in the cell output of this session.
print("Loading crime data as RDDs (2010-2019 & 2020-2025)...")

t0 = time.time()

# Read each file as an RDD of raw text lines
crime_2010_raw = sc.textFile(crime_2010_path)
crime_2020_raw = sc.textFile(crime_2020_path)

# Grab the headers (first line of each file)
header_2010 = crime_2010_raw.first()
header_2020 = crime_2020_raw.first()

# Small CSV parser
def parse_csv_line(line):
    """
    Split one line of CSV text into a list of string fields.

    The line is handed to csv.reader as a one-element input, so quoted
    fields that contain commas are kept together as a single field.
    """
    row_reader = csv.reader((line,))
    return next(row_reader)

# Split the 2010 header into its column names
header_cols = parse_csv_line(header_2010)

# Locate the indexes of the columns this query needs
idx_crm_cd_desc = header_cols.index("Crm Cd Desc")
idx_vict_age    = header_cols.index("Vict Age")

# The indexes above come from the 2010 header but are applied to the rows of
# BOTH files after the union. Fail fast if the 2020 file places these columns
# elsewhere, instead of silently reading the wrong fields.
header_cols_2020 = parse_csv_line(header_2020)
for required_col, expected_idx in (
    ("Crm Cd Desc", idx_crm_cd_desc),
    ("Vict Age", idx_vict_age),
):
    if (
        required_col not in header_cols_2020
        or header_cols_2020.index(required_col) != expected_idx
    ):
        raise ValueError(
            f"Column {required_col!r} is not at position {expected_idx} in "
            f"{crime_2020_path}; the two files cannot share column indexes"
        )

# Drop the header rows and parse every remaining line
crime_2010_rows = (
    crime_2010_raw
    .filter(lambda line: line != header_2010)
    .map(parse_csv_line)
)

crime_2020_rows = (
    crime_2020_raw
    .filter(lambda line: line != header_2020)
    .map(parse_csv_line)
)

# Combine the two RDDs into one
crime_rdd = crime_2010_rows.union(crime_2020_rows)

t1 = time.time()
# NOTE: filter/map/union are lazy RDD transformations, so this interval does
# not include a full scan or parse of the files; that work happens when an
# action is later run on crime_rdd.
print(f"[INFO] RDD loading + union: {t1 - t0:.2f} sec")

print("====================================================================")
print("RDD pipeline: filter aggravated assaults and count age groups...")

# =======================
# Helper functions

# =======================

def is_aggravated_rdd(row, desc_idx=None):
    """
    Tell whether a parsed row describes an aggravated assault.

    row: list of strings (columns)
    desc_idx: position of the "Crm Cd Desc" column in the row; when omitted,
        the module-level idx_crm_cd_desc is used.

    Returns True when the description field contains 'AGGRAVATED ASSAULT'
    (case-insensitive), otherwise False. A row too short to contain the
    description field (for example a blank input line, which parses to an
    empty list) is reported as False instead of raising IndexError, so a
    single malformed line does not abort the whole job.
    """
    if desc_idx is None:
        desc_idx = idx_crm_cd_desc

    try:
        desc = row[desc_idx]
    except IndexError:
        return False

    return "AGGRAVATED ASSAULT" in desc.upper()

def age_group_from_row_rdd(row, age_idx=None):
    """
    Map a parsed row to the age group of its victim.

    row: list of strings (columns)
    age_idx: position of the "Vict Age" column in the row; when omitted,
        the module-level idx_vict_age is used.

    Returns one of "Children (<18)", "Young adults (18-24)",
    "Adults (25-64)", "Seniors (>64)", or None when the row has no usable
    age: the field is missing, empty, not an integer, or not positive.
    """
    if age_idx is None:
        age_idx = idx_vict_age

    try:
        # int("") raises ValueError, so empty ages are ignored here as well.
        a = int(row[age_idx])
    except (IndexError, ValueError):
        # Short row or non-integer age: skip the record rather than let one
        # malformed line abort the whole job.
        return None

    if a <= 0:
        return None  # only positive ages are kept (Vict Age > 0)

    if a < 18:
        return "Children (<18)"
    elif a <= 24:
        return "Young adults (18-24)"
    elif a <= 64:
        return "Adults (25-64)"
    else:
        return "Seniors (>64)"

t2 = time.time()

# 1) Keep only the aggravated-assault rows
aggravated_rdd = crime_rdd.filter(is_aggravated_rdd)

# 2) Turn each row into its age group (or None)
age_groups_rdd = aggravated_rdd.map(age_group_from_row_rdd)

# 3) Discard the None entries
valid_age_groups_rdd = age_groups_rdd.filter(lambda group: group is not None)

# 4) Count occurrences per age group (WordCount-style: pair with 1, then sum)
age_group_counts_rdd = (
    valid_age_groups_rdd
    .map(lambda group: (group, 1))
    .reduceByKey(lambda left, right: left + right)
)

# 5) Order by count, largest first
sorted_age_group_counts_rdd = age_group_counts_rdd.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

# collect() returns the (age_group, count) pairs to the driver as a list
result_rdd = sorted_age_group_counts_rdd.collect()

t3 = time.time()

for age_group, count in result_rdd:
    print(f"{age_group}: {count}")

print(f"[TIMING] RDD pipeline: {t3 - t2:.2f} sec")
print("====================================================================")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Loading crime data as RDDs (2010?2019 & 2020?2025)...
[INFO] RDD loading + union: 4.16 sec
RDD pipeline: filter aggravated assaults and count age groups...
Adults (25-64): 121660
Young adults (18-24): 33758
Children (<18): 10904
Seniors (>64): 6011
[TIMING] RDD pipeline: 14.85 sec