In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import time
import csv

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# One session is shared by the DataFrame and the RDD implementation; both are
# meant to run on 4 Spark executors.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists -- confirm the executor setting actually takes effect in this environment.
spark = (
    SparkSession.builder
    .appName("Query1 DataFrame API")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

In [23]:
# Query 1 -- DataFrame API implementation

# The measured wall-clock time covers everything from reading the CSV to
# displaying the result.
start_time = time.time()

# Read the crime CSV: the first line is the header, column types are inferred.
crime_data = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    inferSchema=True,
)

# Keep only rows whose crime description mentions "AGGRAVATED ASSAULT".
is_aggravated_assault = col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
assault_data = crime_data.filter(is_aggravated_assault)

# Bucket each victim by age. A row matching none of the branches (e.g. a null
# age) gets a null AgeGroup because the chain has no otherwise().
# NOTE(review): ages of 0 or below also land in "Children" -- confirm that is intended.
vict_age = col("Vict Age")
age_group_col = (
    when(vict_age < 18, "Children")
    .when(vict_age.between(18, 24), "Young Adults")
    .when(vict_age.between(25, 64), "Adults")
    .when(vict_age > 64, "Seniors")
)
categorized = assault_data.withColumn("AgeGroup", age_group_col)

# Count rows per age group, largest group first.
counts_by_group = categorized.groupBy("AgeGroup").agg(count("*").alias("Count"))
result_df = counts_by_group.orderBy(col("Count").desc())

# Display the result table (this is the action that triggers the computation).
result_df.show()

# Stop the timer and report the elapsed time.
elapsed_time = time.time() - start_time
print(f"Execution Time (DataFrame API): {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----+
|    AgeGroup|Count|
+------------+-----+
|      Adults|72610|
|Young Adults|23472|
|    Children|10724|
|     Seniors| 3099|
+------------+-----+

Execution Time (DataFrame API): 19.31 seconds

In [30]:
# Print the schema: the position of a column in this listing is the list index
# used to read that field in the RDD implementation.
crime_data.printSchema()

# Preview the first 10 rows.
crime_data.show(n=10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [33]:
# Query 1 -- RDD API implementation

# The measured wall-clock time starts before the file is read.
start_time = time.time()

# Read the CSV as an RDD of raw text lines.
raw_lines_rdd = spark.sparkContext.textFile(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
)

# The first line is the header; drop every line equal to it so that only data
# rows remain.
header = raw_lines_rdd.first()
crime_rdd = raw_lines_rdd.filter(lambda line: line != header)
# Parse CSV rows
def parse_csv(line):
    """Split one line of CSV text into its list of field strings.

    The line is handed to ``csv.reader``, so quoted fields that contain
    commas stay together as a single value.
    """
    single_line_reader = csv.reader([line])
    return next(single_line_reader)

# Split every remaining CSV line into its list of field values.
parsed_rdd = crime_rdd.map(parse_csv)

# Positions of the columns used below, read off the printed DataFrame schema.
CRM_CD_DESC_IDX = 9   # "Crm Cd Desc"
VICT_AGE_IDX = 11     # "Vict Age"

# Filter for "AGGRAVATED ASSAULT"
assault_rdd = parsed_rdd.filter(
    lambda row: "AGGRAVATED ASSAULT" in row[CRM_CD_DESC_IDX]
)


def classify_age(age):
    """Return the age-group label for an integer victim age.

    Below 18 -> "Children", 18-24 -> "Young Adults", 25-64 -> "Adults",
    anything above 64 -> "Seniors".
    """
    if age < 18:
        return "Children"
    if age <= 24:
        return "Young Adults"
    if age <= 64:
        return "Adults"
    return "Seniors"


# Age groups: the age field is parsed once per row; int() raises ValueError if
# the field is not a valid integer.
age_group_rdd = assault_rdd.map(lambda row: classify_age(int(row[VICT_AGE_IDX])))

# Group and count, largest group first.
result_rdd = (
    age_group_rdd
    .map(lambda group: (group, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# Show results.
# The loop variable is named `total`, not `count`: binding `count` here would
# overwrite the `count` imported from pyspark.sql.functions, and re-running the
# DataFrame cell (which calls count("*")) would then fail.
for group, total in result_rdd.collect():
    print(f"{group}: {total}")

# Stop timer and print elapsed time
elapsed_time = time.time() - start_time
print(f"Execution Time (RDD API): {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 72610
Young Adults: 23472
Children: 10724
Seniors: 3099
Execution Time (RDD API): 21.01 seconds

In [None]:
#test