In [1]:
# Snapshot of the SparkConf backing the current session.
conf = spark.sparkContext.getConf()

# Print relevant executor settings.
# conf.get() returns None for a key that is not set, and that None is printed as-is.
executor_settings = (
    ("Executor Instances:", "spark.executor.instances"),
    ("Executor Memory:", "spark.executor.memory"),
    ("Executor Cores:", "spark.executor.cores"),
)
for setting_label, setting_key in executor_settings:
    print(setting_label, conf.get(setting_key))

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1111,application_1732639283265_1075,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: None
Executor Memory: 4743M
Executor Cores: 2

In [2]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "1g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1112,application_1732639283265_1076,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1106,application_1732639283265_1070,pyspark,idle,Link,Link,,
1107,application_1732639283265_1071,pyspark,idle,Link,Link,,
1108,application_1732639283265_1072,pyspark,idle,Link,Link,,
1109,application_1732639283265_1073,pyspark,idle,Link,Link,,
1110,application_1732639283265_1074,pyspark,idle,Link,Link,,
1112,application_1732639283265_1076,pyspark,idle,Link,Link,,✔


In [4]:
import time
import csv
from io import StringIO

# Start time (wall clock; the matching end timestamp is taken after the results are collected)
start_time_rdd = time.time()

# Load data from both files as RDDs: each element is one raw text line of the CSV
crime_rdd_2010_2019 = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
crime_rdd_2020_present = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")

# Extract headers (the first line of each file)
header_2010_2019 = crime_rdd_2010_2019.first()
header_2020_present = crime_rdd_2020_present.first()

# Remove headers from both RDDs by dropping every line equal to the header line
crime_data_2010_2019 = crime_rdd_2010_2019.filter(lambda line: line != header_2010_2019)
crime_data_2020_present = crime_rdd_2020_present.filter(lambda line: line != header_2020_present)

# Combine both RDDs into one
combined_crime_data = crime_data_2010_2019.union(crime_data_2020_present)


def parse_csv_line(line):
    """Split one raw CSV line into a list of field strings.

    Quoted fields containing commas are handled by the csv module.
    A blank line yields an empty list instead of raising IndexError, so a
    stray empty line in the input cannot fail the whole Spark task; the
    length filter applied afterwards discards such rows.
    """
    return next(csv.reader([line]), [])


# Parse CSV correctly and keep only rows with at least 28 fields,
# so the positional lookups used on these rows (indexes 9 and 11) are always in range
crime_parsed = combined_crime_data.map(parse_csv_line).filter(lambda fields: len(fields) > 27)

# Filter for "aggravated assault" in crime description (field index 9), case-insensitively
aggravated_assault = crime_parsed.filter(lambda fields: "aggravated assault" in fields[9].strip().lower())

# Categorize ages
def categorize_age(age):
    """Map a victim age to its age-group label.

    Args:
        age: The age as an int or as text that int() can parse
            (surrounding whitespace is accepted).

    Returns:
        "Children" for ages below 18, "Young Adults" for 18-24,
        "Adults" for 25-64, "Seniors" for 65 and above, or None when the
        value cannot be interpreted as an integer (e.g. "", "abc", None).
    """
    try:
        age = int(age)
    except (TypeError, ValueError):
        # TypeError covers None / non-string inputs; ValueError covers
        # empty or non-numeric text. Either way there is no usable age.
        return None

    # NOTE(review): zero and negative ages fall into "Children", as before;
    # confirm whether such values should be treated as missing instead.
    if age < 18:
        return "Children"
    if age <= 24:
        return "Young Adults"
    if age <= 64:
        return "Adults"
    return "Seniors"

# Tag each aggravated-assault row with its age group (age is read from field index 11),
# then discard rows whose age could not be categorized.
labelled_victims = aggravated_assault.map(lambda row: (categorize_age(row[11]), 1))
age_group_rdd = labelled_victims.filter(lambda pair: pair[0] is not None)

# Sum the per-row 1s to get the number of victims in each age group
age_group_counts = age_group_rdd.reduceByKey(lambda left, right: left + right)

# Order the (group, count) pairs from the largest count to the smallest
sorted_age_groups = age_group_counts.sortBy(lambda group_count: group_count[1], ascending=False)

# Bring the results to the driver and display them
collected_age_groups = sorted_age_groups.collect()
print(collected_age_groups)

# End time
end_time_rdd = time.time()
print(f"RDD Execution Time: {end_time_rdd - start_time_rdd} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('Adults', 121093), ('Young Adults', 33605), ('Children', 15928), ('Seniors', 5985)]
RDD Execution Time: 41.49943780899048 seconds

In [7]:
from pyspark.sql.functions import col, when, count, lower
import time

# Start time
start_time_df = time.time()


def _load_crime_csv(path):
    """Read one crime CSV into a DataFrame and strip whitespace from its column names.

    The first line is used as the header and column types are inferred by Spark.
    """
    raw_df = spark.read.csv(path, header=True, inferSchema=True)
    return raw_df.toDF(*[col_name.strip() for col_name in raw_df.columns])


# Load both datasets as DataFrames
crime_df_2010_2019 = _load_crime_csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
)
crime_df_2020_present = _load_crime_csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
)

# Ensure schemas match: union() pairs columns by position, not by name,
# so put the second frame's columns in the first frame's order
crime_df_2020_present = crime_df_2020_present.select(crime_df_2010_2019.columns)

# Combine both DataFrames
combined_crime_df = crime_df_2010_2019.union(crime_df_2020_present)

# Filter for "aggravated assault" in the crime description.
# The match is case-insensitive so it agrees with the RDD implementation of the same query.
filtered_df = combined_crime_df.filter(
    lower(col("Crm Cd Desc")).contains("aggravated assault")
)

# Categorize ages into groups.
# Every branch is explicit (no otherwise()), so a null age produces a null
# AgeGroup instead of being counted as "Seniors"; those rows are then dropped,
# mirroring the RDD implementation, which discards ages it cannot parse.
vict_age = col("Vict Age")
categorized_df = filtered_df.withColumn(
    "AgeGroup",
    when(vict_age < 18, "Children")
    .when((vict_age >= 18) & (vict_age <= 24), "Young Adults")
    .when((vict_age >= 25) & (vict_age <= 64), "Adults")
    .when(vict_age >= 65, "Seniors")
).filter(col("AgeGroup").isNotNull())

# Count victims by age group, largest group first
result_df = categorized_df.groupBy("AgeGroup").agg(count("*").alias("Count")).orderBy(col("Count").desc())

# Display results
result_df.show()

# End time
end_time_df = time.time()
print(f"DataFrame Execution Time: {end_time_df - start_time_df} seconds")




FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|    AgeGroup| Count|
+------------+------+
|      Adults|121093|
|Young Adults| 33605|
|    Children| 15928|
|     Seniors|  5985|
+------------+------+

DataFrame Execution Time: 19.49064564704895 seconds