### Query 1

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import time
import csv

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
#Both Dataframe and RDD implementations are to use 4 spark executors

spark = SparkSession.builder \
    .appName("Query1 DataFrame API") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

In [23]:
# Query1 Dataframe implementation

# Start timer
start_time = time.time()

# Load Crime Data
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)

# Filter for "AGGRAVATED ASSAULT"
assault_data = crime_data.filter(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))

# Age Groups
categorized = assault_data.withColumn(
    "AgeGroup",
    when(col("Vict Age") < 18, "Children")
    .when((col("Vict Age") >= 18) & (col("Vict Age") <= 24), "Young Adults")
    .when((col("Vict Age") >= 25) & (col("Vict Age") <= 64), "Adults")
    .when(col("Vict Age") > 64, "Seniors")
)

# Group and count
result_df = categorized.groupBy("AgeGroup").agg(count("*").alias("Count")).orderBy(col("Count").desc())

#Show results
result_df.show()

# Stop timer and print elapsed time
elapsed_time = time.time() - start_time
print(f"Execution Time (DataFrame API): {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----+
|    AgeGroup|Count|
+------------+-----+
|      Adults|72610|
|Young Adults|23472|
|    Children|10724|
|     Seniors| 3099|
+------------+-----+

Execution Time (DataFrame API): 19.31 seconds

In [30]:
# Show dataset schema to find indexes for RDD
crime_data.printSchema()

# Display sample rows
crime_data.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [33]:
#Query 1 RDD implementation

# Start timer
start_time = time.time()

# Load dataset as RDD
crime_rdd = spark.sparkContext.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")

# Extract header and filter it out
header = crime_rdd.first()
crime_rdd = crime_rdd.filter(lambda row: row != header)

# Parse CSV rows
def parse_csv(line):
    return list(csv.reader([line]))[0]

parsed_rdd = crime_rdd.map(parse_csv)

# Filter for "AGGRAVATED ASSAULT"
assault_rdd = parsed_rdd.filter(lambda row: "AGGRAVATED ASSAULT" in row[9])

# Age groups
age_group_rdd = assault_rdd.map(lambda row: (
    "Children" if int(row[11]) < 18 else
    "Young Adults" if 18 <= int(row[11]) <= 24 else
    "Adults" if 25 <= int(row[11]) <= 64 else
    "Seniors"
))

# Group and count
result_rdd = age_group_rdd.map(lambda group: (group, 1)).reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], ascending=False)

# Show results
for group, count in result_rdd.collect():
    print(f"{group}: {count}")

# Stop timer and print elapsed time
elapsed_time = time.time() - start_time
print(f"Execution Time (RDD API): {elapsed_time:.2f} seconds")

#Stop spark session
spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 72610
Young Adults: 23472
Children: 10724
Seniors: 3099
Execution Time (RDD API): 21.01 seconds

In [21]:
print(crime_data.columns)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA ', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON']

In [22]:
print(crime_data.schema)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('DR_NO', IntegerType(), True), StructField('Date Rptd', StringType(), True), StructField('DATE OCC', StringType(), True), StructField('TIME OCC', IntegerType(), True), StructField('AREA ', IntegerType(), True), StructField('AREA NAME', StringType(), True), StructField('Rpt Dist No', IntegerType(), True), StructField('Part 1-2', IntegerType(), True), StructField('Crm Cd', IntegerType(), True), StructField('Crm Cd Desc', StringType(), True), StructField('Mocodes', StringType(), True), StructField('Vict Age', IntegerType(), True), StructField('Vict Sex', StringType(), True), StructField('Vict Descent', StringType(), True), StructField('Premis Cd', IntegerType(), True), StructField('Premis Desc', StringType(), True), StructField('Weapon Used Cd', IntegerType(), True), StructField('Weapon Desc', StringType(), True), StructField('Status', StringType(), True), StructField('Status Desc', StringType(), True), StructField('Crm Cd 1', IntegerType(), True), StructField('Crm

### Query 2

### i) Data_Frame API

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, when, row_number, expr
from pyspark.sql.window import Window
import time

# Start timer
start_time = time.time()

# Start Spark session
spark = SparkSession.builder \
    .appName("Query2 DataFrame API") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

# Load datasets
crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine both datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Aggregation logic
aggregated = crime_data.groupBy(
    expr("substring(`DATE OCC`, 7, 4)").alias("YEAR"),  # Extract year from DATE OCC
    col("AREA NAME")
).agg(
    count("*").alias("total_cases"),
    sum(when(~col("Status Desc").isin("UNK", "Invest Cont"), 1).otherwise(0)).alias("closed_cases")  # Non-"UNK"/"Invest Cont" are closed
).withColumn("closed_case_rate", col("closed_cases") / col("total_cases"))

# Define window specification for ranking within each year
window_spec = Window.partitionBy("YEAR").orderBy(col("closed_case_rate").desc())

# Assign rank and filter top 3 precincts per year
ranked = aggregated.withColumn("ranking", row_number().over(window_spec)) \
    .filter(col("ranking") <= 3) \
    .orderBy("YEAR", "ranking")

# Measure the end time for the DataFrame API operations
dataframe_api_end_time = time.time()

# Count rows for showing all results
row_count = ranked.count()

# Show all rows in the output
ranked.show(truncate=False, n=row_count)

# Print DataFrame API execution time
print(f"DataFrame API Execution Time: {dataframe_api_end_time - dataframe_api_start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-----------+------------+-------------------+-------+
|YEAR|AREA NAME  |total_cases|closed_cases|closed_case_rate   |ranking|
+----+-----------+-----------+------------+-------------------+-------+
|2010|Rampart    |8707       |2860        |0.32847134489491214|1      |
|2010|Olympic    |8764       |2762        |0.3151528982199909 |2      |
|2010|Harbor     |9598       |2818        |0.2936028339237341 |3      |
|2011|Olympic    |7988       |2799        |0.35040060090135206|1      |
|2011|Rampart    |8444       |2744        |0.324964471814306  |2      |
|2011|Harbor     |9841       |2806        |0.2851336246316431 |3      |
|2012|Olympic    |8543       |2930        |0.3429708533302119 |1      |
|2012|Rampart    |8626       |2800        |0.3246000463714352 |2      |
|2012|Harbor     |9441       |2786        |0.29509585848956676|3      |
|2013|Olympic    |8305       |2789        |0.3358217940999398 |1      |
|2013|Rampart    |8148       |2616        |0.32106038291605304|2

### ii) SQL API

In [61]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
import time

# Start timer
sql_api_start_time = time.time()
# Start Spark session
spark = SparkSession.builder \
    .appName("Query2 SQL API") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

# Load datasets
crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

# Register the DataFrame as a temporary SQL table
crime_data.createOrReplaceTempView("crime_data")

# SQL query
sql_query = """
    WITH Aggregated AS (
        SELECT
            SUBSTRING(`DATE OCC`, 7, 4) AS YEAR,
            `AREA NAME` AS Precinct,
            COUNT(*) AS total_cases,
            SUM(CASE WHEN `Status Desc` NOT IN ('UNK', 'Invest Cont') THEN 1 ELSE 0 END) AS closed_cases,
            SUM(CASE WHEN `Status Desc` NOT IN ('UNK', 'Invest Cont') THEN 1 ELSE 0 END) / COUNT(*) AS closed_case_rate
        FROM crime_data
        GROUP BY SUBSTRING(`DATE OCC`, 7, 4), `AREA NAME`
    ),
    Ranked AS (
        SELECT
            YEAR,
            Precinct,
            total_cases,
            closed_cases,
            closed_case_rate,
            ROW_NUMBER() OVER (PARTITION BY YEAR ORDER BY closed_case_rate DESC) AS ranking
        FROM Aggregated
    )
    SELECT
        YEAR,
        Precinct,
        total_cases,
        closed_cases,
        closed_case_rate,
        ranking
    FROM Ranked
    WHERE ranking <= 3
    ORDER BY YEAR, ranking
"""



# Execute the SQL query
result = spark.sql(sql_query)

# Count rows to ensure all results are shown
row_count = result.count()

# Show the results
result.show(truncate=False, n=row_count)

# End timer
sql_api_end_time = time.time()

# Print SQL API execution time
print(f"SQL API Execution Time: {sql_api_end_time - sql_api_start_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-----------+------------+-------------------+-------+
|YEAR|Precinct   |total_cases|closed_cases|closed_case_rate   |ranking|
+----+-----------+-----------+------------+-------------------+-------+
|2010|Rampart    |8707       |2860        |0.32847134489491214|1      |
|2010|Olympic    |8764       |2762        |0.3151528982199909 |2      |
|2010|Harbor     |9598       |2818        |0.2936028339237341 |3      |
|2011|Olympic    |7988       |2799        |0.35040060090135206|1      |
|2011|Rampart    |8444       |2744        |0.324964471814306  |2      |
|2011|Harbor     |9841       |2806        |0.2851336246316431 |3      |
|2012|Olympic    |8543       |2930        |0.3429708533302119 |1      |
|2012|Rampart    |8626       |2800        |0.3246000463714352 |2      |
|2012|Harbor     |9441       |2786        |0.29509585848956676|3      |
|2013|Olympic    |8305       |2789        |0.3358217940999398 |1      |
|2013|Rampart    |8148       |2616        |0.32106038291605304|2

csv to parquet transition

In [53]:
from pyspark.sql import SparkSession

# Start Spark session
spark = SparkSession.builder \
    .appName("Save Crime Data as Parquet") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

# Load datasets
crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Save as a single Parquet file to the specified S3 bucket
output_path = "s3://groups-bucket-dblab-905418150721/group28/query2/"
crime_data.repartition(1).write.mode("overwrite").parquet(output_path)

print(f"Data successfully saved to {output_path} in Parquet format.")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Data successfully saved to s3://groups-bucket-dblab-905418150721/group28/query2/ in Parquet format.

In [30]:
#parquet solution
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
import time

# Start timer
sql_api_start_time = time.time()

# Start Spark session
spark = SparkSession.builder \
    .appName("Query2 SQL API") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

# Crime Dataset in Parquet format
crime_data = spark.read.parquet(
    "s3://groups-bucket-dblab-905418150721/group28/query2/part-00000-fea3c04b-7961-41ea-8e05-d62534cf766e-c000.snappy.parquet"
)

# Register the DataFrame as a temporary SQL table
crime_data.createOrReplaceTempView("crime_data")

# SQL query
sql_query = """
    WITH Aggregated AS (
        SELECT
            SUBSTRING(`DATE OCC`, 7, 4) AS YEAR,
            `AREA NAME` AS Precinct,
            COUNT(*) AS total_cases,
            SUM(CASE WHEN `Status Desc` NOT IN ('UNK', 'Invest Cont') THEN 1 ELSE 0 END) AS closed_cases,
            SUM(CASE WHEN `Status Desc` NOT IN ('UNK', 'Invest Cont') THEN 1 ELSE 0 END) / COUNT(*) AS closed_case_rate
        FROM crime_data
        GROUP BY SUBSTRING(`DATE OCC`, 7, 4), `AREA NAME`
    ),
    Ranked AS (
        SELECT
            YEAR,
            Precinct,
            total_cases,
            closed_cases,
            closed_case_rate,
            ROW_NUMBER() OVER (PARTITION BY YEAR ORDER BY closed_case_rate DESC) AS ranking
        FROM Aggregated
    )
    SELECT
        YEAR,
        Precinct,
        total_cases,
        closed_cases,
        closed_case_rate,
        ranking
    FROM Ranked
    WHERE ranking <= 3
    ORDER BY YEAR, ranking
"""

# Execute the SQL query
result = spark.sql(sql_query)

# Count rows to ensure all results are shown
row_count = result.count()

# Show the results
result.show(truncate=False, n=row_count)

# End timer
sql_api_end_time = time.time()

# Print SQL API execution time
print(f"SQL API Execution Time: {sql_api_end_time - sql_api_start_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-----------+------------+-------------------+-------+
|YEAR|Precinct   |total_cases|closed_cases|closed_case_rate   |ranking|
+----+-----------+-----------+------------+-------------------+-------+
|2010|Rampart    |8707       |2860        |0.32847134489491214|1      |
|2010|Olympic    |8764       |2762        |0.3151528982199909 |2      |
|2010|Harbor     |9598       |2818        |0.2936028339237341 |3      |
|2011|Olympic    |7988       |2799        |0.35040060090135206|1      |
|2011|Rampart    |8444       |2744        |0.324964471814306  |2      |
|2011|Harbor     |9841       |2806        |0.2851336246316431 |3      |
|2012|Olympic    |8543       |2930        |0.3429708533302119 |1      |
|2012|Rampart    |8626       |2800        |0.3246000463714352 |2      |
|2012|Harbor     |9441       |2786        |0.29509585848956676|3      |
|2013|Olympic    |8305       |2789        |0.3358217940999398 |1      |
|2013|Rampart    |8148       |2616        |0.32106038291605304|2

### Query 3

In [52]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.sql.types import GeometryType

# Start Spark session
spark = SparkSession.builder \
    .appName("GeoJSON Inspection with Sedona") \
    .config("spark.jars.packages", "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.6.1,org.apache.sedona:sedona-viz-3.0_2.12:1.6.1") \
    .getOrCreate()

# Register Sedona functions
SedonaRegistrator.registerAll(spark)

# Load GeoJSON file
geojson_file_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
geojson_df = spark.read.format("json").load(geojson_file_path)

# Inspect the schema
print("Schema of the GeoJSON file:")
geojson_df.printSchema()

# Show the columns
print("Columns in the GeoJSON file:")
print(geojson_df.columns)

# Optionally, preview some data
geojson_df.show(2, truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Schema of the GeoJSON file:
root
 |-- _corrupt_record: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- BG10: string (nullable = true)
 |    |-- BG10FIP10: string (nullable = true)
 |    |-- BG12: string (nullable = true)
 |    |-- CB10: string (nullable = true)
 |    |-- CEN_FIP13: string (nullable = true)
 |    |-- CITY: string (nullable = true)
 |    |-- CITYCOM: string (nullable = true)
 |    |-- COMM: string (nullable = true)
 |    |-- CT10: string (nullable = true)
 |    |-- CT12: string (nullable = true)
 |    |-- CTCB10: string (nullable = true)
 |    |-- HD_2012: long (nullable = true)
 |    |-- HD_NAME: string (nullable = true)
 |    |-- HOUSING10: long 

In [40]:
from pyspark.sql import SparkSession

# Start Spark session
spark = SparkSession.builder \
    .appName("Load CSV and Inspect") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

In [47]:
# Load the CSV file
csv_file_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks_fields.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Inspect columns
print("Columns in the CSV file:")
print(df.columns)

# Inspect schema
print("Schema of the CSV file:")
df.printSchema()

# Show the first two rows
print("First two rows of the CSV file:")
df.show(30, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Columns in the CSV file:
['field', 'type', 'meaning']
Schema of the CSV file:
root
 |-- field: string (nullable = true)
 |-- type: string (nullable = true)
 |-- meaning: string (nullable = true)

First two rows of the CSV file:
+-------------+--------+-------------------------------------------------------------------------+
|field        |type    |meaning                                                                  |
+-------------+--------+-------------------------------------------------------------------------+
|BG10         |string  |7-digit block group number (2010)                                        |
|BG10FIP10    |string  |Combination of BG10 and LA_FIP10 (Los Angeles County FIP code)           |
|BG12         |string  |7-digit block group number (2012)                                        |
|CB10         |string  |4-digit census block number                                              |
|CEN_FIP13    |string  |-                                                      

In [48]:
# Load the CSV file
csv_file_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Inspect columns
print("Columns in the CSV file:")
print(df.columns)

# Inspect schema
print("Schema of the CSV file:")
df.printSchema()

# Show the first two rows
print("First two rows of the CSV file:")
df.show(10, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Columns in the CSV file:
['Zip Code', 'Community', 'Estimated Median Income']
Schema of the CSV file:
root
 |-- Zip Code: integer (nullable = true)
 |-- Community: string (nullable = true)
 |-- Estimated Median Income: string (nullable = true)

First two rows of the CSV file:
+--------+--------------------------------------------------------------------------------------------+-----------------------+
|Zip Code|Community                                                                                   |Estimated Median Income|
+--------+--------------------------------------------------------------------------------------------+-----------------------+
|90001   |Los Angeles (South Los Angeles), Florence-Graham                                            |$33,887                |
|90002   |Los Angeles (Southeast Los Angeles, Watts)                                                  |$30,413                |
|90003   |Los Angeles (South Los Angeles, Southeast Los Angeles)                   

In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, expr

# Start Spark session
spark = SparkSession.builder \
    .appName("Crime and Income Analysis") \
    .config("spark.jars.packages", "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.6.1") \
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [59]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, expr

# Start Spark session
spark = SparkSession.builder \
    .appName("Crime and Income Analysis") \
    .getOrCreate()

# Load datasets
# Census data
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
census_df = spark.read.format("json").load(geojson_path)

# Income data
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
income_df = spark.read.csv(income_path, header=True, inferSchema=True)

# Crimes data
crimes_path = "s3://path-to/crimes.csv"
crimes_df = spark.read.csv(crimes_path, header=True, inferSchema=True)

# --- STEP 1: Align column names ---
# Census data: ZCTA10, Population
census_selected = census_df.select(
    col("properties.ZCTA10").alias("Zip Code"),
    col("properties.POP_2010").alias("Population")
)

# Income data: Zip Code, Median Household Income
income_selected = income_df.select(
    col("Zip Code"),
    expr("cast(regexp_replace(`Estimated Median Income`, '\\$', '') as double)").alias("Median Income")
)

# Crimes data: Use AREA NAME for mapping
# Assuming AREA NAME maps to ZIP codes (needs verification)
crimes_aggregated = crimes_df.groupBy("AREA NAME").agg(
    count("*").alias("Total Crimes")
).withColumnRenamed("AREA NAME", "Zip Code")

# --- STEP 2: Join datasets ---
# Join Census with Income
census_income_joined = census_selected.join(income_selected, on="Zip Code", how="inner")

# Join the result with Crimes
final_joined = census_income_joined.join(crimes_aggregated, on="Zip Code", how="left").fillna(0)

# --- STEP 3: Perform calculations ---
results = final_joined.withColumn(
    "Average Income Per Person", col("Median Income") / col("Population")
).withColumn(
    "Crime Ratio", col("Total Crimes") / col("Population")
).select(
    col("Zip Code"),
    col("Average Income Per Person"),
    col("Crime Ratio")
)

# --- STEP 4: Output the results ---
results.show(truncate=False)




FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o1332.csv.
: java.io.IOException: com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: The specified bucket does not exist (Service: Amazon S3; Status Code: 404; Error Code: NoSuchBucket; Request ID: 47N9QW5R0CYY9KQK; S3 Extended Request ID: 1LqgmHq+T9qKUTRnlidaMgW9/DpvDk7an0BTbMVy33F/GTckERXYEp0Yf7fMT71i3uweUyN0e2iaoUbfCHvcEA==; Proxy: null), S3 Extended Request ID: 1LqgmHq+T9qKUTRnlidaMgW9/DpvDk7an0BTbMVy33F/GTckERXYEp0Yf7fMT71i3uweUyN0e2iaoUbfCHvcEA==
	at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.list(Jets3tNativeFileSystemStore.java:429)
	at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.isFolderUsingFolderObject(Jets3tNativeFileSystemStore.java:255)
	at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.isFolder(Jets3tNativeFileSystemStore.java:218)
	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.getFileStatus(S3NativeFileSystem.java:554)
	at or

In [60]:
# Load the CSV file
csv_file_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Inspect columns
print("Columns in the CSV file:")
print(df.columns)

# Inspect schema
print("Schema of the CSV file:")
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Columns in the CSV file:
['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON']
Schema of the CSV file:
root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA: integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nulla

### Query 4

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year
import time

def run_query4(spark, config_name):
    # Start timer
    start_time = time.time()

    # Load datasets
    crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
    income_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
    descent_codes_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)

    # Filter Crime Data for Year 2015
    crime_2015_data = crime_data.filter(year("DATE OCC") == 2015)

    # Calculate top 3 and bottom 3 areas by income
    top_areas = income_data.orderBy(col("Estimated Median Income").desc()).limit(3)
    bottom_areas = income_data.orderBy(col("Estimated Median Income")).limit(3)

    # Join descent codes with crime data
    crime_with_descent_data = crime_2015_data.join(
        descent_codes_data, 
        crime_2015_data["Vict Descent"] == descent_codes_data["Vict Descent"], 
        "inner"
    )

    # Join with income data
    top_area_crime = crime_with_descent_data.join(
        top_areas, 
        crime_with_descent_data["AREA NAME"] == top_areas["Community"], 
        "inner"
    )

    bottom_area_crime = crime_with_descent_data.join(
        bottom_areas, 
        crime_with_descent_data["AREA NAME"] == bottom_areas["Community"], 
        "inner"
    )

    # Group by Victim Descent for Top 3 Areas
    top_area_profile = top_area_crime.groupBy("Vict Descent Full").count().orderBy(col("count").desc())

    # Group by Victim Descent for Bottom 3 Areas
    bottom_area_profile = bottom_area_crime.groupBy("Vict Descent Full").count().orderBy(col("count").desc())

    # Show Results
    print(f"Results for {config_name}")
    print("Top 3 Areas by Income - Victim Descent Profile")
    top_area_profile.show()

    print("Bottom 3 Areas by Income - Victim Descent Profile")
    bottom_area_profile.show()
    
    # Stop timer and print elapsed time
    elapsed_time = time.time() - start_time
    print(f"Execution Time for {config_name}: {elapsed_time:.2f} seconds")


# Main Script
if __name__ == "__main__":
    # Configurations
    configurations = [
        ("1 Core, 2GB Memory", 1, "2g"),
        ("2 Cores, 4GB Memory", 2, "4g"),
        ("4 Cores, 8GB Memory", 4, "8g")
    ]

    for config_name, cores, memory in configurations:
        # Create SparkSession with specific configuration
        spark = SparkSession.builder \
            .appName(f"Query4 - {config_name}") \
            .config("spark.executor.instances", "2") \
            .config("spark.executor.cores", cores) \
            .config("spark.executor.memory", memory) \
            .getOrCreate()

        # Run the query
        run_query4(spark, config_name)

        # Stop SparkSession
        spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Query 5



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, when, row_number, expr
from pyspark.sql.window import Window
import time

# Start timer
start_time = time.time()

# Start Spark session
spark = SparkSession.builder \
    .appName("Query2 DataFrame API") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

# Load datasets
crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine both datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

In [3]:
# Load the CSV file
csv_file_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Inspect columns
print("Columns in the CSV file:")
print(df.columns)

# Inspect schema
print("Schema of the CSV file:")
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Columns in the CSV file:
['X', 'Y', 'FID', 'DIVISION', 'LOCATION', 'PREC']
Schema of the CSV file:
root
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- FID: integer (nullable = true)
 |-- DIVISION: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- PREC: integer (nullable = true)

### 2 executors × 4 cores/8GB memory

In [24]:
import time
from sedona.register.geo_registrator import SedonaRegistrator
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, mean, count, min, first

# Start the timer
start_time = time.time()

# Initialize Spark Session with Sedona
spark = SparkSession.builder \
    .appName("GeospatialQuery") \
    .config("spark.jars.packages", "org.apache.sedona:sedona-sql-3.0_2.12:1.6.1") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.cores", "4") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)

# Create Spatial DataFrames
# Police stations use X (longitude) and Y (latitude)
df = df.withColumn("station_geometry", expr("ST_Point(cast(X as Decimal(24, 20)), cast(Y as Decimal(24, 20)))"))

# Crimes use LAT (latitude) and LON (longitude)
crime_data = crime_data.withColumn("crime_geometry", expr("ST_Point(cast(LON as Decimal(24, 20)), cast(LAT as Decimal(24, 20)))"))

# Perform Spatial Join to calculate distances from all crimes to all police stations
df_broadcast = df.select("DIVISION", "station_geometry").cache()

# Calculate distances
distances = crime_data.crossJoin(df_broadcast) \
    .withColumn("distance", expr("ST_Distance(crime_geometry, station_geometry)"))

# Assign each crime to its closest division
closest_crimes = distances.withColumn("rank", expr("ROW_NUMBER() OVER (PARTITION BY DR_NO ORDER BY distance ASC)")) \
    .filter(col("rank") == 1) \
    .select("DR_NO", "distance", "DIVISION")

# Aggregate by division to calculate the number of crimes and average distance
result = closest_crimes.groupBy("DIVISION").agg(
    count("*").alias("number_of_crimes"),
    mean("distance").alias("average_distance")
).orderBy(col("number_of_crimes").desc())

# Show all rows of the result
result.show(result.count(), truncate=False)

# End the timer
end_time = time.time()

# Calculate and display the runtime
runtime = end_time - start_time
print(f"Total execution time: {runtime:.2f} seconds")




FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+--------------------+
|DIVISION        |number_of_crimes|average_distance    |
+----------------+----------------+--------------------+
|HOLLYWOOD       |213080          |0.02043779072548565 |
|VAN NUYS        |211457          |0.028653154590629136|
|WILSHIRE        |198150          |0.026312166557481587|
|SOUTHWEST       |186742          |0.021577001184243143|
|OLYMPIC         |180463          |0.01729162112331338 |
|NORTH HOLLYWOOD |171159          |0.026115214222567722|
|77TH STREET     |167323          |0.016584871496068188|
|PACIFIC         |157468          |0.037495777088312074|
|CENTRAL         |154474          |0.009868086849235298|
|SOUTHEAST       |151999          |0.024150127195506455|
|RAMPART         |149675          |0.014730484635455721|
|TOPANGA         |147167          |0.03243890335156792 |
|WEST VALLEY     |130933          |0.02897360719640746 |
|HARBOR          |130206          |3.2997622866934675  |
|FOOTHILL        |122515       

### 4 executors × 2 cores/4GB memory

In [25]:
import time
from sedona.register.geo_registrator import SedonaRegistrator
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, mean, count, min, first

# Start the timer
start_time = time.time()

# Initialize Spark Session with specified resources
spark = SparkSession.builder \
    .appName("GeospatialQuery") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)

# Create Spatial DataFrames
# Police stations use X (longitude) and Y (latitude)
df = df.withColumn("station_geometry", expr("ST_Point(cast(X as Decimal(24, 20)), cast(Y as Decimal(24, 20)))"))

# Crimes use LAT (latitude) and LON (longitude)
crime_data = crime_data.withColumn("crime_geometry", expr("ST_Point(cast(LON as Decimal(24, 20)), cast(LAT as Decimal(24, 20)))"))

# Perform Spatial Join to calculate distances from all crimes to all police stations
df_broadcast = df.select("DIVISION", "station_geometry").cache()

# Calculate distances
distances = crime_data.crossJoin(df_broadcast) \
    .withColumn("distance", expr("ST_Distance(crime_geometry, station_geometry)"))

# Assign each crime to its closest division
closest_crimes = distances.withColumn("rank", expr("ROW_NUMBER() OVER (PARTITION BY DR_NO ORDER BY distance ASC)")) \
    .filter(col("rank") == 1) \
    .select("DR_NO", "distance", "DIVISION")

# Aggregate by division to calculate the number of crimes and average distance
result = closest_crimes.groupBy("DIVISION").agg(
    count("*").alias("number_of_crimes"),
    mean("distance").alias("average_distance")
).orderBy(col("number_of_crimes").desc())

# Show all rows of the result
result.show(result.count(), truncate=False)

# End the timer
end_time = time.time()

# Calculate and display the runtime
runtime = end_time - start_time
print(f"Total execution time: {runtime:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+--------------------+
|DIVISION        |number_of_crimes|average_distance    |
+----------------+----------------+--------------------+
|HOLLYWOOD       |213080          |0.020437790725485655|
|VAN NUYS        |211457          |0.02865315459062913 |
|WILSHIRE        |198150          |0.026312166557481583|
|SOUTHWEST       |186742          |0.021577001184243143|
|OLYMPIC         |180463          |0.01729162112331337 |
|NORTH HOLLYWOOD |171159          |0.026115214222567722|
|77TH STREET     |167323          |0.016584871496068194|
|PACIFIC         |157468          |0.03749577708831207 |
|CENTRAL         |154474          |0.0098680868492353  |
|SOUTHEAST       |151999          |0.024150127195506462|
|RAMPART         |149675          |0.014730484635455718|
|TOPANGA         |147167          |0.03243890335156792 |
|WEST VALLEY     |130933          |0.028973607196407465|
|HARBOR          |130206          |3.2997622866934675  |
|FOOTHILL        |122515       

### 8 executors × 1 core/2 GB memory

In [26]:
import time
from sedona.register.geo_registrator import SedonaRegistrator
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, mean, count, min, first

# Start the timer
start_time = time.time()

# Initialize Spark Session with specified resources
spark = SparkSession.builder \
    .appName("GeospatialQuery") \
    .config("spark.executor.instances", "8") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)

# Create Spatial DataFrames
# Police stations use X (longitude) and Y (latitude)
df = df.withColumn("station_geometry", expr("ST_Point(cast(X as Decimal(24, 20)), cast(Y as Decimal(24, 20)))"))

# Crimes use LAT (latitude) and LON (longitude)
crime_data = crime_data.withColumn("crime_geometry", expr("ST_Point(cast(LON as Decimal(24, 20)), cast(LAT as Decimal(24, 20)))"))

# Perform Spatial Join to calculate distances from all crimes to all police stations
df_broadcast = df.select("DIVISION", "station_geometry").cache()

# Calculate distances
distances = crime_data.crossJoin(df_broadcast) \
    .withColumn("distance", expr("ST_Distance(crime_geometry, station_geometry)"))

# Assign each crime to its closest division
closest_crimes = distances.withColumn("rank", expr("ROW_NUMBER() OVER (PARTITION BY DR_NO ORDER BY distance ASC)")) \
    .filter(col("rank") == 1) \
    .select("DR_NO", "distance", "DIVISION")

# Aggregate by division to calculate the number of crimes and average distance
result = closest_crimes.groupBy("DIVISION").agg(
    count("*").alias("number_of_crimes"),
    mean("distance").alias("average_distance")
).orderBy(col("number_of_crimes").desc())

# Show all rows of the result
result.show(result.count(), truncate=False)

# End the timer
end_time = time.time()

# Calculate and display the runtime
runtime = end_time - start_time
print(f"Total execution time: {runtime:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+--------------------+
|DIVISION        |number_of_crimes|average_distance    |
+----------------+----------------+--------------------+
|HOLLYWOOD       |213080          |0.02043779072548566 |
|VAN NUYS        |211457          |0.028653154590629126|
|WILSHIRE        |198150          |0.026312166557481583|
|SOUTHWEST       |186742          |0.021577001184243143|
|OLYMPIC         |180463          |0.017291621123313373|
|NORTH HOLLYWOOD |171159          |0.02611521422256772 |
|77TH STREET     |167323          |0.016584871496068188|
|PACIFIC         |157468          |0.037495777088312074|
|CENTRAL         |154474          |0.009868086849235298|
|SOUTHEAST       |151999          |0.02415012719550645 |
|RAMPART         |149675          |0.014730484635455718|
|TOPANGA         |147167          |0.03243890335156791 |
|WEST VALLEY     |130933          |0.02897360719640747 |
|HARBOR          |130206          |3.299762286693468   |
|FOOTHILL        |122515       